Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (four register table, 16B)

Test 1: uops

Code:

  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 4.000

Issues: 4.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 4.000

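retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
4004 | 8033 | 4001 | 1 | 4000 | 4000 | 203248 | 4000 | 4000 | 12000 | 1 | 4000
4004 | 8033 | 4001 | 1 | 4000 | 4000 | 203248 | 4000 | 4000 | 12000 | 1 | 4000
4004 | 8033 | 4001 | 1 | 4000 | 4000 | 203248 | 4000 | 4000 | 12000 | 1 | 4000
4004 | 8033 | 4001 | 1 | 4000 | 4000 | 203248 | 4000 | 4000 | 12000 | 1 | 4000
4004 | 8033 | 4001 | 1 | 4000 | 4000 | 203248 | 4000 | 4000 | 12000 | 1 | 4000
4004 | 8033 | 4001 | 1 | 4000 | 4000 | 203248 | 4000 | 4000 | 12000 | 1 | 4000
4004 | 8033 | 4001 | 1 | 4000 | 4000 | 203248 | 4000 | 4000 | 12000 | 1 | 4000
4004 | 8033 | 4001 | 1 | 4000 | 4000 | 203248 | 4000 | 4000 | 12000 | 1 | 4000
4004 | 8033 | 4001 | 1 | 4000 | 4000 | 203248 | 4000 | 4000 | 12000 | 1 | 4000
4004 | 8033 | 4001 | 1 | 4000 | 4000 | 203248 | 4000 | 4000 | 12000 | 1 | 4000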

Test 2: Latency 1->1

Code:

  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40204 | 80033 | 40101 | 101 | 40000 | 100 | 40000 | 300 | 2039248 | 40100 | 200 | 40008 | 200 | 120024 | 1 | 40000 | 100
40204 | 80033 | 40101 | 101 | 40000 | 100 | 40000 | 300 | 2039248 | 40100 | 200 | 40008 | 200 | 120024 | 1 | 40000 | 100
40204 | 80033 | 40101 | 101 | 40000 | 100 | 40000 | 300 | 2039248 | 40100 | 200 | 40008 | 200 | 120024 | 1 | 40000 | 100
40204 | 80033 | 40101 | 101 | 40000 | 100 | 40000 | 300 | 2039248 | 40100 | 200 | 40008 | 200 | 120024 | 1 | 40000 | 100
40204 | 80033 | 40101 | 101 | 40000 | 100 | 40000 | 300 | 2039248 | 40100 | 200 | 40008 | 200 | 120024 | 1 | 40000 | 100
40204 | 80033 | 40101 | 101 | 40000 | 100 | 40000 | 300 | 2039248 | 40100 | 200 | 40008 | 200 | 120024 | 1 | 40000 | 100
40204 | 80033 | 40101 | 101 | 40000 | 100 | 40000 | 300 | 2039248 | 40100 | 200 | 40008 | 200 | 120024 | 1 | 40000 | 100
40204 | 80033 | 40101 | 101 | 40000 | 100 | 40000 | 300 | 2039248 | 40100 | 200 | 40008 | 200 | 120024 | 1 | 40000 | 100
40204 | 80033 | 40101 | 101 | 40000 | 100 | 40000 | 300 | 2039248 | 40100 | 200 | 40008 | 200 | 120024 | 1 | 40000 | 100
40204 | 80033 | 40101 | 101 | 40000 | 100 | 40000 | 300 | 2039248 | 40100 | 200 | 40008 | 200 | 120024 | 1 | 40000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0033

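retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40024 | 80033 | 40011 | 11 | 40000 | 10 | 40000 | 30 | 2039248 | 40010 | 20 | 40000 | 20 | 120000 | 1 | 40000 | 10
40024 | 80033 | 40011 | 11 | 40000 | 10 | 40000 | 30 | 2039248 | 40010 | 20 | 40000 | 20 | 120000 | 1 | 40000 | 10
40024 | 80033 | 40011 | 11 | 40000 | 10 | 40000 | 30 | 2039248 | 40010 | 20 | 40000 | 20 | 120000 | 1 | 40000 | 10
40024 | 80033 | 40011 | 11 | 40000 | 10 | 40000 | 30 | 2039248 | 40010 | 20 | 40000 | 20 | 120000 | 1 | 40000 | 10
40025 | 80066 | 40019 | 11 | 40008 | 10 | 40034 | 30 | 2039248 | 40010 | 20 | 40000 | 20 | 120000 | 1 | 40000 | 10
40024 | 80033 | 40011 | 11 | 40000 | 10 | 40000 | 30 | 2039248 | 40010 | 20 | 40000 | 20 | 120000 | 1 | 40000 | 10
40024 | 80033 | 40011 | 11 | 40000 | 10 | 40000 | 30 | 2039248 | 40010 | 20 | 40000 | 20 | 120000 | 1 | 40000 | 10
40024 | 80033 | 40011 | 11 | 40000 | 10 | 40000 | 30 | 2039248 | 40010 | 20 | 40000 | 20 | 120000 | 1 | 40000 | 10
40024 | 80033 | 40011 | 11 | 40000 | 10 | 40000 | 30 | 2039248 | 40010 | 20 | 40000 | 20 | 120000 | 1 | 40000 | 10
40024 | 80033 | 40011 | 11 | 40000 | 10 | 40000 | 30 | 2039248 | 40010 | 20 | 40000 | 20 | 120000 | 1 | 40000 | 10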

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
6020410003350101101500001005000030025492485010020050004200140012160000100
6020410003350101101500001005000030025492485010020050004200140012160000100
6020410003350101101500001005000030025492485010020050004200140012160000100
6020510006650111103500081025003430025492485010020050004200140012160000100
6020410003350101101500001005000030025492485010020050004200140012160000100
6020410003350101101500001005000030025492485010020050004200140012160000100
6020510006650109101500081005003430025492485010020050004200140012160000100
6020510006650109101500081005003420117088453361613433561200140012160000100
6020410003350101101500001005000030025492485010020050004200140012160000100
6020410003350101101500001005000030025492485010020050004200140012160000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
6002410003350011115000010500002036918765058725498865561250712317500862014001216000010
6002410003350011115000010500000300254924850010200500002014000016000010
6002410003350011115000010500000300254924850010200500002014000016000010
6002410003350011115000010500000340254958050045200500442014000016000010
6002410003350011115000010500000300254924850010200500002014000016000010
6002410003350011115000010500000300254924850010200500002014000016000010
6002410003350011115000010500000300254924850010200500002014000016000010
6002410003350011115000010500000300254924850010200500002014000016000010
6002410003350011115000010500000300254924850010200500002014000016000010
6002410003350011115000010500000300254924850010200500002014000016000010

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
602048003550101101500001005000030020292715010020050004200140012160000100
602048003550101101500001005000030020292745010020050004200140012160000100
602058006850117101500161005004230020295675014220050052200140012160000100
602048003550101101500001005000030020292775010020050004200140012160000100
602048003550101101500001005000030020292775010020050004200140012160000100
602048003550101101500001005000030020292775010020050004200140012160000100
602048003550101101500001005000030020292775010020050004200140012160000100
602048003550101101500001005000030020292775010020050004200140012160000100
602048003550101101500001005000030020292775010020050004200140012160000100
602048003550101101500001005000030020292775010020050004200140012160000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
600248003550011115000010500003020292755001020500042014001216000010
600248003550011115000010500003020295675005220500522014000016000010
600248003550011115000010500003020292775001020500002014000016000010
600248003550011115000010500003020292775001020500002014000016000010
600248003550011115000010500003020292775001020500002014000016000010
600248003550011115000010500003020292775001020500002014000016000010
600248003550011115000010500003020292775001020500002014000016000010
600248003550011115000010500003020292775001020500002014000016000010
600248003550011115000010500003020292775001020500002014000016000010
600248003550011115000010500003020292775001020500002014000016000010

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0037

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
602046003750101101500001005000030015092515010020050007200140020160000100
602046003750101101500001005000030015092855010020050007200140020160000100
602046003750101101500001005000030015092855010020050007200140020160000100
602046003750101101500001005000030015092855010020050007200140020160000100
602046003750101101500001005000030015092855010020050007200140020160000100
602046003750101101500001005000030015092855010020050007200140020160000100
602046003750101101500001005000030015092855010020050007200140020160000100
602046003750101101500001005000030015092855010020050007200140020160000100
602046003750101101500001005000030015092855010020050007200140020160000100
602046003750101101500001005000030015092855010020050007200140020160000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0037

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
600246003750011115000010500003015092675001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014017416000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015098595011820501272014018816000010
600256007250039115002810500543015092855001020500072014002016000010

Test 6: Latency 1->5

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v4.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
60204400395010110150000100500033009815215010320050007200140020160000100
60204400395010110150000100500033009817485010320050007200140020160000100
60204400395010110150000100500033009820555017820050088200140020160000100
60204400395010110150000100500033009817485010320050007200140020160000100
60204400395010110150000100500033009817485010320050007200140020160000100
60204400395010110150000100500033009817485010320050007200140020160000100
60204400395010110150000100500033009817485010320050007200140020160000100
60204400395010110150000100500033009823685017520050085200140020160000100
60204400395010110150000100500033009817485010320050007200140020160000100
60204400395010110150000100500033009816375010320050009200140020160000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
60024400395001111500001050003309815135001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010

Test 7: Latency 1->6

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v5.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
60204 | 100033 | 50101 | 101 | 50000 | 100 | 50000 | 300 | 2549248 | 50100 | 200 | 50004 | 200 | 140012 | 1 | 60000 | 100
60204 | 100033 | 50101 | 101 | 50000 | 100 | 50000 | 300 | 2549248 | 50100 | 200 | 50004 | 200 | 140012 | 1 | 60000 | 100
60204 | 100033 | 50101 | 101 | 50000 | 100 | 50000 | 300 | 2549248 | 50100 | 200 | 50004 | 200 | 140012 | 1 | 60000 | 100
60204 | 100033 | 50101 | 101 | 50000 | 100 | 50000 | 300 | 2549248 | 50100 | 200 | 50004 | 200 | 140012 | 1 | 60000 | 100
60204 | 100033 | 50101 | 101 | 50000 | 100 | 50000 | 300 | 2549248 | 50100 | 200 | 50004 | 200 | 140012 | 1 | 60000 | 100
60205 | 100067 | 50109 | 101 | 50008 | 100 | 50034 | 300 | 2549248 | 50100 | 200 | 50004 | 200 | 140012 | 1 | 60000 | 100
60204 | 100033 | 50101 | 101 | 50000 | 100 | 50000 | 300 | 2549248 | 50100 | 200 | 50004 | 200 | 140012 | 1 | 60000 | 100
60204 | 100033 | 50101 | 101 | 50000 | 100 | 50000 | 300 | 2549248 | 50100 | 200 | 50004 | 200 | 140012 | 1 | 60000 | 100
60204 | 100033 | 50101 | 101 | 50000 | 100 | 50000 | 300 | 2549248 | 50100 | 200 | 50004 | 200 | 140012 | 1 | 60000 | 100
60204 | 100033 | 50101 | 101 | 50000 | 100 | 50000 | 300 | 2549248 | 50100 | 200 | 50004 | 200 | 140012 | 1 | 60000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
735001289876128178014895245287297490433025492485001020500002014013016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010

Test 8: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v1.16b, 0
  tbx v1.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v2.16b, 0
  tbx v2.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v3.16b, 0
  tbx v3.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v4.16b, 0
  tbx v4.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v5.16b, 0
  tbx v5.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v6.16b, 0
  tbx v6.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v7.16b, 0
  tbx v7.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12
  movi v12.16b, 13

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40020416008932010110132000010032001203000348455632011220003200152009602371400000100
40020416004732010110132000010032001203000350716932011220003200152009600451400000100
40020616012832018110132008010032012503000350716932011220003200152009602371400000100
40020516009232014210132004110032007403000350716932011220003200152009600451400000100
40020416004732010110132000010032001203000275143732017420003200792009602371400000100
40020616012832018010132007910032012703000349746732011220003200152009602341400000100
40020416004732010110132000010032001203000350716932011220003200152009600451400000100
40020416004732010110132000010032001203000350716932011220003200152009600451400000100
40020416004732010110132000010032001203000350716932011220003200152009600451400000100
40020516008932014310132004210032007303000350716932011220003200152009600451400000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
4000241600773200111132000010320012030032958673200222003200152009600451040000010
4000241600473200111132000010320012030033919693200222003200152009600451040000010
4000241600473200111132000010320012030033919693200222003200152009600451040000010
4000241600473200111132000010320012030033919693200222003200152009600451040000010
4000241600473200111132000010320012030033919693200222003200152009600451040000010
4000251600923200511132004010320074030033919693200222003200152009600451040000010
4000241600473200111132000010320012030033919693200222003200152009600451040000010
4000241600473200111132000010320012030033919693200222003200152009602011040000010
4000241600493200121132000110320012030033919693200222003200152009604021040000010
4000241600483200111132000010320012030033915483200222003200152009600451040000010