Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (two register table, 16B)

Test 1: uops

Code:

  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 2.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000

Test 2: Latency 1->1

Code:

  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20204400332010110120000100200003001019248201002002000620060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060144120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000620060012120000100
20204400332010110120000100200003001019580201342002004620060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
200244003320011112000001020000301019248200102020006206001812000010
200244003320011112000001020000301019248200102020000206000012000010
200244003320011112000001020000301019248200102020000206000012000010
200244003320011112000001020000301019248200102020000206000012000010
200244003320011112000001020000301019248200102020000206000012000010
200244003320011112000001020000301019248200102020000206000012000010
200244003320011112000001020000301019248200102020000206000012000010
200244003320011112000001020000301019248200102020000206000012000010
200244003320011112000001020000301019248200102020000206000012000010
200244003320011112000001020000301019248200102020000206000012000010

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40204600333010110130000100300003001529248301002003000420080011140000100
40204600333010110130000100300003001529248301002003000420080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080118140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40204400353010110130000100300003001009232301002003000720080019140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40205400713012510130024100300493001009232301002003000620080016140000100
40204400353010110130000100300003001009230301002003000720080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40024400353001111300001030000301009228300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208016914000010
40024400353001111300001030000301009224300102030006208001614000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301010249301142030111208000014000010

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40204600333010110130000100300003001529248301002003000420080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080120140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40024600333001111300001030000030015292483001020030004208000014000010
40024600333001111300001030000030015292483001020030000208000014000010
40024600333001111300001030000030015292483001020030000208000014000010
40024600333001111300001030000030015292483001020030000208000014000010
40024600333001111300001030000030015292483001020030000208000014000010
40024600333001111300001030000030015292483001020030000208000014000010
40024600333001111300001030000030015292483001020030000208000014000010
40024600333001111300001030000030015292483001020030000208000014000010
40024600333001111300001030000030015292483001020030000208000014000010
40024600333001111300001030000030015292483001020030000208011714000010

Test 6: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.16b, { v8.16b, v9.16b }, v10.16b
  movi v1.16b, 0
  tbx v1.16b, { v8.16b, v9.16b }, v10.16b
  movi v2.16b, 0
  tbx v2.16b, { v8.16b, v9.16b }, v10.16b
  movi v3.16b, 0
  tbx v3.16b, { v8.16b, v9.16b }, v10.16b
  movi v4.16b, 0
  tbx v4.16b, { v8.16b, v9.16b }, v10.16b
  movi v5.16b, 0
  tbx v5.16b, { v8.16b, v9.16b }, v10.16b
  movi v6.16b, 0
  tbx v6.16b, { v8.16b, v9.16b }, v10.16b
  movi v7.16b, 0
  tbx v7.16b, { v8.16b, v9.16b }, v10.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
2402048009716020720116000620016001216015753754380027016067357919116006720004800391010240000100
2402048003616020720116000620016001207000800270160258200016006820004800511010240000100
2402058007216024120116004020016005607000800051160210200016001380735402411132341623402053973862
240204800361602052011600042001600101858105194648326455301342684414217112911820004800511010240000100
2402048003616020720116000620016001207000800051160210200016001320004800391010240000100
2402048003616020520116000420016001007000800270160258200016006720004800391010240000100
2402048003616020520116000420016001007000800051160210200016001320004800391010240000100
2402048003616020520116000420016001007000800051160210200016001320004800391010240000100
2402048003616020520116000420016001007000800051160210200016001320004800391010240000100
2402048003616020520116000420016001007000800051160210200016001320004800391010240000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
2400248040716001511160004010160010308000001600102016000020480000124000010
2400248013516001111160000010160000308000001600102016000020480000124000010
2400248003716001111160000010160000308000001600102016000020480000124000010
2400248004416001111160000010160000308000001600102016000020480000124000010
2400248003616001111160000010160000308000001600102016000020480000124000010
2400248003616001111160000010160000308000001600102016000020480000124000010
2400248003616001111160000010160000308000001600102016000020480000124000010
2400248003616001111160000010160000308000001600102016000020480000124000010
2400248003616001111160000010160000308000001600102016000020480000124000010
2400248003616001111160000010160000308000001600102016000020480000124000010