Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (two register table, 8B)

Test 1: uops

Code:

  tbx v0.8b, { v1.16b, v2.16b }, v3.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 2.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000

Test 2: Latency 1->1

Code:

  tbx v0.8b, { v1.16b, v2.16b }, v3.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20204400332010110120000100200003001019248201002002000620060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206014412000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b }, v3.8b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40204600333010110130000100300003001529248301002003000420080008140000100
40204600333010110130000100300003001529580301342003004420080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080011140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40205600663010910130008100300343001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40024600333001111300001030000301529248300102030004208000814000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529580300442030045208000014000010
40024600333001111300001030000301529248300102030000208000014000010

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b }, v3.8b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40204400353010110130000100300003001009225301002003000720080019140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009491301502003006320080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100
40204400353010110130000100300003001009243301002003000620080016140000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009486300592030062208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010
40024400353001111300001030000301009243300102030000208000014000010

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b }, v3.8b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40204600333010110130000100300003001529248301002003000420080011140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100
40204600333010110130000100300003001529580301342003004520080008140000100
40204600333010110130000100300003001529248301002003000320080008140000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40024600333001111300001030000301529248300102030004208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010
40024600333001111300001030000301529248300102030000208000014000010

Test 6: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.8b, { v8.16b, v9.16b }, v10.8b
  movi v1.16b, 0
  tbx v1.8b, { v8.16b, v9.16b }, v10.8b
  movi v2.16b, 0
  tbx v2.8b, { v8.16b, v9.16b }, v10.8b
  movi v3.16b, 0
  tbx v3.8b, { v8.16b, v9.16b }, v10.8b
  movi v4.16b, 0
  tbx v4.8b, { v8.16b, v9.16b }, v10.8b
  movi v5.16b, 0
  tbx v5.8b, { v8.16b, v9.16b }, v10.8b
  movi v6.16b, 0
  tbx v6.8b, { v8.16b, v9.16b }, v10.8b
  movi v7.16b, 0
  tbx v7.8b, { v8.16b, v9.16b }, v10.8b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
24020480102160205201160004200160010700800051160210200160013200480039101240000100
24020480036160207201160006200160012700800051160210200160013200480039101240000100
24020480036160207201160006200160012700800051160210200160013200480039101240000100
24020480036160205201160004200160010700800051160210200160013200480039101240000100
24020480036160205201160004200160010700800051160210200160013200480039101240000100
24020480036160205201160004200160010700800051160210200160013200480198101240000100
24020480036160205201160004200160010700800051160210200160013200480039101240000100
24020480036160205201160004200160010700800051160210200160013200480039101240000100
24020480036160205201160004200160010700800051160210200160013200480039101240000100
24020480036160205201160004200160010700800051160210200160013200480039101240000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
24002480453160015111600041016001030800000160010201600002004800001024000010
24002480058160011111600001016000030800000160010201600002004800001024000010
24002480036160011111600001016000030800000160010201600002004801921024000010
24002480036160011111600001016000030800000160010201600002004800001024000010
24002580074160049111600381016005430800000160010201600002004800001024000010
24002480036160011111600001016000030800000160010201600002004800001024000010
24002480036160011111600001016000030800000160010201600002004800001024000010
24002480037160011111600001016000030800000160010201600002004800001024000010
24002480036160011111600001016000030800000160010201600002004800001024000010
24002480036160011111600001016000030800000160010201600002004800001024000010