Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (three register table, 8B)

Test 1: uops

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 3.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
30046033300113000300015224830003000900013000
30046033300113000300015224830003000900013000
30046033300113000300015224830003000900013000
30046033300113000300015224830003000900013000
30046033300113000300015224830003000900013000
30046033300113000300015224830003000900013000
30046033300113000300015224830003000900013000
30046033300113000300015224830003000900013000
30046033300113000300015224830003000900013000
30046033300113000300015224830003000900013000

Test 2: Latency 1->1

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
3020460174302362003003619930108700152924730200200300062009001810130000100
3020460033302012013000020030000700152924830200200300042009001210130000100
3020460033302012013000020030000700152924830200200300042009001210130000100
3020460033302012013000020030000687152958030233200300482009001210130000100
3020460033302012013000020030000700152924730200200300062009001210130000100
3020560066302102023000820130034700152924830200200300042009001210130000100
3020460033302012013000020030000700152924830200200300042009001210130000100
3020460033302012013000020030000700152924830200200300042009001210130000100
3020460033302012013000020030000700152924830200200300042009001210130000100
3020460033302012013000020030000700152924830200200300042009014410130000100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
30024600333001111300001030000301529248300102030004209001213000010
30024600333001111300001030000301529248300102030000209000013000010
30024600333001111300001030000301529248300102030000209000013000010
30024600333001111300001030000301529248300102030000209000013000010
30024600333001111300001030000301529248300102030000209000013000010
30024600333001111300001030000301529248300102030000209014413000010
30024600333001111300001030000341529580300452030047209000013000010
30024600333001111300001030000301529248300102030000209000013000010
30024600333001111300001030000301529248300102030000209000013000010
30024600333001111300001030000301529248300102030000209000013000010

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
502048003340101101400001004000030020392484010020040003200110009150000100
502058006640109101400081004003430020392484010020040003200110009150000100
502048003340101101400001004000030020392484010020040003200110009150000100
502048003340101101400001004000030020392484010020040003200110009150000100
502048003340101101400001004000030020392484010020040003200110009150000100
502048003340101101400001004000030020392484010020040003200110009150000100
502048003340101101400001004000030020395804013420040044200110009150000100
502048003340101101400001004000030020392484010020040003200110009150000100
502048003340101101400001004000030020392484010020040003200110009150000100
502048003340101101400001004000030020392484010020040003200110009150000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
500248003340011114000010400003020392484001020400032011000915000010
500248003340011114000010400003020392484001020400002011000015000010
500258006640020124000811400343020392484001020400002011000015000010
500248003340011114000010400003020392484001020400002011000015000010
500248003340011114000010400003020392484001020400002011000015000010
500248003340011114000010400003020392484001020400002011000015000010
500248003340011114000010400003020392484001020400002011000015000010
500248003340011114000010400003020392484001020400002011000015000010
500248003340011114000010400003020392484001020400002011000015000010
500248003340011114000010400003020392484001020400002011000015000010

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
502046003540101101400001004000030015192424010020040008200110017150000100
502046003540101101400001004000030015192544010020040006200110017150000100
502046003540101101400001004000030015192544010020040006200110017150000100
502046003540101101400001004000030015192544010020040006200110017150000100
502046003540101101400001004000030015192544010020040006200110017150000100
502046003540101101400001004000030015192544010020040006200110017150000100
502046003540101101400001004000030015192544010020040006200110017150000100
502046003540101101400001004000030015192544010020040006200110017150000100
502056007040121102400191014004530715195814014720240059200110017150000100
502046003540101101400001004000030015192544010020040006200110017150000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
500246003540011114000010400003015192544001020400062011000015000010
500246003540011114000010400003015192544001020400002011056625000010
500246003540011114000010400003015192544001020400002011000015000010
500246003540011114000010400003015197764005720400512011000015000010
500246003540011114000010400003015192544001020400002011000015000010
500246003540011114000010400003015192544001020400002011000015000010
500246003540011114000010400003015197764005720400512011000015000010
500246014240060124004811400943015192544001020400002011014315000010
500246029740131114012010402353015217414029220403122011056025000010
500246029840131114012010402353015206504019820402032011100015000010

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0037

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
5020440037401011014000010040001300999232401022004000820001100171050000100
5020440037401011014000010040001300999270401012004000620001100171050000100
5020440037401011014000010040001300999270401012004000620001100171050000100
5020440037401011014000010040001300999270401012004000620001100171050000100
5020540072401381014003710040063300999270401012004000620001100171050000100
5020440037401011014000010040001300999270401012004000620001100171050000100
5020440037401011014000010040001300999270401012004000620001102101050000100
5020440037401011014000010040001300999270401012004000620001100171050000100
5020440037401011014000010040001300999270401012004000620001100171050000100
5020440037401011014000010040001300999270401012004000620001100171050000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0074

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
50024400374001111400001040002309992294001020400002011020415000010
50024400944005611400451040069309992354001020400002011000015000010
50024400374001111400001040000309992654001020400002011057315000010
50024400374001111400001040000309992654001020400002011000015000010
50024400374001111400001040000309992654001020400002011000015000010
500244034640099114008810401323010010434027520402722011056715000010
500244026840189114017810402663010006914021020402082011057015000010
500254030440228114021710403323010017654034520403472011037725000010
50024403554010011400891040133309997654007720400682011038215000010
500244032240232114022110403313710011464027822402742011056715000010

Test 6: Latency 1->5

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v4.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
5020480033401011014000001004000030020392484010020040004200110009150000100
5020480033401011014000001004000030020392484010020040003200110009150000100
5020480033401011014000001004000030020392484010020040003200110009150000100
5020480033401011014000001004000030020392484010020040003200110121150000100
5020480033401011014000001004000030020392484010020040003200110009150000100
5020480033401011014000001004000030020392484010020040003200110009150000100
5020480033401011014000001004000030020392484010020040003200110009150000100
5020480033401011014000001004000030020392484010020040003200110009150000100
5020480033401011014000001004000030020392484010020040003200110009150000100
5020480033401011014000001004000030020392484010020040003200110009150000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
50024800334001111400001040000302039248400102040000200110000105000010
50024800334001111400001040000302039248400102040000200110000105000010
50024800334001111400001040000302039248400102040000200110000105000010
50024800334001111400001040000302039248400102040000200110000105000010
5002480033400111140000104000030203924840010204000040833611012419711850001227
50024800334001111400001040000302039580400442040045200110124205000010
50024800334001111400001040000302039248400102040000200110119105000010
50024800334001111400001040000302039248400102040000200110000105000010
50024800334001111400001040000302039248400102040000200110000105000010
50024800334001111400001040000302039248400102040000200110000105000010

Test 7: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v1.16b, 0
  tbx v1.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v2.16b, 0
  tbx v2.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v3.16b, 0
  tbx v3.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v4.16b, 0
  tbx v4.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v5.16b, 0
  tbx v5.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v6.16b, 0
  tbx v6.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v7.16b, 0
  tbx v7.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3202041200592401011012400001002400083002009644240110200240013200072003610320000100
3202041200392401011012400001002400083001919894240108200240012200072003610320000100
3202041200392401011012400001002400083001920150240166200240074200072003610320000100
3202041200392401011012400001002400083001919894240108200240012200072003610320000100
3202041200392401011012400001002400083001919894240108200240012200072003610320000100
3202041200392401011012400001002400083001919894240108200240012200072003610320000100
3202041200392401011012400001002400083001919894240108200240012200072003610320000100
3202041200392401011012400001002400083001919894240108200240012200072003910320000100
3202041200492401021012400011002400103001919758240110200240013200072003610320000100
3202041200392401011012400001002400083001919894240108200240012200072021610320000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
32002412015224001211240001102400083019196962400102024000020720000132000010
32002412003924001111240000102400003019198482400102024000020720000132000010
32002412003924001111240000102400003019198482400102024000020720000132000010
32002412003924001111240000102400003019201562401242024011420720180132000010
32002412025424017411240163102401773019198482400102024000020720213132000010
32002412010024001111240000102400003019198482400102024000020720000132000010
32002412018324011811240107102401173019198482400102024000020720171132000010
32002512014624005311240042102400663019197552400192024001220720039132000010
32002412003924001111240000102400003019198482400102024000020720000132000010
32002412003924001111240000102400003019198482400102024000020720216132000010