Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (four register table, 8B)

Test 1: uops

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 4.000

Issues: 4.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 4.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
4004803340011400040002032484000400000120001040000
4004803340011400040002032484000400000120001040000
4004803340011400040002032484000400000120001040000
4004803340011400040002032484000400000120001040000
4004803340011400040002032484000400000120001040000
4004803340011400040002032484000400000120001040000
4004803340011400040002032484000400000120001040000
4004803340011400040002032484000400000120001040000
4004803340011400040002032484000400000120001040000
4004803340011400040002032484000400000120001040000

Test 2: Latency 1->1

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
4020480033401011014000001004000030020392484010020040008200120024140000100
4020480033401011014000001004000030020392484010020040008200120024140000100
4020480033401011014000001004000030020392484010020040008200120024140000100
4020580066401091014000801004003430020392484010020040008200120024140000100
4020480033401011014000001004000030020392484010020040008200120024140000100
4020480033401011014000001004000030020392484010020040008200120024140000100
4020480033401011014000001004000030020392484010020040008200120024140000100
4020480033401011014000001004000030020392484010020040008200120024140000100
4020480033401011014000001004000030020392484010020040008200120024140000100
4020480033401011014000001004000030020392484010020040008200120024140000100

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
400248003340011114000010400003020392484001020400082012000014000010
400248003340011114000010400003020392484001020400002012000014000010
400248003340011114000010400003020392484001020400002012000014000010
400248003340011114000010400003020392484001020400002012000014000010
400248003340011114000010400003020392484001020400002012000014000010
400248003340011114000010400003020392484001020400002012000014000010
400248003340011114000010400003020392484001020400002012000014000010
400248003340011114000010400003020392484001020400002012000014000010
400248003340011114000010400003020392484001020400002012000014000010
400248003340011114000010400003020392484001020400002012002414000010

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
6020410003350101101500001005000003000254958050134200050044200140012160000100
6020410003350101101500001005000003000254924850100200050004200140012160000100
6020410003350101101500001005000003000254924850100200050004200140012160000100
6020410003350101101500001005000003000254924850100200050004200140012160000100
6020410003350101101500001005000003000254924850100200050004200140012160000100
6020410003350101101500001005000003000254924850100200050004200140012160000100
6020410003350101101500001005000003000254924850100200050004200140012160000100
6020410003350101101500001005000003000254958050134200050044200140012160000100
6020410003350101101500001005000003000254924850100200050004200140012160000100
6020410003350101101500001005000003000254958050134200050046200140012160000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
600241000335001111500001050000302549248500102050000200140000106000010
600241000335001111500001050000302549248500102050000200140000106000010
600241000335001111500001050000302549248500102050000200140000106000010
600241000335001111500001050000302549248500102050000200140000106000010
600241000335001111500001050000302549248500102050000200140000106000010
600241000335001111500001050000302549248500102050000200140000106000010
600241000335001111500001050000302549248500102050004200140130106000010
600251000665001911500081050034302549248500102050000200140000106000010
600241000335001111500001050000302549248500102050000200140000106000010
600241000335001111500001050000302549248500102050000200140124106000010

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
602048180950844109507351085156003140204680451616204051657206144908460000100
602048181250844109507351085156003310205103851941210052000206145590460000100
602048196050909111507981105169203070204914451795202051849204145322360000100
602058208650944109508351085178203420204930351807212051851206144758460000100
602048185450861105507561045160103450204610551582214051605208144100560000100
602048194450907109507981085168403210204657151614206051648204145034360000100
60204818585086510950756108516028587841763092516134219500824726115525110199053238206143686460000100
602048090250460103503571025075703140204434751395204051411202143126260000100
602048100150506107503991065084903210203653650729206050683204141924360000100
602048089750462105503571045075703000203682250764200050731200142062160000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60024800355001111500001050000302029275500102050004200140012106000010
60024800355001111500001050000342029567500532050052200140000106000010
60024800355001111500001050000302029277500102050000200140000106000010
60024800355001111500001050000302029277500102050000200140000106000010
60024800355001111500001050000302029277500102050000200140000106000010
60024800355001111500001050000302029277500102050000200140000106000010
60024800355001111500001050000302029277500102050000200140000106000010
60024800355001111500001050000302029277500102050000200140000106000010
60024800355001111500001050000302029567500522050052200140000106000010
60024800355001111500001050000302029277500102050000200140000106000010

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0037

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
6020460037501011015000001005000030015092465010020050007200140020160000100
6020460037501011015000001005000030015092855010020050007200140020160000100
6020460037501011015000001005000030015092855010020050007200140020160000100
6020460037501011015000001005000030015092855010020050007200140020160000100
6020460037501011015000001005000030015092855010020050007200140020160000100
6020560074501301015002901005005430015092855010020050007200140020160000100
6020460037501011015000001005000030015092855010020050007200140020160000100
6020460037501011015000001005000030015092855010020050007200140020160000100
6020460037501011015000001005000030015092855010020050007200140020160000100
6020460037501011015000001005000030015092855010020050007200140020160000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0037

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
600246003750011115000010500003015092685001020500072014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010
600246003750011115000010500003015092855001020500002014000016000010

Test 6: Latency 1->5

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v4.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
60204400395010110150000100500033009814755010420050009200140020160000100
60204400395010110150000100500033009829355025720050163200140026160000100
60204400395010110150000100500033009817485010320050007200140470160000100
60204400395010110150000100500033009817485010320050007200140020160000100
60204401565021510350112102501563009817485010320050007200140020160000100
60204401565021510150114100501573009817485010320050007200140242160000100
60204400395010110150000100500033009827475025920050165200140020160000100
60204400395010110150000100500033009826095026120050167202140680260000100
60204402175027210350169102502343009829105033720050243200140230160000100
60204400395010110150000100500033009817485010320050007200140020160000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
60024400395001211500011050004309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010
60024400395001111500001050003309815135001020500002014000016000010
60024400395001111500001050000309817335001020500002014000016000010

Test 7: Latency 1->6

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v5.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
602051000665010910150008100500343002549248501002005000420001400121060000100
602041000335010110150000100500003002549248501002005000420001400121060000100
602041000335010110150000100500003002549248501002005000420001400121060000100
602041000335010110150000100500003002549248501002005000420001400121060000100
602041000335010110150000100500003002549248501002005000420001400121060000100
602041000335010110150000100500003072549580501362025004420001401241060000100
602041000335010110150000100500003002549248501002005000420001400121060000100
602041000335010110150000100500003002549248501002005000420001400121060000100
602041000335010110150000100500003002549248501002005000420001400121060000100
602041000335010110150000100500003002549248501002005000420001400121060000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
60024100033500111150000010500003025492485001020500002014000016000010
60026100099500271150016010500683025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60024100033500111150000010500003025492485001020500002014000016000010
60025100066500191150008010500343025492485001020500002014000016000010

Test 8: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v1.16b, 0
  tbx v1.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v2.16b, 0
  tbx v2.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v3.16b, 0
  tbx v3.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v4.16b, 0
  tbx v4.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v5.16b, 0
  tbx v5.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v6.16b, 0
  tbx v6.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v7.16b, 0
  tbx v7.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12
  movi v12.16b, 13

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
400205160116320141101320040010032007430034974673201122003200152009602341400000100
400204160060320102101320001010032001230035071693201122003200152009600451400000100
400204160047320101101320000010032001230035071693201122003200152009600451400000100
400204160047320101101320000010032001230035071693201122003200152009600451400000100
400204160047320101101320000010032001230035071693201122003200152009600451400000100
400204160047320101101320000010032001230035071693201122003200152009600451400000100
400204160047320101101320000010032001230035071693201122003200152009600451400000100
400204160047320101101320000010032001230035071693201122003200152009600451400000100
400204160074320101101320000010032001230035071693201122003200152009600451400000100
400204160047320101101320000010032001230035071693201122003200152009600451400000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
400024160079320011113200000103200120300329586732002220032001520960000140000010
400024160047320011113200000103200000300339188932001020032000020960000140000010
400024160047320011113200000103200000300267234432007720032007020960000140000010
400024160047320011113200000103200000300339188932001020032000020960000140000010
400024160047320011113200000103200000300339188932001020032000020960000140000010
400024160047320011113200000103200000300339188932001020032000020960000140000010
400024160047320011113200000103200000300339188932001020032000020960000140000010
400025160087320054113200430103200740300229479732008420032007920960000140000010
400024160047320011113200000103200000300339188932001020032000020960234140000010
400024160047320011113200000103200000300264473732008220032007820960045140000010