Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

TBX (two register table, 16B)

Test 1: uops

Code:

  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 2.000

retire (01) | cycle (02) | 03 | 0b | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
20044086310018082368725200020002000551680040184087408534473377020002000600040374037111001100001073116113787200040384038403840384038
20044037310000103368744201220002000551680040184037403734473377020002000600040374037111001100000073116113853200040384038403840384038
2004403730000061368725200020002000551680040184037403734473377020002000600040374037111001100001384373116113787200040384038403840384038
2004403730000061368725200020002000551680040184084408634473377020002000600040374037111001100000073116113787200040384038403840384038
2004403730000061368725200020002000551680040184037403734473377021522000600040374037111001100000073116113787200040384038403840384038
200440373000249061368725200020002000551680040184037403734473377021522000600040374037111001100000073116113787200040384038403840384038
2004403731000061368725200020002000551680040184037403734473377020002000600040374037111001100000073116113787200040384038403840384038
2004403730000061368725200020002000551680040184037403734473377020002000600040374037111001100000073116113787200040384038403840384038
20044037300033084368725200020002000551680040184037403734473377020002000600040374037111001100000073116113787200040384038403840384038
2004403730001920251368725200020002000551680040184037403734473377020002000600040374037111001100000073116113787200040384038403840384038

Test 2: Latency 1->1

Code:

  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 07 | 0a | 18 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
20204400372992200165396872520100100200001002000050057176804001840037400373717963749120100200200082006002440037400371110201100991001001000010002000111718031643398010200001004003840038400384003840038
20204400373002200165396872520100100200001002000050057176804001840037400373717963749120100200200082006002440037400371110201100991001001000010000000111718051643398570200001004003840232400384003840038
202044003730021001653968725201001002004810020000500571768040018402774003737172337495201002002000020060000400374003711102011009910010010000100000900007120516510397870200001004003840184400384008640038
20204400373001100165396872520100100200001002000050057176804001840037400373717233749520406202200002006000040037402321110201100991001001000010003000000712061666397870200001004003840038400384003840038
2020440037300110121107396872520100100200001002000050057176804001840037400373717233749520100200200002006000040037400371110201100991001001000010000000000712071666397870200001004003840038400384003840038
20204400373001102191730396872520100104200001002000050057176804001840037400373717233749520100200200002006000040037400371110201100991001001000010000000000712061655397870200001004003840038400384003840038
20204400373001100165396872520100100200001002000050057176804005440037400373717233749520100200200002006000040037400371110201100991001001000010020000000712075366397870200001004003840038400384003840038
20204400373001100165396872520100100200001002000050057176804001840037400373717233749520100200200002006000040037400371110201100991001001000010000081660000712061664397872200001004008640038400384003840038
20204400373001100165396872520100100200001002000050057176804001840037400373717233749520100200200002006000040037400371110201100991001001000010001090000712051676397870200001004003840038400384003840038
202044003730011078165396872520100100200001002000050057176804001840037400373717233749520100200200002006000040037400371110201100991001001000010001000000712041666397870200001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 0a | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
200244003730000000006139687252001010200001020000505717680040018400374003737194337517200102020000206000040037400371110021109101010000100000000064007163339787020000104003840038400384003840038
200244003730000000006139687252001010200001020000505717680040018400374003737194337517200102020000206000040037400371110021109101010000100000000064003163339787020000104003840038400384003840038
200244003729900000006139687252001010200001020000505717680040018400374003737194337517200102020660206000040037400371110021109101010000100000000064003163339787020000104003840038400384003840038
200244003730000000005466395662412007010201321021672505731480140414405584061237211493773221682202098222654424065640572121100211091010100001040201022560078107704340195020000104062240623403274023240086
2002440085299000000059493963224320142122014412216727257317931404504060640617372225637745216822221968226557440617405771311002110910101000010222012244832857081337740009220000104033040622403294042740330
2002440572302010110006139687252001010200001020000505717680140414404644023237214737686213802220980206490840230404743110021109101010000100000100078206974339972020000104028140474404264033340475
200244026730400110754088061396872520010102000010200005057176800404504046540621372154437572219902221640206688840525403161211002110910101000010000000239800847061325940027020000104003840038400384003840038
20024400373110009617167926139687252014410201921021520835731793040414404654052537224337517200102020000206000040037400371110021109101010000102402000064003163439787020000104003840038400384003840038
200244003731100000306139687252001010200001020000505717680040065400374003737194337517200102020000206000040037400371110021109101010000100000003064003163339787020000104003840038400384003840038
200244003730000000006139687252001010200001020000505717680040018400374003737194337517200102020000206000040037400371110021109101010000100000000064003163339787020000104003840038400384003840038

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
40204600384491010005465281455968726301251253000012530000625858762706001906003860038546710354995301252003000020080000600386003811302011009910010030000100000000000001910216225980125400001006003960039600396003960039
402046003844900000000615968726301251253000012530000625858762706001906013960246546710354995301252003000020080000600386003811302011009910010030000100000000300001910216225980125400001006003960039600396003960039
40204600384500000004530615968726301251253000012530000625858762706001906003860038546710354995301252003000020080000600386003811302011009910010030000100000000000001910216225980125400001006003960039600396003960039
402046003845000000000615968726301251253000012530000625858762706001906003860038546710354995301252003000020080000600386003811302011009910010030000100000000000001910216125980125400001006003960039600396003960039
4020460038450000000006159687263012512530000125300006258587627060019060038600385467103549953012520030000200817706003860038113020110099100100300001000000502000001938216225980125400001006003960039600396003960039
4020460038451010000006159667263012512530000125300006258587627060019360038600385467101255040301252003000020080000600386003811302011009910010030000100000000000001910216225980125400001006003960039600396003960039
4020460038450000000006159687165301641253000012530000625858762706005606003860038546710354995301252003000020080000600386003811302011009910010030000100000000020001910216225980125400001006003960039600396003960039
402046003844900000030615968753301251253000012530000625859504906001906003860038546530354995301252023000020080000600386003811302021009910010030000100000000000001910216625980125400001006009560039600396003960039
4020460038450000000006159687161301251253000012530000625858762706001936003860038546710354995301252003000020080000600386003871302021009910010030000100000070000001910216225980125400001006003960039603506003960039
4020460038450000060006159687263012512530000130300006258587627060019060038600385467103549953027820630000200800006003860038113020110099100100300001000032141234300001910216235980125400001006003960039600396003960039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c5 | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
4002460038450000000061596872630013133000013300006585876270600190600386003854693035501730013203000020800006003860038113002110910103000010000000300189002172259801340000106003960039600396003960039
40024600384490000681007265968726300131330000133000065858762716001906003860038546930355017300132030000208000060038600381130021109101030000100000701500189003172259801340000106003960039600396003960039
4002460299449000000010359687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001000000023100189002172259801340000106003960039600396003960039
40024600384500000000251596872630013133000013300006585876271600190600386003854693035501730013203000020800006003860038113002110910103000010000000600189002172259801340000106003960039600396003960039
40024600384490000000615967826300131330000133000065858762716001906003860092546910355017300132030000208000060038603501130021109101030000100000001800189002172359801340000106003960039600396003960039
4002460038450000000061596872630013133000013300006585876270600190600386003854693035511230013203000020800006003860038113002110910103000010000000300189002172259801340000106014260092600396003960039
400246003844900000001475968726300131330000133000065858762716001906003860038546930355017300132030000208000060038600381130021109101030000100000009300189002172259801340000106003960039600396003960039
400246003844900000026159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001000001021300189002172559801340000106003960039600936003960039
400246003845001000006159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001000000021900189003172259801340000106003960039600396003960039
400246003845000000006159687263001313300001330000658587627160019060038600385469303550173001320300002080000600386003811300211091010300001000000022500189003172259801340000106003960039600396003960039

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0040

retire (01) | cycle (02) | 03 | 07 | 08 | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
4020440043300011200128439659833021312530083125300006425700933040021400404007034731033504230125200300002008000040040400401130201100991001003000010000001910216223984425400001004004440041400414010140044
402044007029900000008439707263012612530002125300006255701226140024400434004034685033499730125200300002008000040043400401130201100991001003000010000001910216223984429400001004004140041400414004140041
4020440040300000000225439707263012712530002125300006255700760040024400434004034716033499730125200300002008000040040400431130201100991001003000010000001910216223984425400001004004140041400414004140086
402044004029900000016139699263013512530000125300006255700933140021400404004034685033500030125200300002008000040043400431130201100991001003000010003001910216223984425400001004004140041400414004140041
40204400403000000001101239699263012712530002125300006255700933040021400404004034685033499730125200300002008000040043400401130201100991001003000010000001910216223984425400001004004440041400414004140041
4020440085300000000116539707263012612530000125300006255700760040066400404004034685033500030125200300002008126440040400401130201100991001003000010000001910216223984425400001004004140071400864007140041
402044004030000000006139699263012712530000125300006255700305040021400434004034685733500030125200300002008000040085400401130201100991001003000010000001910216223987425400001004008640044400444004140074
402044004030000000016139699263012512530000125300006255700933040021400404004034685033499730125200300002008000040040400401130201100991001003000010000001910216223984425400001004004140044400414004740041
402044004030000002220025139707263013512530010125300006255700933140021400434007034688033499730125200300002008000040070400701130201100991001003000010000001910216223984425400001004004140041400414004140041
402044004030000000008939699263012612530000125300006255700933040021400404004334685033499730125200300002008000040040400431130201100991001003000010000001910216223988925400001004004140041400444004140044

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0040

retire (01) | cycle (02) | 03 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
400244004030000613970754300491330001133000065570030514002104004340040347083350193001320300002080000400854004011300211091010300001000018904173339846340000104004140041400414004140041
40024400403001320613970726300131330000133000065570093314002104004040040347083350193001320300002080000400434004011300211091010300001000018903173339846340000104004140041400414004140044
400244004330001613970726300131330000133000065570093314002104004040043347083350253001320300002080000400404004011300211091010300001010018903173339843340000104004440044400414004440047
4002440040300001283971026300131330001133000065570130014002104004040040347083350193001320300002080000400404004311300211091010300001000018903174339843340000104004440041400414004140041
400244004329900613970726300491330000133000065570093314002104004040040347083350193001320300002080000400404004011300211091010300001000018903173339843340000104004140044400414004140041
4002440046300010613971026300131330000133000065570122604002104004340043347083350493001320300002080000400404004011300211091010300001000018903173339846340000104004740041400444004440044
4002440046300002143970726300131330000133000065570475214002104004040040347113350223001320300002080000400404004011300211091010300001000018903174339843340000104004140041400714004440044
4002440040299001053970726300131330000133000065570093304002104004640040347113350193001320300002080000400404008511300211091010300001000018903173439843340000104004140041400414004140041
4002440040300001743970726300131330010133000065570122604002104004040040347113350253001320300002080000400404004311300211091010300001000018904173339843340000104004440044400414004140044
400244004330002613970726300131330001133000065570093304002104004040040347083350193001320300002080000400404004011300211091010300001000018903173339843340000104004140041400414004140044

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire (01) | cycle (02) | 03 | 07 | 0a | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a7 | a8 | a9 | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
40204600384491102645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000001912516555980125400001006003960039600396003960039
402046003845011012545968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000001912516565980125400001006003960039600396003960039
4020460038449110115259687263012512530000125300006258587627600196003860038546710354995301252003000020080000600386003811302011009910010030000100000002101912516355980125400001006003960039600396009060039
40204600384501101645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000901912616555994925400001006003960039600396003960039
40204600914511101645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000001913416445980125400001006003960039600396003960039
402046003845011121645968726301251253000012530000625858762760019600386003854671035499530125200300002048000060038600911130201100991001003000010000000001913650535980125400001006003960039600396003960039
402046003845011121645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010024004405601912416785980125400001006003960039600396003960039
40204600384501101645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000001912516355980125400001006003960039600396003960039
4020460038449110110035968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000010001912516555980125400001006003960039600396003960039
40204600384511101645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000001913616535980125400001006003960039600396003960039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire (01) | cycle (02) | 03 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
4002460038450006159687263001313300001330000658587627160019060038600385469303550173001320300002080000600386003811300211091010300001000018907177759801340000106003960039600396003960039
4002460038450006159687263001313300001330000658587627160019060038600385469303550173001320300002080000600386003811300211091010300001000018904174759801340000106003960039600396003960039
40024600384500011859687263001313300121330000658587627160019060038600385469303550173001320300002080000600386003811300211091010300001000018904174759801340000106003960039600396003960039
4002460038450006159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001000018904174759801340000106003960039600396003960039
4002460038450006159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001006018907177759801340000106003960039600396003960039
40024600384500025159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001000018907177559801340000106003960039600396003960039
4002460038449006159687263001313300001330000658587627160019060038600385469303550173001320300002080880600896003811300211091010300001000018904177459801340000106003960039600396003960039
4002460038449006159687263001313300001330000658587627160019060038600385469003550173001320301682080000601406003811300211091010300001000018907177459801340000106019660039600396003960039
40024601394500132126659687263001313300001330000818588904060093060038600385468703550173001320300002080000600906003811300211091010300001027527018907177459801340000106008960039600396003960039
4002460038450018022159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001001018907177459801340000106003960039600396003960039

Test 6: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.16b, { v8.16b, v9.16b }, v10.16b
  movi v1.16b, 0
  tbx v1.16b, { v8.16b, v9.16b }, v10.16b
  movi v2.16b, 0
  tbx v2.16b, { v8.16b, v9.16b }, v10.16b
  movi v3.16b, 0
  tbx v3.16b, { v8.16b, v9.16b }, v10.16b
  movi v4.16b, 0
  tbx v4.16b, { v8.16b, v9.16b }, v10.16b
  movi v5.16b, 0
  tbx v5.16b, { v8.16b, v9.16b }, v10.16b
  movi v6.16b, 0
  tbx v6.16b, { v8.16b, v9.16b }, v10.16b
  movi v7.16b, 0
  tbx v7.16b, { v8.16b, v9.16b }, v10.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5005

retire (01) | cycle (02) | 03 | 0b | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | a9 | ac | c2 | cf | d5 | d6 | d9 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2402044006564400032602616012512516000012516000065015999600040070400434004399730355557160125200160000200480000400434004311160201100991001001600001000000001011012970114004062400001004041340165868844004440044
240204400433000006702616012512516000012516000065015999600040024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
2402044004330000015302616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
2402044004330000010902616012512516000012516000065015999600140024400434004399738807310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044401114004487213
240204400436510008602616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
240204400433000009002616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
240204400432990006702616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
2402044004330000081702616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
24020440043300000137902616012512516000012516000065015999600185650400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
2402044004330000010702616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400438567311160201100991001001600001000000001011011601140040252400001004004440044400444004440044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2400244004330000000001750261600131316000013160000681599960114002440043400439996031002216001320160000204800004004340043111600211091010160000100000100223111917211515400403152240000104004440044400444004440044
2400244004330000000002860261600131327757913160000681599960114002440043400439996031002216001320160000204800008636740043111600211091010160000100000100223111417211157400403152240000104004440044400444004440044
24002440043300000001161670514026160013131600001316000068159996011400244004340043493630310022160013201600002048000040043400431116002110910101600001020031002432241722115104004031515240000104004440163400444004440044
2400244011130011301920013302616001313160000131600006815999601140024400434004399968985310022160013201600002048000084678400431116002110910101600001000001002231151722169400403304240000104004440044400444004440044
24002440043300000000015402616001313160000131600006812043078014002488092400434852603100221600132016000020480000400438707111160021109101016000010000010024622917421512400403152240000108722840044400444004440044
240024400436590000000346026160013131600001316000068159996001400244004340043999603100221600132016000020480000860934004311160021109101016000010000310022321817411149400403152240000104004440044400444004440044
2400244004330000000004902616001313160000131600006815999600140024400434004399960371004316001320160000204800004004340043111600211091010160000100000100243111017221109400403152240000104004440044400444004440044
24002486367647000120001370471601151116000013160000681600968114002440043401094784289723100221600132016000020480000400434004311160021109101016000010002310022311917211116400403152240000104004440044400444004440044
240024400433000000000490261600131316000013160000661599960114002486093400439996035634616001320160000204800004004340043111600211091010160000100003100223111417211109400403152240000104004440044400444004440044
2400244004329900012000490261600131316000013160000681168551411400248809240043999603100221600132016000020480000400434004311160021109101016000010003010022311517211810400403152240000104004440044400444004440044