Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (three register table, 8B)

Test 1: uops

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 3.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
30046037450615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253000300030008386801601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253000300030008386800601860376037532235645300030009486603760371110011000000073216225824300060386038603860386038
30046037450615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225853300060386038603860386038
30046037460615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253012300030008386801601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450845687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038

Test 2: Latency 1->1

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020460037450000000072659687253010010030000100300005008587680160018600376003755929656240301002003000820090024600376003711102011009910010010000100000011171701605979600300001006003860038600386003860038
302046003745000000006159687253010010030000100300005008587680160018600376003755929656241301002003000820090024600376003711102011009910010010000100000011171701615979500300001006003860038600386003860038
302046003744900000008959687253010010030000100300005008587680160018600376003755929756241301002003000820090024600376003711102011009910010010000100000011171601615979500300001006003860038600386003860038
3020460037450000042006159687253010010030000100300005008587680160018600376003755929656241301002003000820090024600376003711102011009910010010000100000011171701605979500300001006003860038600386003860038
302046003745000000006159687253010010030000100300005008587680160018600376003755929656241301002003000820090024600376003711102011009910010010000100000011171601605979600300001006003860038600386003860083
302046003745000000006159687253010010030000100300005008587680160018600376003755929756240301002003000820090024600376003711102011009910010010000100000011171601605979600300001006003860038600386003860038
302046003744900000006159687253010010030000100300005008587680160018600376003755929756241301002003000820090024600376003711102011009910010010000100000011171701615979500300001006003860038600386003860038
302046003745000000006159687253010010030000100300005008587680160018600376003755929656241301002003000820090024600376003711102011009910010010000100000011171701615979500300001006003860038600386003860038
302046003745000000006159687253010010030000100300005008587680160018600376003755929656241301002003000820090024600376003711102011009910010010000100000011171601605979600300001006003860038600386003860038
302046003745000000006159687253010010030000100300005008587680160018600376003755932756240301002003000820090024600376003711102011009910010010000100000011171701615979500300001006003860038600386003860038

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
300246003745000000003215968725300101030000103000050858768016001806003760037559443562673001020300002090000600376003711100211091010100001000000006402162259785030000106003860038600386003860038
300246003744900000001935968725300101030000103000050858768016001806003760037559443562673001020300002090000600376003711100211091010100001000100006402162259785030000106003860038600386003860038
300246003744900000001455968725300101030000103000050858768016001806003760037559443562673001020300002090000600376003711100211091010100001000000006402162359785030000106003860038600386003860038
300246003744900000001245968725300101030000103000050858768016001806003760037559443562673016320300002090000600376003711100211091010100001000000006403162259785030000106003860038600386003860038
30024600374500000000615968725300101030000103000050858768016001806003760037559443562673001020300002090000600376003711100211091010100001000000006402162259785030000106003860038600386003860038
3002460037450000000040959533296301461530156173212877860564216055806070060713559856456528321452432311209695160716607161511002110910101000010002043026828412803260341530000106003860038600866003860038
300246003745000101848123206752595332553018012301801431672558606925160558060750607195599062564953229120324792497404604756090316110021109101010000100401033960088141795360433130000106120461147608126110061147
30024611854741000001531859566319301421430192103000050858768016073836079760811560107156565324452032473229644760713608111311002110910101000010000001998464011372259785130000106003860038600386003860038
30024600374650001120014115967625300101030000103000050858768016001806003760085559443562673001020300002090000600376003711100211091010100001000000006402162259785030000106003860038600386003860038
300246003745000000001455968725300101230000103000050859152916001806003760037559443562673001020300002090000600376003711100211091010100001000000006402162259785030000106003860038600386003860038

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50204800386000000000617968626401251254000012540000625114576400800190800388009273428773740401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
50204800385990000000617968626401251254000012540000625114576401800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
502048003859900000001297968626401251254000012540000625114576401800190800388003873428773740401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
50204800385990000000617968626401251254000012540000625114576400800190800388003873424673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
50204800385990000000617968626401251254000012540000625114576401800193800388003873431773741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
502048003860000000007267968626401251254000012540000625114576401800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
50204800386000000000617968626401251254000012540000625114576401800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
50204800385990000000617968626401251254000012540000625114576401800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039802438003980039
50204800385990004120010579686119401251254000012540608625114576401800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
502048003860000000001037968626401251254000012540000625114576400800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800928003980039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50024800385990000726796862640013134000013400006511457640008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905173579796350000108003980039800398003980039
50024800386000090726796862640013134000013400006511457842008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905175379796350000108003980039800398003980039
5002480038599000061796652640013134000013400006511457640018001908003880038734430373767400132040000201100008003880038113002110910103000010000300018903173579796350000108003980039800398003980039
5002480038600000061796862640013134000013400006511457640008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905175379796350000108003980039800398003980039
5002480038600000061796862640013134000013400006011457640008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905173579796350000108003980039800398003980039
5002480038599000061796862640013134000013400006511457640008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905175879796350000108003980039800398003980039
5002480038599000061796862640013134000013400006511457640008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905175379796350000108003980039800398003980039
5002480038599000061796862640013134000013400006511457640008001908003880038734437373767400132040000201100008009280038113002110910103000010000000018905175779796350000108003980039800398003980039
50024800915990000726796862640013134000013400006511457640118001908003880038734430373767400132040000201100008003880038113002110910103000010000000018903175379796350000108003980039800398003980039
50024800386000000536796862640013134000013400006511457640008001908003880038734437373767400132040000201100008003880038113002110910103000010000000018903175379796350000108003980039800398003980039

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0040

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50204600404500000000615969626401251254000012540000625857668706002160040600405343975374540125200400072001100206004060040113020110099100100300001000024601111916165983425500001006004160041600416004160041
50204600404500000000615969626401251254000012540000593857668706002160040600405344275374240125200400072001100206004060040113020110099100100300001000025201111916165983525500001006004160041600416004160041
50204600404500000000615969626401251254000012540000625857668706002160040600405343975374240125200400072001100206004060040113020110099100100300001000024601111917165983525500001006004160041600416004160041
50204600404500000000615969626401261254000012540000625857668706002160040600405343975374240125200400072001100206004060040113020110099100100300001000010201111916165983425500001006004160044600416004160041
502046004045000001320061596962640125125400001254000062585766871600216004360040534396537434012520040007200110020600406004011302011009910010030000100006001111917165983825500001006004160041600446004160041
50204600404500000000615969626401251254000012540000625857668706002160040600435343965374240125200400072001100206004060040113020110099100100300001000023401111917165983825500001006004160153601536004460041
502046015045101002760079759686264012612540000125400006258576687060021600406004053442753743401252004000720011002060043600401130201100991001003000010000301111916165983525500001006004160044600416004160041
5020460040450000000072659696264012512540000125400006258577118060021600406004053439653742401252004000720011002060040600401130201100991001003000010001301111917165983425500001006004160041600416004160041
50204600434500000001615971726401251254000012540000625857668716002160040600405344275374240125200400072001100206004060040113020110099100100300001000023101111916165983725500001006004160041600416004160041
50204600434500000000615969626401251254000012540000625857668706002160040600405344275374240125200400072001100206004060040113020110099100100300001000217101111916165983425500001006004160041600416004160041

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0040

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50024600434500000006159696264001313400001341005658576687160021600406026753454235376940013204000020112341600406004011300211091010300001000009018904332259826250000106004160041600416004160041
5002460040450000005615969626400131340000134000065857668716002160040600405345435382840013204000020110000600406004011300211091010300001000000018902173259826350000106004160041600416004160041
5002460040449000000615969626400131340000134000065857668706002160040600405345435376940013204000020110000600406004011300211091010300001000000018902172259826350000106004160044600416004160041
5002460040449004000615969626400131340001134080465857668716002160040600405345435376940013204000020110000600406004011300211091010300001000013018902173259826350000106004460041600416004160044
5002460040450000001615969626400131340001134000065857668706002160040600405345435376940013204000020110000600406004011300211091010300001000000018902172259826350000106004160041600416004160041
5002460040449000000615969626400131340000134000081857668716002160040600405345435376940013204000020110000600406004011300211091010300001000000019112172259826350000106004160041600416004160041
500246021244900012006159658264001413400001340000658576687160021600406004053454455377240013204000020110000600406004311300211091010300001000200018902172259829350000106004160041600416004160041
5002460040449000000615969626400131340000134000065857668706002160040600435345435376940013204000020110000600406004011300211091010300001000000018902172259826350000106004160041600416004460044
500246004044900000011145969626400131340009134000065857672506002160040600405345435376940013204000020110000600406004311300211091010300001000006018902172259826350000106026660044600966004160041
50024600404490000006365971726400131340000134000065857711816002460150602625345445537874061520400002011000060040600401130021109101030000100001108018902492259829350000106004160041600416004160041

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0042

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020440042300000210339687264012712540003125400006255692179140023400424004233453733743401252004000720011002040042400421130201100991001003000010000001111917163988125500001004004340043400434004340043
502044004229900036139685264012712540004125400006255692424040023400424004233453633744401252004000720011002040042400421130201100991001003000010000001111917173988125500001004004340043400434004340043
502044004230000016139685264012712540002125400006255693510040023400424004233453733744401252004000720011002040042400421130201100991001003000010000001111917163988125500001004004340043400434004340043
502044004230000026139685264012812540001125400006255691797040023400424004233453733744401252004000720011003640042400421130201100991001003000010000001111917163988125500001004004340043400434004340043
502044004229900016139685264012812540003125400006255692269040023400424004233453733743401252004000720011002040042400421130201100991001003000010000001111917163988125500001004004340043400434004340043
502044004230000036139684264012712540003125400006255692152140023400424004233453733743401252004000720011003640042400421130201100991001003000010000001111916163988125500001004004340043400434004340043
50204400423000004141739684264012612540003125400006255692179140023400424004233453733743401252004000720011003640042400421130201100991001003000010000001111917163988125500001004004340043400434004340043
502044004230000006139687264012712540003125400006255691934040023400424004233453733743401252004000720011002040042400421130201100991001003000010000001111916163988125500001004004340043400434004340043
502044004230000026139685264012712540002125400006255693436040023400424004233453733743401252004000720011002040042400421130201100991001003000010000001111916163988125500001004004340043400434004340043
502044004230000006139686264012512540003125400006255692089040023400424004233453633744401252004000720011002040042400421130201100991001003000010000001111917163988125500001004004340043400434004340043

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0042

retire uop (01) | cycle (02) | 03 | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500244004230000613968526400161340001134000065569343704002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
5002440042300027683968426400131340002134000065569268204002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230003613968726400131340003134000065569268504002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230003613968426400161340003134000065569229604002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004229902613968526400171340001134000065569343604002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230001613968726400141340000134000065569226904002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230001613968426400161340002134000065569219504002340042400423346933377140013204000022110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230004613968426400151340002134000065569228704002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230000613968526400141340001134000065569343604002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004229902613968426400161340003134000065569219504002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043

Test 6: Latency 1->5

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v4.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50204800386000012103796862640125125400001254000062511457640008001908003880038734287737414012520040007200110020800388003811302011009910010030000100006111195900316537980925500001008003980039800398003980039
5020480038599001261796862640125125400001254000062511457640108001908003880038734287737404012520040007200110020800388003811302011009910010030000100000111191700216437980925500001008003980039800398003980039
5020480038599001261796862640125125400001254000062511457640108001908003880038734286737414012520040007200110020800388003811302011009910010030000100040111191700316347980925500001008003980039800398003980039
502048003859900061796862640125125400001254000062511457640008001908003880038734286737414012520040007200110020800388003811302011009910010030000100000111191600416447980925500001008003980039800398003980039
502048003860000061796862640125125400001254000062511457640108001908003880038734286737414012520040007200110020800388003811302011009910010030000100000111191600316377980925500001008003980039800398003980039
5020480038599010298796862640125125400001254000062511457640108001938003880038734287737404012520040007200110020800388003811302011009910010030000100000111191600316337980925500001008003980039800398003980039
502048003859900061796862640125125400001254000062511457640008001908003880038734286737414012520040007200110020800388003811302011009910010030000100000111191700116337980925500001008003980039800398003980039
502048003860000061796862640125125400001254000062511457640108001908003880038734287737414012520040007200110020800388003811302011009910010030000100009111191700316317980925500001008003980039800398003980039
502048003859900061796862640125125400001254000062511457640008001908003880038734286737414012520040007200110020800388003811302011009910010030000100023111191700416437980925500001008003980039800398003980039
502048003859900061796862640125125400001254000062511457640108001908003880038734286737414012520040007200110020800388003811302011009910010030000100000111191600316337980925500001008003980039800398003980039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002480038600100000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000002018903173379796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018904173379796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398003980039
5002480038600000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398019480039
5002480038599000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398003980039
5002480038600000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398003980039
5002480038600000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903174379796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903174379796350000108003980039800398003980039
50024800386000000000726796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398003980039

Test 7: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v1.16b, 0
  tbx v1.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v2.16b, 0
  tbx v2.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v3.16b, 0
  tbx v3.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v4.16b, 0
  tbx v4.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v5.16b, 0
  tbx v5.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v6.16b, 0
  tbx v6.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v7.16b, 0
  tbx v7.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7506

retire uop (01)cycle (02)031e1f373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
32020460057450004911552002624012512524000012524000062532995711600280600476004719973032000424012520024000020072000060047814691116020110099100100160000100000101105161160044253200001006004860048600486004860048
3202046004744900028302624012512524000012524000062532995711600940600476004719973034142624012520024000020072000060047814001116020110099100100160000100000101101161160044253200001006004860048818006004860048
3202046004744900022202624012512524000012524000062532995711600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100010101101161160044253200001006004860048600486004860048
3202046004744900020102624012512524000012524000062532995711600280817366004719973034132124012520024000020072000060047600471116020110099100100160000100000101101161160044253200001006004860048600486004881470
32020460047450000191602624012512524000012524000062532995711814050600476004741252032000424012520024000020072000060047600471116020110099100100160000100000101101161160044253200001008134960048600488142560048
3202048130245000018002624012512529063212524000062532995711600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100000101101161160044253200001006004860048600486004860048
32020460047450000108802624012512524000012524000062532995710600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100000101101161160044253200001006004860048815296004860048
3202048173645000022202624012512524000012524000062532995710600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100000101101161160044253200001008173760048600486004860048
320204809824500049553106202624012512528989912524000062532995710600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100000101101161160044253200001006004860048600486004860048
32020460047450000157026240125125240000125240000640114346831600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100003101101161160044253200001006004860048600486004860048

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7506

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)1e1f373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache writeback (a8)acbranch cond mispred nonspec (c5)cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
3200246004860900000001510262400131324000013240000653299571105600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100228112017211920600443157320000108117960048600486004860048
320024600474500000000164026240013132400001324000065329957111560028600476004719996032002624001320240000207200006004760047111600211091010160000100000010022841191721120206004431514320000106004860048600486004860048
3200246004745000000001390262400131324000013240000653299571115600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100228411917411619600443309320000106004860048600486004860048
320024600474500000000160026240013132400001324000065329957111560028815196004719996032002624001320240000207200006004760047111600211091010160000100000010022114119172111919600443157320000106004860107600486004860048
320024600474500000000107602624001313240000132400006532995711156002860047600471999603200262400132024000020720000600476004711160021109101016000010000001002284119172111919600443157320000106004860048600486004860048
3200246004745000000006290262400131324000013240000653299571115600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100228411917211619600443157320000106004860048600486004860048
3200246004745000000005820262400131324000013240000653299571115600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100228411917211198600443157320000106004860048600486004860048
3200246004744900000009580262400131324000013240000653299571115600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100228411917211198600443157320000106004860048600486004881663
3200246004745000000001580262400961324000013240000653299571115600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100248416172111020600445157320000106004860048600486004860048
320024600474500000000674579352624001313240000132400006532995711156002860047600471999603200262400132024000020720000600476004711160021109101016000010000001002211411917211198600443157320000106004860048600488164460048