Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (four register table, 8B)

Test 1: uops

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 4.000

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
40048037600001037687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038
4004803760000617687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038
4004803760000617687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038
4004803760009617676254000400040001125680080188037803766973702040004000120008037803711100110000073225227796400080388038803880388038
40048037600001037687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038
40048037600008276876440244024430411295290809081338181666511701644124492129848037803711100110000073216227796400080388038803880388038
4004803760000847687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038
40048037600002337687254000400040001125680080188037803766973702040004000120008037803711100110000373216227796400080388038803880388038
40048037600006176872540004000400011256800801880378037669737020400040001200080378037111001100009373216227796400080388038803880388038
40048037600004417687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038

Test 2: Latency 1->1

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40204800376000000900536796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071031711797970400001008003880038800388003880038
402048003760000000001610796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071011711797970400001008003880038800388003880038
4020480037599000000061796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000060000071011711797970400001008003880038800388003880038
40204800375990000000631796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071021711797970400001008003880038800388003880038
40204800376000000000251796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037802291110201100991001001000010000000000071011711797970400001008003880038800388003880038
40204800376000000000617968725401001004000010040000500114576801800188003780037696973700194010020040000200120000800378003711102011009910010010000100000001200071011711797970400001008003880038800388003880038
4020480037599000000061796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071011711797970400001008003880038800388003880038
4020480037599000000061796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071021711797970400001008003880038800388003880038
4020480037599000000061796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071011711797970400001008003880038800388003880038
4020480037600000000061796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037802281110201100991001001000010000000000071011711797970400001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400248003759900617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000649117017979740000108003880038800388003880038
400248003760000617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010013649317017979740000108003880038800388003880038
4002480037599001087968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000649317017979740000108003880038800388003880038
400248003759900617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780228111002110910101000010000669217017979740000108003880038800388003880038
400248003759900617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000649317017979740000108003880038800388003880038
4002480037599001247968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000649217017979740000108003880038800388003880038
400248003759900617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000649317017979740000108003880038800388003880038
400248003759900617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000649217017979740000108003880038800388003880038
400248003759900617968725400101040000104000050114576801800188003780037696970370019400102040000201200008023080037111002110910101000010002015649417017979740000108003880038800388003880038
400248003760000617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000649217017979740000108003880038800388003880038

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
60204100038749000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
602041000387490000907269968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038749000000619968626501121205021612050152500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038749000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038749000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038750000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100089100039100039100039100039
60204100038750000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038749000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038749000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038750000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
60024100038749010000024006199686265001010500121050000501432764800100019100038100038896963900195001020500002014000010008710003811300211091010300001000200001899081705699798006000010100039100039100039100039100039
6002410003874900010001890010399686265001010500001050000501432764801100055100038100038896963900195001020500002014000010003810003811300211091010300001000000001899061704699798006000010100039100039100039100039100039
6002410003874900000000006199686265001010500001050000501432764800100019100038100038896963900195001020500002014000010003810003811300211091010300001000000001899061705699798006000010100039100039100039100039100039
6002410003874900000010006199686265001010500121050000501432764801100019100038100038896963900195001020500002014000010003810003811300211091010300001000000001899061704599798006000010100039100039100039100039100039
6002410003874900000000006199686265001010500001050000501432764801100019100038100038896969900195001020500002014000010003810003811300211091010300001000000301917051706999798006000010100039100039100039100039100039
6002410003874900000001650019199686265001010500001050000501432764800100019100038100038896963900195001020500002014000010003810003811300211091010300001000000001899041707699798006000010100039100039100039100039100039
6002410003875000000000006199686265001010500001050000501432764800100019100038100038896963900195001020500002014000010003810003811300211091010300001000010301899061704699798006000010100039100039100039100039100039
6002410003874900000000006199686265001010500001050000501432764800100019100038100038896963900195001020500002014000010003810003811300211091010300001002000001899061705699798006000010100039100039100039100039100039
600241000387490000000120053699686265001010500001050000501432764800100019100038100038896963900195001020501652014000010003810003811300211091010300001000200001899061704699798006000010100039100039100039100039100039
60024100038750000000000018999686265001010500001050000501432764801100055100038100038896963900195001020500002014000010003810003811300211091010300001000000321922071705499798006000010100039100039100039100039100039

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0040

retire uop (01) | cycle (02) | 03 | 19 | 1e | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
60204800405990020617968226501001005000010050000500114474471800218004080040697187670017501002005000920014002680040800401130201100991001003000010000000111191701600798900600001008004180044800448004180041
60204800405990010617968226501011005000110050000500114474471800218004080040697180670020501002005000920014002680040800401130201100991001003000010000000111191601600798450600001008004180044800418004480041
60204800405990010617966226501011005000010050000500114474471800218009580040697180670017501002005000920014002680040800401130201100991001003000010000000111191601600798360600001008004180041800448004180041
6020480040599000028627968226501011005000110050000500114474471800218004080040697180770016501002005000920014002680040800401130201100991001003000010000000111191601600798350600001008004180041800418004180041
60204800406000000617969626501001005000110050000500114474471800218004080040697180670017501002005000920014002680040800401130201100991001003000010000000111191601600798350600001008032380041800418004180041
60204800405990000617968226501031005000010050000500114474471800218004080040697180670017501002005000920014002680040800401130201100991001003000010000000111191701600798350600001008004180041800418004180041
60204800436000010617968226501001005000010050000500114474471800218004080040697180670017501002005000920014002680040800401130201100991001003000010000000111191601600798360600001008004180041800418004180041
60204800405990000617968226501001005000010050186500114474471800218004080040697180670017501002005000920014002680040800401130201100991001003000010000000111191601600798350600001008004480041800418004180096
60204800926000000617968226501011005000110050000500114474471800218004080040697180670017501002005000920014002680040800401130201100991001003000010000000111191701600798350600001008004180041800418004180041
60204800405990000617968226501001005000010050000500114474471800218004080040697000770016501002005000920014002680040800401130201100991001003000010000000111191601600798360600001008004180041800418004180041

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0040

retire uop (01) | cycle (02) | 03 | 18 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6002480040600000261796822650010105000010500005011447447008002180040800406970437002150010205000020140000800408004011300211091010300001090018992177479894060000108004180041800418009480041
600248004059900026179682265001010500531050000501144744700800218004080040696563700215019920500002014000080198802512130021109101030000100018994174579821060000108004180044800418004180041
6002480040599000161796822650010105000010500005011447447008002180040800406970737002150010205000020140000800408004011300211091010300001072018992173479821060000108004180041800418004180041
6002480040600000261796822650011105000010500005011447447008002180040800406970437002150010205000020140000800408004011300211091010300001074018994173479821060000108004180041800418004180041
600248004060000132061796852650011105000010500005011447447008002180040800406970437002150010205000020140000800408004011300221091010300001036018992171579821060000108004180041800418004180041
6002480040599000161796822650011105000010500005011447882008002180040800406970437002450010205000020140000800408004011300211091010300001068018994173579821060000108004180041800448004180044
6002480040599000061796824950010105000110500005011447447018002180043800406970437002150010205000020140000800408004011300211091010300001075018994172479821060000108004180041800418004180041
6002480040599000061796822650010105000010500005011447447018002180040800406970437002150010205000020140000800408004011300211091010300001085018993173479821060000108004180041800418004180041
6002480040599000061796822650011105000010500005011447447018002180040800406970437002150010205000020140000800408004011300211091010300001072018994172579821060000108004180041800418004180041
600248004359900006179682265001010500001050000501144744701800218004080040697043700215001020500002014000080043800401130021109101030000100018993172479821060000108004480041800418004180041

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0042

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
60204600424500000000615970326501001005000110050000500856446916002306004860054497273500235010020050000200140000600426005411302011009910010030000100000019101171159853600001006005560043600556004360055
60204600544500000000825968626501041005000410050000500856446916002306004260054497273500355010020050000200140000600546004211302011009910010030000100000019101171159853600001006004360043600496004960043
602046004244900025513220615970326501011005000110050000500856535116002306004860042497153500355010020050000200140000600546004211302011009910010030000100000019101171159859600001006004360043600496005560043
60204600424500000000615968626501041005000410050000500856449706002306004860048497153500295010020050000200140000600546004211302011009910010030000100000019101171159853600001006004360049600436004360055
60204600424500000020615967926501001005000010050000500856446916002906004860054497273500355010020050000200140000600546004211302011009910010030000100000019101171159853600001006005560049600436004960043
60204600544500000020615967926501041005000410050000500856446916002306009660042497213500355010020050000200140000600546004211302011009910010030000100000019101171159859600001006005560043600556004360055
602046005444900000207265967926501011005000110050000500856560916003506005460042497153500235010020050000200140000600426005411302011009910010030000100000019101171159865600001006005560043600436004360049
60204600544500000020615967926501001005000010050000500856560906003536004260042497213500235010020050000200140000600426005411302011009910010030000100000019101171159865600001006004360055600436005560043
60204600424500000040615968626501001005000110050000500856446916002306004260054497273500355010020050000200140000600426004211302011009910010030000100000019101171159853600001006004360049600496004360043
60204600424500000030615967926501001005000010050000500856560916003506005460042497153500235010020050000200140000600426005411302011009910010030000100000019101171159853600001006005560043600556004360055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0042

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
600246004244900000014075968626500131050002105000050856446901600350600426004849704350023500102050000201400006004260048113002110910103000010000018991170015986500060000106004360049600496004360049
60024600424500019046375967926500101050000105000050856560901600230600546005449715350023500102050000201400006004260054113002110910103000010000018991170015985300060000106004960043600436004360055
600246005444900000021485967926500111050000105000050856535101600230600426005449727350035500102050000201400006005460042113002110910103000010000018991170015985300060000106005560043600556004360055
600246005445000000018415970326500141050004105000050856446901600230600426005449727350035500102050000201400006004260048113002110910103000010000018994170015985300060000106004360055600436005560043
600246004245000000419865967926500101050002105000050856535100600230600426004849721350023500102050000201400006004260054113002110910103000010000018991170015986500060000106005560043600556004360055
6002460054449000000190959679265001010500001050000508565609006003506005460042497153050023500102050000201400006004860048113002110910103000010000018991170015985900060000106004960049600496004360043
600246004244900000019515970326500101050002105000050856446901600230600426005449727350023500102050000201400006005460042113002110910103000010000018991170015986500060000106004360055600436005560043
6002460042450000004615964126500101050002105000050856560911600350600546004249715350029500102050000201400006004260048113002110910103000010000018991170015985300060000106005560046600556004360055
60024600544490000046555968626500141050004105000050856446900600230600426005449727350035500102050000201400006005460042113002110910103000010000018991170015985300060000106005560043600436005560043
60024600424500000022565968626500141050004105000050856446900600230600426005449727350035500102050000201400006005460042113002110910103000010000018992170015985300060000106004360055600436004360043

Test 6: Latency 1->5

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v4.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0098

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
602044006530000000179061396752651902100500041005000050056896750400884021840065297653300615010020050000200140000400864012211302011009910010030000100000000191011711399270600001004011140081400874013240105
6020440110300000001761397252651824100517181005000050056884210400494012240125297973300735010020050000200140000401044006811302011009910010030000100000000191011711399570600001004012340069400814010540303
6020440089300000001561397082650115100500231005000050056927010400614008340065297763300975010020050000200140000401134013411302011009910010030000100000000191011711399800600001004011440102401024010840105
60204400683000000015103397182650132100500241005000050056892970400794008040098297613300885010020050000200140000400864008611302011009910010030000100000000191011711402950600001004011140111401024011140114
602044009830000000179089397052650116100500201005000050056918750400644021840089297883300705010020050000200140000400624009811302011009910010030000100000000191011711399360600001004008440216401474018340093
6020440206300000001561396942650114100500341005000050056862160400584009840098297853300855010020050000200140000400924026611302011009910010030000100000000191011711399480600001004008440099402434010840108
6020440101301000002261396902650121100500191005000050057111550400884022740095298003300675010020050000200140000400864011311302011009910010030000100000000191011711399630600001004009040129400604008140126
602044023630000000176461397032650116100500211005000050056897140400734010140110297583300675010020050000200140000400864008311302011009910010030000100000000191001711399870600001004009340084401024017740129
6020440080300000001561397152650116100500071005000050056875010402624011940173297833300885010020050000200140000402484011611302011009910010030000100000000191011711401020600001004022240233403364010240093
6020440215301000002461397032650126100518511005000050056884210400494010440104297793301605010020050000200140000400834005611302011009910010030000100000000191011711399480600001004008740063400994008740108

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0095

retire uop (01) | cycle (02) | 03 | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
60024400773000169003969826500251050025105000050568866514007940125401132979703300825001020500002014000040077400711130021109101030000100000189900061700056399510060000104009640099400814009340090
6002440077300016921893967826500411050011105000050568811614009440101400802980003300645001020500002014000040176402541130021109101030000100000189900061700036399690060000104012340087401204009940072
6002440080300017516139728265003810500191050000505691524040082400714008929803033008550010205000020140000400954011911300211091010300001000001899000121700067401280060000104012340108401264007540087
6002440095301023613968926500321050013105000050568943404009140119400832978203300705001020500002014000040107400741130021109101030000100000189900061700067399600060000104010840117401264009640255
60024400683010167263985626500351050027105000050570961514010040104401312976703300645001020500002014000040077400801130021109101030000100000189900071700037399510060000104014440108401084008440072
60024400803010311033985926518271050024105000050570762104008840095401042977003300765001020500002014000040086400861130021109101030000100000189900071700078399540060000104008140075401114009040180
6002440107300021613972126500321050014105000050568855804007640083401102978503301845001020500002014000040116401011130021109101030000100000189900091700075399480060000104020740078401144008740225
6002440080300018613974926500271050020105000050568532504011240254401132978203300915001020500002014000040104400981130021109101030000100000189900061700056399630060000104007240084402164007840087
6002440119301017613973626500261050019105000050569011304007640107401042976703301005001020500002014000040113401131130021109101030000100000189900071700057399750060000104006940096402464017940093
6002440101300019613968726500311051789105000050569222814007340110401012976403300795001020500002014000040110402031130021109101030000100000189900071700066399570060000104011140192400874013540087

Test 7: Latency 1->6

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v5.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6020410003875000000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000010000019101171199798060000100100039100039100039100039100039
60204100038749000000726996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000119101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969939001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
602041000387490000001135996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003875000000061996862650100100500001005000050014327648110001931000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
600241000387490006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041732997986000010100039100039100039100039100039
6002410003874900053699686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041734997986000010100039100039100039100039100039
600241000387490006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041734997986000010100039100039100039100039100039
600241000387490006199686265001010500001050000501432764801000191000381000388969639001950010205000020140460100038100233113002110910103000010000001899021734997986000010100039100039100039100039100039
6002410003875000072699686265001010500001050000501432764801000191000381000388969639001950010205000020140488100038100038113002110910103000010000001899041732997986000010100039100039100039100039100039
600241000387490006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899021732997986000010100039100039100039100039100039
600241000387490006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041714997986000010100039100039100039100039100039
6002410003875006072699686265001010500001050000501432764811000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041714997986000010100039100091100039100039100039
600241000387500006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899021734997986000010100039100039100039100039100039
600241000387500006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041732997986000010100039100039100039100039100039

Test 8: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v1.16b, 0
  tbx v1.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v2.16b, 0
  tbx v2.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v3.16b, 0
  tbx v3.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v4.16b, 0
  tbx v4.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v5.16b, 0
  tbx v5.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v6.16b, 0
  tbx v6.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v7.16b, 0
  tbx v7.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12
  movi v12.16b, 13

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40020480058600000001500322632011110032001110032003250035202451800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046
40020480045600000003930312632011110032001110032003250035202451800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046
4002048004560000000780322632011110032001110032003250035202450800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011711611800424000001008004680046800468004680046
40020480045599000003660312632011110032001110032003250035202451800268004580045614320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046
4002058004560000000450322632011110032001110032003250035202451800708015580045151532013220032003220096009680045800451116020110099100100160000100001449801111011611611800424000001008004680046800468004680046
40020480045599000004502222732011110032001110032003250035202451800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046
400204801566000101013207392732011110032001110032003250035202450800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011711611800424000001008004680046800468004680046
40020480045599000001080322632011110032001110032003250035202451800268004580045615320132200320032200960096800458004511160201100991001001600001000000041111011711611800424000001008004680046800468004680046
40020480045600000005130322732011110032001110032003250035202451800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046
40020480045599000002880312632011110032001110032003250035202451800268004580045614320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4000258005860010000492632001010320000103200005035200000180026080045800453263200102032000020960000800458004511160021109101016000010001002634215017121403980042166400000108004680046800468010180046
4000248004559910009552632001010320000103200005035200001180026380045800453263200102032000020960000800458004511160021109101016000010001002563204117121254180042313400000108004680046800468004680046
40002480045599000002412632001010320000103200005035200000080026080045800453263200102032000020960000800458004511160021109101016000010001002331106217121383880042163400000108004680046800468004680046
4000248004560000000492632001010320000103200005035200001180026080045800453263200102032000020960000800458004511160021109101016000010001002361102617311412380042163400000108004680046800468004680046
4000248004560000000492632001010320000103200005035200001080026080045800453263200102032000020960000800458004511160021109101016000010001002362104217111414180042163400000108004680046800468004680046
4000248004559900000492632001010320000103200005035200000180026080045800453263200102032000020960096800458004511160021109101016000010001002331102717111412880042213400000108004680046800468010280105
4000248004559900000492632001010320000103200005035200001180026080045800453263200102032000020960000800458004511160021109101016000010001002531104217111294280042213400000108004680046800468004680046
4000248004560000000492632001010320000103200005035200001180026080045800453263200102032000020960000800458004511160021109101016000010001002331104317111264180042213400000108004680046800468004680046
4000248004560000000492632001010320000103200005035200001180026080045800453263200102032000020960000800458004511160021109101016000010001002331104117111444580042213400000108004680046800468004680046
4000248004560000000492632001010320000103200005035200001180026080045800453263200102032000020960000800458004511160021109101016000010001002131104119111434380042213400000108010780046800468004680046