Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (four register table, 16B)

Test 1: uops

Code:

  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 4.000

retire uop (01)cycle (02)0318191e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
400480376000306176872540004000400011256800801880378037669737020400040001200080378037111001100073216117796400080388038803880388038
4004803760003126176872540004000400011256800801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600006176872540004000400011256801801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600006176872540004000400011256800801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600066176872540004000400011256801801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600006176872540004000400011256801801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
400480376000636176872540004000400011256801801880378037669737020400040001200080378037111001100073116117929400080388038803880388038
40048037601106176872540004000400011256801801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600006176872540004000400011256800801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600006176872540004000400011256801801880378037669737020400040001200080378037111001100073116117796400080388038803880388038

Test 2: Latency 1->1

Code:

  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire uop (01)cycle (02)03mmu table walk data (08)191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)acc2cfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
402048003760000007267968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171279797400001008003880038800388003880038
402048003759900002517968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171179797400001008003880038800388003880038
40204800376000000617968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171179797400001008003880038800388003880038
4020480037599003906179687254010010040000100400005001145768008001808003780037696973700194010020040000200120000800378003711102011009910010010000100000520071001171179797400001008003880038800388003880038
402048003759900210617968725401001004000010040000500114576801800180800378003769697370019401002004000020012000080037800371110201100991001001000010000010071001171179797400001008003880038800388003880038
40204800375990060617968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171179797400001008003880038800388003880038
40204800376000000617968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171179858400001008003880038800388003880038
40204800376000000617968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110202100991001001000010000000071001171179797400001008003880038800388003880038
402048003759900006179687254010010040000100400005001145768008001808003780037696973700194010020040000200120000800378003711102011009910010010000100002340071001171179797400001008003880038800388003880038
40204800376000000617968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171079797400001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire uop (01)cycle (02)0318191e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)accdcfd5map dispatch bubble (d6)fetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
40024800376000003467968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038
4002480037599000617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780084111002110910101000010003064911717979740000108003880038800388003880038
4002480037600000617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880083
40024800375990001267968725400101040000104000050114576800800188017780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038
4002480037600000617968725400101040000104000050114582300800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038
4002480037600000617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038
4002480037599000617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038
4002480037599000617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010010064911717979740000108003880038800388003880038
4002480037599000617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000064911747979740000108003880038800388003880038
4002480037600000617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire uop (01)cycle (02)0318191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfl1i cache miss demand (d3)d5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
60204100038749000061996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
60204100038749000061996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
60204100038749000061996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
602041000387490000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000204300200019100117119979860000100100039100039100039100039100039
602041000387490000726996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119986660000100100039100039100039100039100039
60204100038749000061996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
602041000387490000329996862650100100500001005000050014327648010001910003810008889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
602041000387500000726996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
6020410003874940001610996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000008051000019100117119979860000100100039100039100039100039100039
60204100038749000061996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire uop (01)cycle (02)03mmu table walk data (08)0918191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
6002410003874900000001039968626500101050000105000050143276480100019010003810003889662390019500102050000201400001000381000382130021109101030000100000000000189911701997986000010100039100039100039100039100039
600241000387490000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100039100039100039100039
600241000387500000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100039100039100090100039
600241000387490000000619968626500101050000105000050143276480100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100039100039100039100039
600241000387490000000619968626500101050000105000050143276480100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100039100039100039100039
600241000387490000000619968626500101050024105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000010189911701997986000010100039100039100039100039100039
600241000387490000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100039100039100039100039
600241000387780000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000003000189911701997986000010100039100039100039100039100039
600241000387500000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000100010189911701997986000010100039100039100039100039100039
600241000387490000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100090100039100039100039

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0040

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss instruction (0a)18191e1f373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acc2branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
602048004360000000009479682265010110050000100500005001144744708002108004380040697043700215010020050000200140000800408004011302011009910010030000100002000001191011711798210600001008004180041800418004180041
602048004060000000006179682265010010050000100500005001144744718002108004380040697043700215010020050000200140000800408004011302011009910010030000100000000000191011711798210600001008004180041800418004180044
602048004359900000006179686265010310050003100500005001144744708002108004080040697043700215010020050000200140000800438004311302011009910010030000100000000000191011711798210600001008004180041800418004180041
60204800435990000000245079605725029810450250104509455331145417908002108004380040697073700215010020050000200140000800438004311302011009910010030000100000000000191011711798210600001008004180041800418004480041
602048004059900000016179682265010010050000100500005001144793818002108004080040697043700245010020050000200140000800408004011302011009910010030000100000000000191011711798240600001008004180041800418004480044
602048004059900000006179682265010010050000100500005001144744708002108004080043697043700215010020050000200140000800408004051302011009910010030000100000000000191011711798210600001008004180041800418004480041
6020480040600000000072679682265010010050003100500005001144744718002108004380040697043700215010020050000200140000800438004311302011009910010030000100000000000191011711798210600001008004180041800418004180041
602048004060000000006179686265010010050003100500005001144744718002108004380040697043700215010020050000200140000800408004011302011009910010030000100000000000191011711798240600001008004180041800448004180041
602048004060000000006179682265010010050000100500005001144744718002408004080043697043700215010020050000200140000800408004311302011009910010030000100000000000191011711798210600001008004180041800418004180041
602048004360000000016179685265010010050000100500005001144744718002108004080040697043700215010020050000200140000800408004011302011009910010030000100000000010191011711798240600001008004180041800418004180041

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0040

retire uop (01)cycle (02)031e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696b6d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)acc2cfd0d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaec? simd retires (ee)? int retires (ef)f5f6f7f8fd
600248004060000617968226500101050001105000050114474470180021080043800406970403700215001020500002014000080040800401130021109101030000100000001899061700046798210060000108004180041800418004180041
600248004060002617968226500101050000105000050114474470180021080040800406970403700215001020500002014000080040800401130021109101030000100000001899071700067798210060000108004180044800418004480044
600248004059901617968226500101050000105000050114474470080021080040800406970703700215001020500002014000080040800401130021109101030000100000001899051700045798210060000108004180041800418004180041
600248004059900617968226500101050000105000050114474470080024080040800406970403700215001020500002014000080040800401130021109101030000100000001899061700056798210060000108004180041800418004180041
60024800406005851617968226500121050001105000050114474580080021080040800406971803700215001020500002014000080040800401130021109101030000100000001899071700057798210060000108004180041800418004480041
600248004060000617968226500111050000105000050114474470180021080040800406970403700215001020500002014000080040800401130021109101030000100000001899071700045798210060000108004180044800418004480041
6002480040600017267968226500121050002105000050114474470180024080040800406970403700215001020500002014000080040800431130021109101030000100003001899071700057798210060000108004480041800448004180041
6002480040599007267968526500101050000105000050114474470180021080040800406970403700215001020500002014000080043800401130021109101030000100000001899071700064798210060000108004180041800418004180041
600248004059900617968226500111050000105000050114474470180021080040800406970403700215001020500002014000080040800401130021109101030000100000001899071700038798210060000108004180044800418004480041
600248004062000617968226500131050001105000050114474470180021080040800406970403700215001020500002014000080040800401130021109101030000100000001899061700056798210060000108004180041800418004180041

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0042

retire uop (01)cycle (02)03181e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)a9acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
60204600424490006159679265010110050000100500005008565351160035060054600424971835002650100200500002001400006004860042113020110099100100300001000000191011711598650600001006004360055600436005560043
60204600424500026159686265010410050004100500005008564497160023060042600544972735003550100200500002001400006005460042113020110099100100300001009000191011711598650600001006005560043600556004360055
60204600544500906159679265010410050004100500005008564469160023060048600484971535002350100200500002001400006005460042113020110099100100300001000000191011711598650600001006004360055600436005560043
60204600424500046159686265010410050004100500005008564469160023060042600484971535002350100200500002001400006004260054113020110099100100300001000000191011711598590600001006005560043600556004360055
60204600544490016659686265010410050004100500005008564497060035060054600424971535002950100200500002001400006004260054113020110099100100300001000000191011711598650600001006004360055600436005560043
60204600544490006159686265010410050004100500005008564497160023060054600424971535002350100200500002001400006004260054113020110099100100300001000000191011711598590600001006005560043601106004360043
60204600424500046159679265010110050000100500005008565609160035060054600424971535002350100200500002001400006004260054113020110099100100300001000000191011711598650600001006004960043600496005560043
60204600424500016159686265010410050004100500005008564801060023060054600424971535002350848200500002001400006005460042113020110099100100300001000000191011711598530600001006005560043600556004660055
60204600544500026159679265010410050000100500005008565609160035060054600424971535002350100200500002001400006004260054113020110099100100300001001000191011711598650600001006005560043600556004360043
602046004245000072659686265010410050004100500005008564469160023060042600544972735003550100200500002001400006005460042113020110099100100300001000000191011711598530600001006005560043600556004360055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0045

retire uop (01)cycle (02)03mmu table walk data (08)09l2 tlb miss data (0b)1e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9facc2branch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0eb? simd retires (ee)? int retires (ef)f5f6f7f8fd
600246004545000000615967926500111050001105000050856446916002660042600544971535002950010205000020140000600486004211300211091010300001000018996173359853060000106004360043600436004360055
600246011846611190615967926500141050000105000050856446906002960048600544972735003550010205000020140000600426005411300211091010300001000018994172459865060000106004360055600436005560043
6002460045450000006159679265001010500021050000508565609160035600546004249715350023500102050000201400006004260042113002110910103000010000189931734598531960000106004660043600496004960049
600246004245000000615968126500101050001105000050856523316002660042600544971535002350010205000020140000600426005411300211091010300001000018994173459853060000106004960043600436004360043
6002460042449000150615967926500101050000105000050856560916003560048600424971535002350010205000020140000600486004211300211091010300001000118994173359865060000106004360055600436005560043
600246004245000000615968626500141050004105000050856446916002960048600424971535002350010205000020140000600486004811300211091010300001000018994173459859060000106004360043600436004960043
600246004245000000615968626500141050004105000050856446916002960048600544972735003550010205000020140000600426004211300211091010300001000018994172459853060000106005560099600496004960046
600246004245000002615967926500101050000105000050856449706002960042600484970035002950010205000020140000600486004811300211091010300001000018994172359859060000106004960055600436004360043
600246004244900104615968626500141050004105000050856446916002360042600544972735003550010205000020140000600426004211300211091010300001000018994172459865060000106005560043600556004360055
600246005444900000615968626500141050004105000050856446916002360042600544972735003550010205000020140000600426004811300211091010300001000018994173359853060000106005560043600556004660055

Test 6: Latency 1->5

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v4.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0731

retire uop (01)cycle (02)03l2 tlb miss data (0b)191e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)acc2cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
602044065930500033961403312650500100503391005000050057751331408024073740704303940330613501002005000020014000040791406501130201100991001003000010000000019101171140662600001004066340729408014061840723
602044078230500035461402942650433100503411005000050057834551406434072840800303640330868501002005000020014000040731407011130201100991001003000010000000019101171140738600001004078640624407024063340864
602044060230500029061402882650461100504081005000050058003231407484064740659303900330670501002005000020014000040608407521130201100991001003000010000000019101171140594600001004078940663408254082240768
602044064730500038661402882650389100503201005000050057903121407784064740788305110330844501002005000020014000040659406771130201100991001003000010000000019101171140524600001004070240693406034075040708
6020440662306000457251402862650407100503491005000050057846571406764070140692304370330721501002005000020014000040797407311130201100991001003000010000000019100171140512600001004086740702406934069640798
60204406863050012357103404332650461100503881005000050057729581407784078840662304580330619501002005000020014000040809407731130201100991001003000010010000019101171140638600001004082540810407744085540798
602044064130400036161403382650528100503021005000050057808961408054082140743304120330730501002005000020014000040797407311130201100991001003000010000000019101171140638600001004074140798407324072340678
602044078530500036161404262650497100503761005000050057766761407784067140887305170330679501002005000020014000040740407851130201100991001003000010000000019101171140638600001004067840750407234062740888
602044077930500034161402422650454100503391005000050057890331405954073740737304840330703501002005000020014000040776406321130201100991001003000010000000019101171140662600001004073540741407984073240822
602044073130500043461403692650515100503931005000050057588991407184063240797304660330778501002005000020014000040611406771130201100991001003000010000000019101171140674600001004069340603407504088240690

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0104

retire uop (01)cycle (02)03191e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)91inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
600244069530000186139694265003310500311050000505690882040076040083401312977633012750010205000020140000402364010411300211090101030000100031899417233997160000104009940078400814006340075
600244007430000176139685265002810500271050000505687641040076040194401942980333005850010205000020140000400864009211300211090101030000100001899317234000760000104021940093401744008740199
600244009230000196139694265003010500181050000505689834140067040131400712977333007050010205000020140000400684008911300211090101030000100001899217233997260000104006940210400724024640078
600244008630100206139791265002110500201050000505686276040070040083401312977633007950010205000020140000401074020011300211090101030000100001899317233999860000104019240060402644008140084
600244024530000146139709265177310518141050000505688506140091040101401012977033008850010205000020140000400984008611300211090101030000100001899317233995360000104009640099401114009040084
600244011630000156139707265002710500301050000505696628140100040107400832976733005550010205000020140000401104008911300211090101030000100001899317224007160000104011440105401084008440069
600244006830100246139659265003910500211050000505694789140064040113402542980333005550010205000020140000400864013111300211090101030000100001899217133996060000104014140075401174008740081
6002440125300002272639707265003310500211050000505694789140094040089400922978833007050010205000020140000402004011611300211090101030000100001899317233996060000104007840087401174014440090
600244009530000236139724265003010500141050000505686272140058040086401042977033007050010205000020140000400954006811300211090101030000100001899317233996360000104011440177400694021040099
600244011330200196139688495002310500361050000505691546140070040098400832987633009750010205000020140000400924007111300211090101030000100001899317233997260000104008740090401054008740204

Test 7: Latency 1->6

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v5.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire uop (01)cycle (02)03mmu table walk data (08)191e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acc2branch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
60204100038749000944996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171199798260000100100039100039100039100039100039
60204100038749000300996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171199798060000100100039100039100039100039100039
602041000387490005899968626501001005000010050000500143276481100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000143000020292751199912160000100100039100039100039100039100039
6020410003875006061996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171099798060000100100039100039100039100039100039
60204100038750000212996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171199798060000100100039100039100039100039100039
6020410003874900121135996862650100100500001005000050014327874110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100130019101171199798060000100100039100039100039100039100039
6020410003874900122028996862650100100500001005000050014327648110001910003810003889696890019501002005000020014092010003810003811302011009910010030000100030019101171199798060000100100039100039100039100039100039
60204100038750001224769968626501001005000010050000500143276481100019100038100038896963900195010020450000200140000100038100038113020110099100100300001000000191011711100035060000100100039100039100039100039100039
60204100038749000780996862650114100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171199798060000100100039100039100039100039100039
60204100038750000822996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171199798060000100100039100039100039100039100039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire uop (01)cycle (02)03mmu table walk data (08)09l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)ld unit uop (a6)l1d cache writeback (a8)a9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
600241000387490000000076899686265001010500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001000000189961725997986000010100039100039100039100039100039
600241000387490000003006199686265001010500001050000501432764810100019100038100038896963900195001020500002014184210003810003811300211091010300001000400189951745997986000010100039100039100039100039100039
60024100038749000001006199642265001010500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001000000189951725997986000010100039100039100039100039100039
60024100038749000100006199686265001010500001050000501432764810100163100038100038896963900195001020500002014000010003810003811300211091010300001000003189961756997986000010100039100039100039100039100039
60024100038750000000006199686265001010500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001000000189951725997986000010100039100039100039100039100039
60024100038749000100006199686265005810500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001000300189951743997986000010100039100039100039100039100039
6002410003874900000057306199686265001010500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001000000189935045999446000010100039100039100039100039100039
600241000387500000000067399686265001010500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001020006132189931743997986000010100039100039100039100039100089
60024100038749000040006199686265001010500001050000501432764811100019100134100086896963900195001020500002014000010003810003811300211091010300001000003189931725997986000010100039100039100039100039100039
600241000387490000000061996862650010105000010500005014327648101000191000381000388969625900195001020500002014000010003810003811300211091010300001000000189955143997986000010100039100039100039100039100039

Test 8: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v1.16b, 0
  tbx v1.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v2.16b, 0
  tbx v2.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v3.16b, 0
  tbx v3.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v4.16b, 0
  tbx v4.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v5.16b, 0
  tbx v5.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v6.16b, 0
  tbx v6.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v7.16b, 0
  tbx v7.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12
  movi v12.16b, 13

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss instruction (0a)18191e1f3a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696b6d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)st unit uop (a7)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd0d5map dispatch bubble (d6)dadbddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
40020480071600101000009726320111100320011100320032500352024500801140802108004571532013220032003220096009680045800451116020110099100100160000100000000011110116021600228004204000001008004680046800468004680046
40020480045599101000013226320111100320011100320032500352024501800260800458004561532013220032003220096009680045800451116020110099100100160000100000000011110116021600438004204000001008004680046800468004680046
40020480045599101000015226320111100320011100320032500352024500800260800458004561432013220032003220096009680045800451116020110099100100160000100000000011110116021600228004204000001008004680046800468004680046
40020480045599101000013126320111100320011100320032500352024500800260800458004571432013220032003220096009680045800451116020110099100100160000100000000011110116021600228004204000001008004680046800468004680046
40020480045599101000013259320111100320011100320032500352024500800260801048004561532013220032003220096009680045800451116020110099100100160000100000000011110117021600228004204000001008010680046800468004680046
40020480045599101000013127320111100320011100320032500352024500800260800458004561532013220032003220096009680045800451116020110099100100160000100000003011110116021600228004204000001008004680046800468004680046
400204800456001010000169626320111100320011100320032500352024500800260800458004561532013220032003220096009680045800451116020110099100100160000100000000011110116021600228004204000001008004680046800468004680046
40020480045600101000013126320111100320011100320032500352024500800260800458004571532013220032003220096009680045800451116020110099100100160000100000000011110116021600228004204000001008004680046800468004680046
40020480100600101000013226320111100320011100320032500352024500800260800458004561532013220032003220096009680045800451116020110099100100160000100000000011110116021600448004204000001008004680046800468004680046
40020480045599101000013226320111100320011100320032500352024500800260800458004561432013220032003220096009680045800451116020110099100100160000100000000011110117021600228004204000001008004680046800468004680046

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01)cycle (02)0318191e1f3a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696b6d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache writeback (a8)acc2branch mispred nonspec (cb)cdcfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
4000248005860000002376263200101032000010320000503520000118002608004580045326320010203200002096000080045800451116002110910101600001000000001002142119172116680042152400000108004680046800468004680046
40002480045599000014926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213115172115680042152400000108004680046800468004680046
40002480045600000014926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213115173116580042152400000108004680046800468004680046
40002480045600000024926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213219172218780042152400000108004680046800468004680046
40002480045600000024926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100233117172118880042152400000108004680046800468004680046
40002480045599000026026320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213117172116780042302400000108004680046800468004680046
40002480045600000034926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213116172128580042152400000108004680046800468004680046
40002480045600000014926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213126172215780042152400000108004680046801198004680046
400024800456000000127426320010103200761032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213115172128680042152400000108004680046800468004680046
40002480045599000015526320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100100000100213116172116580042152400000108004680046800468004680046