Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

TBX (four register table, 16B)

Test 1: uops

Code:

  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 4.000

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
400480376000306176872540004000400011256800801880378037669737020400040001200080378037111001100073216117796400080388038803880388038
4004803760003126176872540004000400011256800801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600006176872540004000400011256801801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600006176872540004000400011256800801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600066176872540004000400011256801801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600006176872540004000400011256801801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
400480376000636176872540004000400011256801801880378037669737020400040001200080378037111001100073116117929400080388038803880388038
40048037601106176872540004000400011256801801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600006176872540004000400011256800801880378037669737020400040001200080378037111001100073116117796400080388038803880388038
40048037600006176872540004000400011256801801880378037669737020400040001200080378037111001100073116117796400080388038803880388038

Test 2: Latency 1->1

Code:

  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 08 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c2 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
402048003760000007267968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171279797400001008003880038800388003880038
402048003759900002517968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171179797400001008003880038800388003880038
40204800376000000617968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171179797400001008003880038800388003880038
4020480037599003906179687254010010040000100400005001145768008001808003780037696973700194010020040000200120000800378003711102011009910010010000100000520071001171179797400001008003880038800388003880038
402048003759900210617968725401001004000010040000500114576801800180800378003769697370019401002004000020012000080037800371110201100991001001000010000010071001171179797400001008003880038800388003880038
40204800375990060617968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171179797400001008003880038800388003880038
40204800376000000617968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171179858400001008003880038800388003880038
40204800376000000617968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110202100991001001000010000000071001171179797400001008003880038800388003880038
402048003759900006179687254010010040000100400005001145768008001808003780037696973700194010020040000200120000800378003711102011009910010010000100002340071001171179797400001008003880038800388003880038
40204800376000000617968725401001004000010040000500114576800800180800378003769697370019401002004000020012000080037800371110201100991001001000010000000071001171079797400001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | cd | cf | d5 | d6 | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
40024800376000003467968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038
4002480037599000617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780084111002110910101000010003064911717979740000108003880038800388003880038
4002480037600000617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880083
40024800375990001267968725400101040000104000050114576800800188017780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038
4002480037600000617968725400101040000104000050114582300800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038
4002480037600000617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038
4002480037599000617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038
4002480037599000617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010010064911717979740000108003880038800388003880038
4002480037599000617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000064911747979740000108003880038800388003880038
4002480037600000617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000064911717979740000108003880038800388003880038

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a7 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | icache miss (d3) | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60204100038749000061996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
60204100038749000061996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
60204100038749000061996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
602041000387490000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000204300200019100117119979860000100100039100039100039100039100039
602041000387490000726996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119986660000100100039100039100039100039100039
60204100038749000061996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
602041000387490000329996862650100100500001005000050014327648010001910003810008889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
602041000387500000726996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039
6020410003874940001610996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000008051000019100117119979860000100100039100039100039100039100039
60204100038749000061996862650100100500001005000050014327648010001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000000000019100117119979860000100100039100039100039100039100039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire (01) | cycle (02) | 03 | 08 | 09 | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002410003874900000001039968626500101050000105000050143276480100019010003810003889662390019500102050000201400001000381000382130021109101030000100000000000189911701997986000010100039100039100039100039100039
600241000387490000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100039100039100039100039
600241000387500000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100039100039100090100039
600241000387490000000619968626500101050000105000050143276480100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100039100039100039100039
600241000387490000000619968626500101050000105000050143276480100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100039100039100039100039
600241000387490000000619968626500101050024105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000010189911701997986000010100039100039100039100039100039
600241000387490000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100039100039100039100039
600241000387780000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000003000189911701997986000010100039100039100039100039100039
600241000387500000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000100010189911701997986000010100039100039100039100039100039
600241000387490000000619968626500101050000105000050143276481100019010003810003889696390019500102050000201400001000381000381130021109101030000100000000000189911701997986000010100039100090100039100039100039

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0040

retire (01) | cycle (02) | 03 | 08 | 0a | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602048004360000000009479682265010110050000100500005001144744708002108004380040697043700215010020050000200140000800408004011302011009910010030000100002000001191011711798210600001008004180041800418004180041
602048004060000000006179682265010010050000100500005001144744718002108004380040697043700215010020050000200140000800408004011302011009910010030000100000000000191011711798210600001008004180041800418004180044
602048004359900000006179686265010310050003100500005001144744708002108004080040697043700215010020050000200140000800438004311302011009910010030000100000000000191011711798210600001008004180041800418004180041
60204800435990000000245079605725029810450250104509455331145417908002108004380040697073700215010020050000200140000800438004311302011009910010030000100000000000191011711798210600001008004180041800418004480041
602048004059900000016179682265010010050000100500005001144793818002108004080040697043700245010020050000200140000800408004011302011009910010030000100000000000191011711798240600001008004180041800418004480044
602048004059900000006179682265010010050000100500005001144744708002108004080043697043700215010020050000200140000800408004051302011009910010030000100000000000191011711798210600001008004180041800418004480041
6020480040600000000072679682265010010050003100500005001144744718002108004380040697043700215010020050000200140000800438004311302011009910010030000100000000000191011711798210600001008004180041800418004180041
602048004060000000006179686265010010050003100500005001144744718002108004380040697043700215010020050000200140000800408004011302011009910010030000100000000000191011711798240600001008004180041800448004180041
602048004060000000006179682265010010050000100500005001144744718002408004080043697043700215010020050000200140000800408004311302011009910010030000100000000000191011711798210600001008004180041800418004180041
602048004360000000016179685265010010050000100500005001144744718002108004080040697043700215010020050000200140000800408004011302011009910010030000100000000010191011711798240600001008004180041800418004180041

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0040

retire (01) | cycle (02) | 03 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c2 | cf | d0 | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
600248004060000617968226500101050001105000050114474470180021080043800406970403700215001020500002014000080040800401130021109101030000100000001899061700046798210060000108004180041800418004180041
600248004060002617968226500101050000105000050114474470180021080040800406970403700215001020500002014000080040800401130021109101030000100000001899071700067798210060000108004180044800418004480044
600248004059901617968226500101050000105000050114474470080021080040800406970703700215001020500002014000080040800401130021109101030000100000001899051700045798210060000108004180041800418004180041
600248004059900617968226500101050000105000050114474470080024080040800406970403700215001020500002014000080040800401130021109101030000100000001899061700056798210060000108004180041800418004180041
60024800406005851617968226500121050001105000050114474580080021080040800406971803700215001020500002014000080040800401130021109101030000100000001899071700057798210060000108004180041800418004480041
600248004060000617968226500111050000105000050114474470180021080040800406970403700215001020500002014000080040800401130021109101030000100000001899071700045798210060000108004180044800418004480041
6002480040600017267968226500121050002105000050114474470180024080040800406970403700215001020500002014000080040800431130021109101030000100003001899071700057798210060000108004480041800448004180041
6002480040599007267968526500101050000105000050114474470180021080040800406970403700215001020500002014000080043800401130021109101030000100000001899071700064798210060000108004180041800418004180041
600248004059900617968226500111050000105000050114474470180021080040800406970403700215001020500002014000080040800401130021109101030000100000001899071700038798210060000108004180044800418004480041
600248004062000617968226500131050001105000050114474470180021080040800406970403700215001020500002014000080040800401130021109101030000100000001899061700056798210060000108004180041800418004180041

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0042

retire (01) | cycle (02) | 03 | 18 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60204600424490006159679265010110050000100500005008565351160035060054600424971835002650100200500002001400006004860042113020110099100100300001000000191011711598650600001006004360055600436005560043
60204600424500026159686265010410050004100500005008564497160023060042600544972735003550100200500002001400006005460042113020110099100100300001009000191011711598650600001006005560043600556004360055
60204600544500906159679265010410050004100500005008564469160023060048600484971535002350100200500002001400006005460042113020110099100100300001000000191011711598650600001006004360055600436005560043
60204600424500046159686265010410050004100500005008564469160023060042600484971535002350100200500002001400006004260054113020110099100100300001000000191011711598590600001006005560043600556004360055
60204600544490016659686265010410050004100500005008564497060035060054600424971535002950100200500002001400006004260054113020110099100100300001000000191011711598650600001006004360055600436005560043
60204600544490006159686265010410050004100500005008564497160023060054600424971535002350100200500002001400006004260054113020110099100100300001000000191011711598590600001006005560043601106004360043
60204600424500046159679265010110050000100500005008565609160035060054600424971535002350100200500002001400006004260054113020110099100100300001000000191011711598650600001006004960043600496005560043
60204600424500016159686265010410050004100500005008564801060023060054600424971535002350848200500002001400006005460042113020110099100100300001000000191011711598530600001006005560043600556004660055
60204600544500026159679265010410050000100500005008565609160035060054600424971535002350100200500002001400006004260054113020110099100100300001001000191011711598650600001006005560043600556004360043
602046004245000072659686265010410050004100500005008564469160023060042600544972735003550100200500002001400006005460042113020110099100100300001000000191011711598530600001006005560043600556004360055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0045

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | ac | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
600246004545000000615967926500111050001105000050856446916002660042600544971535002950010205000020140000600486004211300211091010300001000018996173359853060000106004360043600436004360055
600246011846611190615967926500141050000105000050856446906002960048600544972735003550010205000020140000600426005411300211091010300001000018994172459865060000106004360055600436005560043
6002460045450000006159679265001010500021050000508565609160035600546004249715350023500102050000201400006004260042113002110910103000010000189931734598531960000106004660043600496004960049
600246004245000000615968126500101050001105000050856523316002660042600544971535002350010205000020140000600426005411300211091010300001000018994173459853060000106004960043600436004360043
6002460042449000150615967926500101050000105000050856560916003560048600424971535002350010205000020140000600486004211300211091010300001000118994173359865060000106004360055600436005560043
600246004245000000615968626500141050004105000050856446916002960048600424971535002350010205000020140000600486004811300211091010300001000018994173459859060000106004360043600436004960043
600246004245000000615968626500141050004105000050856446916002960048600544972735003550010205000020140000600426004211300211091010300001000018994172459853060000106005560099600496004960046
600246004245000002615967926500101050000105000050856449706002960042600484970035002950010205000020140000600486004811300211091010300001000018994172359859060000106004960055600436004360043
600246004244900104615968626500141050004105000050856446916002360042600544972735003550010205000020140000600426004211300211091010300001000018994172459865060000106005560043600556004360055
600246005444900000615968626500141050004105000050856446916002360042600544972735003550010205000020140000600426004811300211091010300001000018994173359853060000106005560043600556004660055

Test 6: Latency 1->5

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v4.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0731

retire (01) | cycle (02) | 03 | 0b | 19 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | ac | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602044065930500033961403312650500100503391005000050057751331408024073740704303940330613501002005000020014000040791406501130201100991001003000010000000019101171140662600001004066340729408014061840723
602044078230500035461402942650433100503411005000050057834551406434072840800303640330868501002005000020014000040731407011130201100991001003000010000000019101171140738600001004078640624407024063340864
602044060230500029061402882650461100504081005000050058003231407484064740659303900330670501002005000020014000040608407521130201100991001003000010000000019101171140594600001004078940663408254082240768
602044064730500038661402882650389100503201005000050057903121407784064740788305110330844501002005000020014000040659406771130201100991001003000010000000019101171140524600001004070240693406034075040708
6020440662306000457251402862650407100503491005000050057846571406764070140692304370330721501002005000020014000040797407311130201100991001003000010000000019100171140512600001004086740702406934069640798
60204406863050012357103404332650461100503881005000050057729581407784078840662304580330619501002005000020014000040809407731130201100991001003000010010000019101171140638600001004082540810407744085540798
602044064130400036161403382650528100503021005000050057808961408054082140743304120330730501002005000020014000040797407311130201100991001003000010000000019101171140638600001004074140798407324072340678
602044078530500036161404262650497100503761005000050057766761407784067140887305170330679501002005000020014000040740407851130201100991001003000010000000019101171140638600001004067840750407234062740888
602044077930500034161402422650454100503391005000050057890331405954073740737304840330703501002005000020014000040776406321130201100991001003000010000000019101171140662600001004073540741407984073240822
602044073130500043461403692650515100503931005000050057588991407184063240797304660330778501002005000020014000040611406771130201100991001003000010000000019101171140674600001004069340603407504088240690

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0104

retire (01) | cycle (02) | 03 | 19 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
600244069530000186139694265003310500311050000505690882040076040083401312977633012750010205000020140000402364010411300211090101030000100031899417233997160000104009940078400814006340075
600244007430000176139685265002810500271050000505687641040076040194401942980333005850010205000020140000400864009211300211090101030000100001899317234000760000104021940093401744008740199
600244009230000196139694265003010500181050000505689834140067040131400712977333007050010205000020140000400684008911300211090101030000100001899217233997260000104006940210400724024640078
600244008630100206139791265002110500201050000505686276040070040083401312977633007950010205000020140000401074020011300211090101030000100001899317233999860000104019240060402644008140084
600244024530000146139709265177310518141050000505688506140091040101401012977033008850010205000020140000400984008611300211090101030000100001899317233995360000104009640099401114009040084
600244011630000156139707265002710500301050000505696628140100040107400832976733005550010205000020140000401104008911300211090101030000100001899317224007160000104011440105401084008440069
600244006830100246139659265003910500211050000505694789140064040113402542980333005550010205000020140000400864013111300211090101030000100001899217133996060000104014140075401174008740081
6002440125300002272639707265003310500211050000505694789140094040089400922978833007050010205000020140000402004011611300211090101030000100001899317233996060000104007840087401174014440090
600244009530000236139724265003010500141050000505686272140058040086401042977033007050010205000020140000400954006811300211090101030000100001899317233996360000104011440177400694021040099
600244011330200196139688495002310500361050000505691546140070040098400832987633009750010205000020140000400924007111300211090101030000100001899317233997260000104008740090401054008740204

Test 7: Latency 1->6

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.16b
  add v5.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire (01) | cycle (02) | 03 | 08 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60204100038749000944996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171199798260000100100039100039100039100039100039
60204100038749000300996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171199798060000100100039100039100039100039100039
602041000387490005899968626501001005000010050000500143276481100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000143000020292751199912160000100100039100039100039100039100039
6020410003875006061996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171099798060000100100039100039100039100039100039
60204100038750000212996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171199798060000100100039100039100039100039100039
6020410003874900121135996862650100100500001005000050014327874110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100130019101171199798060000100100039100039100039100039100039
6020410003874900122028996862650100100500001005000050014327648110001910003810003889696890019501002005000020014092010003810003811302011009910010030000100030019101171199798060000100100039100039100039100039100039
60204100038750001224769968626501001005000010050000500143276481100019100038100038896963900195010020450000200140000100038100038113020110099100100300001000000191011711100035060000100100039100039100039100039100039
60204100038749000780996862650114100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171199798060000100100039100039100039100039100039
60204100038750000822996862650100100500001005000050014327648110001910003810003889696390019501002005000020014000010003810003811302011009910010030000100000019101171199798060000100100039100039100039100039100039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire (01)cycle (02)0308090a0b18191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a6a8a9accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
600241000387490000000076899686265001010500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001000000189961725997986000010100039100039100039100039100039
600241000387490000003006199686265001010500001050000501432764810100019100038100038896963900195001020500002014184210003810003811300211091010300001000400189951745997986000010100039100039100039100039100039
60024100038749000001006199642265001010500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001000000189951725997986000010100039100039100039100039100039
60024100038749000100006199686265001010500001050000501432764810100163100038100038896963900195001020500002014000010003810003811300211091010300001000003189961756997986000010100039100039100039100039100039
60024100038750000000006199686265001010500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001000000189951725997986000010100039100039100039100039100039
60024100038749000100006199686265005810500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001000300189951743997986000010100039100039100039100039100039
6002410003874900000057306199686265001010500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001000000189935045999446000010100039100039100039100039100039
600241000387500000000067399686265001010500001050000501432764810100019100038100038896963900195001020500002014000010003810003811300211091010300001020006132189931743997986000010100039100039100039100039100089
60024100038749000040006199686265001010500001050000501432764811100019100134100086896963900195001020500002014000010003810003811300211091010300001000003189931725997986000010100039100039100039100039100039
600241000387490000000061996862650010105000010500005014327648101000191000381000388969625900195001020500002014000010003810003811300211091010300001000000189955143997986000010100039100039100039100039100039

Test 8: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v1.16b, 0
  tbx v1.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v2.16b, 0
  tbx v2.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v3.16b, 0
  tbx v3.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v4.16b, 0
  tbx v4.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v5.16b, 0
  tbx v5.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v6.16b, 0
  tbx v6.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v7.16b, 0
  tbx v7.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12
  movi v12.16b, 13

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire (01)cycle (02)0307080a18191e1f3a3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696b6d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a7a8a9acc2c5branch mispredict (cb)cdcfd0d5d6dadbddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
40020480071600101000009726320111100320011100320032500352024500801140802108004571532013220032003220096009680045800451116020110099100100160000100000000011110116021600228004204000001008004680046800468004680046
40020480045599101000013226320111100320011100320032500352024501800260800458004561532013220032003220096009680045800451116020110099100100160000100000000011110116021600438004204000001008004680046800468004680046
40020480045599101000015226320111100320011100320032500352024500800260800458004561432013220032003220096009680045800451116020110099100100160000100000000011110116021600228004204000001008004680046800468004680046
40020480045599101000013126320111100320011100320032500352024500800260800458004571432013220032003220096009680045800451116020110099100100160000100000000011110116021600228004204000001008004680046800468004680046
40020480045599101000013259320111100320011100320032500352024500800260801048004561532013220032003220096009680045800451116020110099100100160000100000000011110117021600228004204000001008010680046800468004680046
40020480045599101000013127320111100320011100320032500352024500800260800458004561532013220032003220096009680045800451116020110099100100160000100000003011110116021600228004204000001008004680046800468004680046
400204800456001010000169626320111100320011100320032500352024500800260800458004561532013220032003220096009680045800451116020110099100100160000100000000011110116021600228004204000001008004680046800468004680046
40020480045600101000013126320111100320011100320032500352024500800260800458004571532013220032003220096009680045800451116020110099100100160000100000000011110116021600228004204000001008004680046800468004680046
40020480100600101000013226320111100320011100320032500352024500800260800458004561532013220032003220096009680045800451116020110099100100160000100000000011110116021600448004204000001008004680046800468004680046
40020480045599101000013226320111100320011100320032500352024500800260800458004561432013220032003220096009680045800451116020110099100100160000100000000011110117021600228004204000001008004680046800468004680046

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire (01)cycle (02)0318191e1f3a3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696b6d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a8acc2branch mispredict (cb)cdcfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0eaeb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
4000248005860000002376263200101032000010320000503520000118002608004580045326320010203200002096000080045800451116002110910101600001000000001002142119172116680042152400000108004680046800468004680046
40002480045599000014926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213115172115680042152400000108004680046800468004680046
40002480045600000014926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213115173116580042152400000108004680046800468004680046
40002480045600000024926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213219172218780042152400000108004680046800468004680046
40002480045600000024926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100233117172118880042152400000108004680046800468004680046
40002480045599000026026320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213117172116780042302400000108004680046800468004680046
40002480045600000034926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213116172128580042152400000108004680046800468004680046
40002480045600000014926320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213126172215780042152400000108004680046801198004680046
400024800456000000127426320010103200761032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100000000100213115172128680042152400000108004680046800468004680046
40002480045599000015526320010103200001032000050352000011800260800458004532632001020320000209600008004580045111600211091010160000100100000100213116172116580042152400000108004680046800468004680046