Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

TBX (four register table, 8B)

Test 1: uops

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 4.000

retire (01)cycle (02)030b181e3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)a8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
40048037600001037687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038
4004803760000617687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038
4004803760000617687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038
4004803760009617676254000400040001125680080188037803766973702040004000120008037803711100110000073225227796400080388038803880388038
40048037600001037687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038
40048037600008276876440244024430411295290809081338181666511701644124492129848037803711100110000073216227796400080388038803880388038
4004803760000847687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038
40048037600002337687254000400040001125680080188037803766973702040004000120008037803711100110000373216227796400080388038803880388038
40048037600006176872540004000400011256800801880378037669737020400040001200080378037111001100009373216227796400080388038803880388038
40048037600004417687254000400040001125680080188037803766973702040004000120008037803711100110000073216227796400080388038803880388038

Test 2: Latency 1->1

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire (01)cycle (02)03080b18191e1f3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2c5cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
40204800376000000900536796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071031711797970400001008003880038800388003880038
402048003760000000001610796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071011711797970400001008003880038800388003880038
4020480037599000000061796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000060000071011711797970400001008003880038800388003880038
40204800375990000000631796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071021711797970400001008003880038800388003880038
40204800376000000000251796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037802291110201100991001001000010000000000071011711797970400001008003880038800388003880038
40204800376000000000617968725401001004000010040000500114576801800188003780037696973700194010020040000200120000800378003711102011009910010010000100000001200071011711797970400001008003880038800388003880038
4020480037599000000061796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071011711797970400001008003880038800388003880038
4020480037599000000061796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071021711797970400001008003880038800388003880038
4020480037599000000061796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037800371110201100991001001000010000000000071011711797970400001008003880038800388003880038
4020480037600000000061796872540100100400001004000050011457680180018800378003769697370019401002004000020012000080037802281110201100991001001000010000000000071011711797970400001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire (01)cycle (02)031e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
400248003759900617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000649117017979740000108003880038800388003880038
400248003760000617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010013649317017979740000108003880038800388003880038
4002480037599001087968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000649317017979740000108003880038800388003880038
400248003759900617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780228111002110910101000010000669217017979740000108003880038800388003880038
400248003759900617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000649317017979740000108003880038800388003880038
4002480037599001247968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000649217017979740000108003880038800388003880038
400248003759900617968725400101040000104000050114576801800188003780037696970370019400102040000201200008003780037111002110910101000010000649317017979740000108003880038800388003880038
400248003759900617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000649217017979740000108003880038800388003880038
400248003759900617968725400101040000104000050114576801800188003780037696970370019400102040000201200008023080037111002110910101000010002015649417017979740000108003880038800388003880038
400248003760000617968725400101040000104000050114576800800188003780037696970370019400102040000201200008003780037111002110910101000010000649217017979740000108003880038800388003880038

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire (01)cycle (02)03080b18191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
60204100038749000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
602041000387490000907269968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038749000000619968626501121205021612050152500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038749000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038749000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038750000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100089100039100039100039100039
60204100038750000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038749000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038749000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039
60204100038750000000619968626501001005000010050000500143276480100019100038100038896963900195010020050000200140000100038100038113020110099100100300001000000000019101171199798060000100100039100039100039100039100039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire (01)cycle (02)030708090a0b18191e1f3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2cficache miss (d3)d5d6dbddinst fetch restart (de)e0? int output thing (e9)ea? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
60024100038749010000024006199686265001010500121050000501432764800100019100038100038896963900195001020500002014000010008710003811300211091010300001000200001899081705699798006000010100039100039100039100039100039
6002410003874900010001890010399686265001010500001050000501432764801100055100038100038896963900195001020500002014000010003810003811300211091010300001000000001899061704699798006000010100039100039100039100039100039
6002410003874900000000006199686265001010500001050000501432764800100019100038100038896963900195001020500002014000010003810003811300211091010300001000000001899061705699798006000010100039100039100039100039100039
6002410003874900000010006199686265001010500121050000501432764801100019100038100038896963900195001020500002014000010003810003811300211091010300001000000001899061704599798006000010100039100039100039100039100039
6002410003874900000000006199686265001010500001050000501432764801100019100038100038896969900195001020500002014000010003810003811300211091010300001000000301917051706999798006000010100039100039100039100039100039
6002410003874900000001650019199686265001010500001050000501432764800100019100038100038896963900195001020500002014000010003810003811300211091010300001000000001899041707699798006000010100039100039100039100039100039
6002410003875000000000006199686265001010500001050000501432764800100019100038100038896963900195001020500002014000010003810003811300211091010300001000010301899061704699798006000010100039100039100039100039100039
6002410003874900000000006199686265001010500001050000501432764800100019100038100038896963900195001020500002014000010003810003811300211091010300001002000001899061705699798006000010100039100039100039100039100039
600241000387490000000120053699686265001010500001050000501432764800100019100038100038896963900195001020501652014000010003810003811300211091010300001000200001899061704699798006000010100039100039100039100039100039
60024100038750000000000018999686265001010500001050000501432764801100055100038100038896963900195001020500002014000010003810003811300211091010300001000000321922071705499798006000010100039100039100039100039100039

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0040

retire (01)cycle (02)03191e373a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a6a8a9acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0ea? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
60204800405990020617968226501001005000010050000500114474471800218004080040697187670017501002005000920014002680040800401130201100991001003000010000000111191701600798900600001008004180044800448004180041
60204800405990010617968226501011005000110050000500114474471800218004080040697180670020501002005000920014002680040800401130201100991001003000010000000111191601600798450600001008004180044800418004480041
60204800405990010617966226501011005000010050000500114474471800218009580040697180670017501002005000920014002680040800401130201100991001003000010000000111191601600798360600001008004180041800448004180041
6020480040599000028627968226501011005000110050000500114474471800218004080040697180770016501002005000920014002680040800401130201100991001003000010000000111191601600798350600001008004180041800418004180041
60204800406000000617969626501001005000110050000500114474471800218004080040697180670017501002005000920014002680040800401130201100991001003000010000000111191601600798350600001008032380041800418004180041
60204800405990000617968226501031005000010050000500114474471800218004080040697180670017501002005000920014002680040800401130201100991001003000010000000111191701600798350600001008004180041800418004180041
60204800436000010617968226501001005000010050000500114474471800218004080040697180670017501002005000920014002680040800401130201100991001003000010000000111191601600798360600001008004180041800418004180041
60204800405990000617968226501001005000010050186500114474471800218004080040697180670017501002005000920014002680040800401130201100991001003000010000000111191601600798350600001008004480041800418004180096
60204800926000000617968226501011005000110050000500114474471800218004080040697180670017501002005000920014002680040800401130201100991001003000010000000111191701600798350600001008004180041800418004180041
60204800405990000617968226501001005000010050000500114474471800218004080040697000770016501002005000920014002680040800401130201100991001003000010000000111191601600798360600001008004180041800418004180041

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0040

retire (01)cycle (02)03181e1f373f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd5d6ddinst fetch restart (de)e0eb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
6002480040600000261796822650010105000010500005011447447008002180040800406970437002150010205000020140000800408004011300211091010300001090018992177479894060000108004180041800418009480041
600248004059900026179682265001010500531050000501144744700800218004080040696563700215019920500002014000080198802512130021109101030000100018994174579821060000108004180044800418004180041
6002480040599000161796822650010105000010500005011447447008002180040800406970737002150010205000020140000800408004011300211091010300001072018992173479821060000108004180041800418004180041
6002480040600000261796822650011105000010500005011447447008002180040800406970437002150010205000020140000800408004011300211091010300001074018994173479821060000108004180041800418004180041
600248004060000132061796852650011105000010500005011447447008002180040800406970437002150010205000020140000800408004011300221091010300001036018992171579821060000108004180041800418004180041
6002480040599000161796822650011105000010500005011447882008002180040800406970437002450010205000020140000800408004011300211091010300001068018994173579821060000108004180041800448004180044
6002480040599000061796824950010105000110500005011447447018002180043800406970437002150010205000020140000800408004011300211091010300001075018994172479821060000108004180041800418004180041
6002480040599000061796822650010105000010500005011447447018002180040800406970437002150010205000020140000800408004011300211091010300001085018993173479821060000108004180041800418004180041
6002480040599000061796822650011105000010500005011447447018002180040800406970437002150010205000020140000800408004011300211091010300001072018994172579821060000108004180041800418004180041
600248004359900006179682265001010500001050000501144744701800218004080040697043700215001020500002014000080043800401130021109101030000100018993172479821060000108004480041800418004180041

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0042

retire (01)cycle (02)030818191e1f373a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8acc5branch mispredict (cb)cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
60204600424500000000615970326501001005000110050000500856446916002306004860054497273500235010020050000200140000600426005411302011009910010030000100000019101171159853600001006005560043600556004360055
60204600544500000000825968626501041005000410050000500856446916002306004260054497273500355010020050000200140000600546004211302011009910010030000100000019101171159853600001006004360043600496004960043
602046004244900025513220615970326501011005000110050000500856535116002306004860042497153500355010020050000200140000600546004211302011009910010030000100000019101171159859600001006004360043600496005560043
60204600424500000000615968626501041005000410050000500856449706002306004860048497153500295010020050000200140000600546004211302011009910010030000100000019101171159853600001006004360049600436004360055
60204600424500000020615967926501001005000010050000500856446916002906004860054497273500355010020050000200140000600546004211302011009910010030000100000019101171159853600001006005560049600436004960043
60204600544500000020615967926501041005000410050000500856446916002306009660042497213500355010020050000200140000600546004211302011009910010030000100000019101171159859600001006005560043600556004360055
602046005444900000207265967926501011005000110050000500856560916003506005460042497153500235010020050000200140000600426005411302011009910010030000100000019101171159865600001006005560043600436004360049
60204600544500000020615967926501001005000010050000500856560906003536004260042497213500235010020050000200140000600426005411302011009910010030000100000019101171159865600001006004360055600436005560043
60204600424500000040615968626501001005000110050000500856446916002306004260054497273500355010020050000200140000600426004211302011009910010030000100000019101171159853600001006004360049600496004360043
60204600424500000030615967926501001005000010050000500856560916003506005460042497153500235010020050000200140000600426005411302011009910010030000100000019101171159853600001006005560043600556004360055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0042

retire (01)cycle (02)030b18191e1f373f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a8accfd5d6daddinst fetch restart (de)e0eaebec? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
600246004244900000014075968626500131050002105000050856446901600350600426004849704350023500102050000201400006004260048113002110910103000010000018991170015986500060000106004360049600496004360049
60024600424500019046375967926500101050000105000050856560901600230600546005449715350023500102050000201400006004260054113002110910103000010000018991170015985300060000106004960043600436004360055
600246005444900000021485967926500111050000105000050856535101600230600426005449727350035500102050000201400006005460042113002110910103000010000018991170015985300060000106005560043600556004360055
600246005445000000018415970326500141050004105000050856446901600230600426005449727350035500102050000201400006004260048113002110910103000010000018994170015985300060000106004360055600436005560043
600246004245000000419865967926500101050002105000050856535100600230600426004849721350023500102050000201400006004260054113002110910103000010000018991170015986500060000106005560043600556004360055
6002460054449000000190959679265001010500001050000508565609006003506005460042497153050023500102050000201400006004860048113002110910103000010000018991170015985900060000106004960049600496004360043
600246004244900000019515970326500101050002105000050856446901600230600426005449727350023500102050000201400006005460042113002110910103000010000018991170015986500060000106004360055600436005560043
6002460042450000004615964126500101050002105000050856560911600350600546004249715350029500102050000201400006004260048113002110910103000010000018991170015985300060000106005560046600556004360055
60024600544490000046555968626500141050004105000050856446900600230600426005449727350035500102050000201400006005460042113002110910103000010000018991170015985300060000106005560043600436005560043
60024600424500000022565968626500141050004105000050856446900600230600426005449727350035500102050000201400006005460042113002110910103000010000018992170015985300060000106004360055600436004360043

Test 6: Latency 1->5

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v4.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0098

retire (01)cycle (02)030818191e1f373f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a8a9acc2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
602044006530000000179061396752651902100500041005000050056896750400884021840065297653300615010020050000200140000400864012211302011009910010030000100000000191011711399270600001004011140081400874013240105
6020440110300000001761397252651824100517181005000050056884210400494012240125297973300735010020050000200140000401044006811302011009910010030000100000000191011711399570600001004012340069400814010540303
6020440089300000001561397082650115100500231005000050056927010400614008340065297763300975010020050000200140000401134013411302011009910010030000100000000191011711399800600001004011440102401024010840105
60204400683000000015103397182650132100500241005000050056892970400794008040098297613300885010020050000200140000400864008611302011009910010030000100000000191011711402950600001004011140111401024011140114
602044009830000000179089397052650116100500201005000050056918750400644021840089297883300705010020050000200140000400624009811302011009910010030000100000000191011711399360600001004008440216401474018340093
6020440206300000001561396942650114100500341005000050056862160400584009840098297853300855010020050000200140000400924026611302011009910010030000100000000191011711399480600001004008440099402434010840108
6020440101301000002261396902650121100500191005000050057111550400884022740095298003300675010020050000200140000400864011311302011009910010030000100000000191011711399630600001004009040129400604008140126
602044023630000000176461397032650116100500211005000050056897140400734010140110297583300675010020050000200140000400864008311302011009910010030000100000000191001711399870600001004009340084401024017740129
6020440080300000001561397152650116100500071005000050056875010402624011940173297833300885010020050000200140000402484011611302011009910010030000100000000191011711401020600001004022240233403364010240093
6020440215301000002461397032650126100518511005000050056884210400494010440104297793301605010020050000200140000400834005611302011009910010030000100000000191011711399480600001004008740063400994008740108

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0095

retire (01)cycle (02)031e373f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a8accdcfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0ebec? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
60024400773000169003969826500251050025105000050568866514007940125401132979703300825001020500002014000040077400711130021109101030000100000189900061700056399510060000104009640099400814009340090
6002440077300016921893967826500411050011105000050568811614009440101400802980003300645001020500002014000040176402541130021109101030000100000189900061700036399690060000104012340087401204009940072
6002440080300017516139728265003810500191050000505691524040082400714008929803033008550010205000020140000400954011911300211091010300001000001899000121700067401280060000104012340108401264007540087
6002440095301023613968926500321050013105000050568943404009140119400832978203300705001020500002014000040107400741130021109101030000100000189900061700067399600060000104010840117401264009640255
60024400683010167263985626500351050027105000050570961514010040104401312976703300645001020500002014000040077400801130021109101030000100000189900071700037399510060000104014440108401084008440072
60024400803010311033985926518271050024105000050570762104008840095401042977003300765001020500002014000040086400861130021109101030000100000189900071700078399540060000104008140075401114009040180
6002440107300021613972126500321050014105000050568855804007640083401102978503301845001020500002014000040116401011130021109101030000100000189900091700075399480060000104020740078401144008740225
6002440080300018613974926500271050020105000050568532504011240254401132978203300915001020500002014000040104400981130021109101030000100000189900061700056399630060000104007240084402164007840087
6002440119301017613973626500261050019105000050569011304007640107401042976703301005001020500002014000040113401131130021109101030000100000189900071700057399750060000104006940096402464017940093
6002440101300019613968726500311051789105000050569222814007340110401012976403300795001020500002014000040110402031130021109101030000100000189900071700066399570060000104011140192400874013540087

Test 7: Latency 1->6

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v5.8b
  add v5.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire (01)cycle (02)030818191e1f3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
6020410003875000000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000010000019101171199798060000100100039100039100039100039100039
60204100038749000000726996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000119101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969939001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
602041000387490000001135996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003874900000061996862650100100500001005000050014327648110001901000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039
6020410003875000000061996862650100100500001005000050014327648110001931000381000388969639001950100200500002001400001000381000381130201100991001003000010000000000019101171199798060000100100039100039100039100039100039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0038

retire (01) | cycle (02) | 03 | 0b | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | ac | branch mispredict (cb) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
600241000387490006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041732997986000010100039100039100039100039100039
6002410003874900053699686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041734997986000010100039100039100039100039100039
600241000387490006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041734997986000010100039100039100039100039100039
600241000387490006199686265001010500001050000501432764801000191000381000388969639001950010205000020140460100038100233113002110910103000010000001899021734997986000010100039100039100039100039100039
6002410003875000072699686265001010500001050000501432764801000191000381000388969639001950010205000020140488100038100038113002110910103000010000001899041732997986000010100039100039100039100039100039
600241000387490006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899021732997986000010100039100039100039100039100039
600241000387490006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041714997986000010100039100039100039100039100039
6002410003875006072699686265001010500001050000501432764811000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041714997986000010100039100091100039100039100039
600241000387500006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899021734997986000010100039100039100039100039100039
600241000387500006199686265001010500001050000501432764801000191000381000388969639001950010205000020140000100038100038113002110910103000010000001899041732997986000010100039100039100039100039100039

Test 8: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v1.16b, 0
  tbx v1.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v2.16b, 0
  tbx v2.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v3.16b, 0
  tbx v3.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v4.16b, 0
  tbx v4.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v5.16b, 0
  tbx v5.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v6.16b, 0
  tbx v6.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v7.16b, 0
  tbx v7.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12
  movi v12.16b, 13

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire (01) | cycle (02) | 03 | 07 | 09 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
40020480058600000001500322632011110032001110032003250035202451800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046
40020480045600000003930312632011110032001110032003250035202451800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046
4002048004560000000780322632011110032001110032003250035202450800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011711611800424000001008004680046800468004680046
40020480045599000003660312632011110032001110032003250035202451800268004580045614320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046
4002058004560000000450322632011110032001110032003250035202451800708015580045151532013220032003220096009680045800451116020110099100100160000100001449801111011611611800424000001008004680046800468004680046
40020480045599000004502222732011110032001110032003250035202451800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046
400204801566000101013207392732011110032001110032003250035202450800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011711611800424000001008004680046800468004680046
40020480045599000001080322632011110032001110032003250035202451800268004580045615320132200320032200960096800458004511160201100991001001600001000000041111011711611800424000001008004680046800468004680046
40020480045600000005130322732011110032001110032003250035202451800268004580045615320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046
40020480045599000002880312632011110032001110032003250035202451800268004580045614320132200320032200960096800458004511160201100991001001600001000000001111011611611800424000001008004680046800468004680046

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire (01) | cycle (02) | 03 | 04 | 0b | 18 | 19 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
4000258005860010000492632001010320000103200005035200000180026080045800453263200102032000020960000800458004511160021109101016000010001002634215017121403980042166400000108004680046800468010180046
4000248004559910009552632001010320000103200005035200001180026380045800453263200102032000020960000800458004511160021109101016000010001002563204117121254180042313400000108004680046800468004680046
40002480045599000002412632001010320000103200005035200000080026080045800453263200102032000020960000800458004511160021109101016000010001002331106217121383880042163400000108004680046800468004680046
4000248004560000000492632001010320000103200005035200001180026080045800453263200102032000020960000800458004511160021109101016000010001002361102617311412380042163400000108004680046800468004680046
4000248004560000000492632001010320000103200005035200001080026080045800453263200102032000020960000800458004511160021109101016000010001002362104217111414180042163400000108004680046800468004680046
4000248004559900000492632001010320000103200005035200000180026080045800453263200102032000020960096800458004511160021109101016000010001002331102717111412880042213400000108004680046800468010280105
4000248004559900000492632001010320000103200005035200001180026080045800453263200102032000020960000800458004511160021109101016000010001002531104217111294280042213400000108004680046800468004680046
4000248004560000000492632001010320000103200005035200001180026080045800453263200102032000020960000800458004511160021109101016000010001002331104317111264180042213400000108004680046800468004680046
4000248004560000000492632001010320000103200005035200001180026080045800453263200102032000020960000800458004511160021109101016000010001002331104117111444580042213400000108004680046800468004680046
4000248004560000000492632001010320000103200005035200001180026080045800453263200102032000020960000800458004511160021109101016000010001002131104119111434380042213400000108010780046800468004680046