Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

USRA (vector, 8H)

Test 1: uops

Code:

  usra v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 04 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a1 | a6 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
100430372300000000282254825100010001000398313030183037303724153289510001000200030373037111001100000000077416442630100030383038303830383038
100430372410000000282254825100010001000398313030183037303724153289510001000200030373037111001100000000077416442630100030383038303830383038
100430372310000000282254825100010001000398313030183037303724153289510001000200030373037111001100000000077416442630100030383038303830383038
100430372410000000282254825100010001000398313030183037303724153289510001000200030373037111001100000000077416442630100030383038303830383038
100430372310000000282254825100010001000398313030183037303724153289510001000200030373037111001100000000077416442630100030383038303830383038
100430372410000000282254825100010001000398313030183037303724153289510001000200030373037111001100000000077416442630100030383038303830383038
100430372410000000282254825100010001000398313030183037303724153289510001000200030373037111001100000000077416442630100030383038303830383038
1004303723100000002822548251000100010003983130301830373037241532895100010002000303730371110011000000084077416442630100030383038303830383038
1004303723100000002124254825100010001000398313030183037303724153289510001000200030373037111001100000000077416442630100030383038303830383038
1004303724100000002822548251000100010003983130301830373037241532895100010002000303730371110011000001063077416442630100030383038303830383038

Test 2: Latency 1->1

Code:

  usra v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020430037241006129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100100071011611296340100001003003830038300383003830038
1020430037232006129548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102021009910010010000100000071011611296340100001003003830038300383003830038
10204300372320025329548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000071011611296340100001003003830038300383003830038
1020430037233008929548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000071011612296340100001003003830038300383003830038
102043003723201211729548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000071011611296340100001003003830038300383003830038
1020430037233006129548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000071011611296340100001003003830038300383003830038
1020430037233006129548251010010010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000071011611296340100001003003830038300383003830038
10204300372330061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000710116112963417100001003003830038300383003830038
1020430037233006129548251012510010000100100005004277313030018300373003728265328745101002001000020020000300373003711102011009910010010000100000071011611296340100001003003830038300383003830038
1020430037233006129548251010010010000100100005004277313030018300373003728265328764101002001000020020000300373003711102011009910010010000100000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037232061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037233061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037233061295482510010101000010100005042773131300183003730037282873287671001020100002020998300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037233061295482510010101000010100005042773131300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037245061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001006640216222963010000103003830038300383003830038
100243003723306129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000104815640216222963010000103003830038300383003830038
1002430037233061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001003640216222963010000103003830038300383003830038
10024300372320103295482510010101000010100005042773130300183003730037282873287671001020100002021334300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037233061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037233061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  usra v0.8h, v0.8h, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372330000061295472510100100100001001000050042771601300183003730037282716287411010020010008200200163003730037211020110099100100100001000000211171711600296460100001003003830038300863003830038
10204300372330000061295472510112100100161001000050042771601300183003730037282756287411025420010008200200163008630037111020110099100100100001000000011171701600296460100001003008530038300383003830038
1020430037233000880612954725101001001000010010000500427716013005430037300372827111287411010020010008200203583003730037111020110099100100100001004000011171801600296460100001003003830038300383003830038
1020430037233001320061295472510100100100001001000050042771601300183003730037282711228741101002001000820020348300373003711102011009910010010000100000220011171701600296450100001003003830038300383003830038
10204300372330000061295472510100100100001001000050042771601300183003730037282716287401010020010008200200163003730037211020110099100100100001000003011171802400296840100001003008630038300383003830038
102043003723200900124295472510100100100001001000050042771601300903003730037282716287401010020010008200200163003730037111020110099100100100001000003011171701600296460100001003003830038300383003830038
1020430037232000001856295472510100100100001001000050042771601300183017930037282716287411010020010008200200163003730037111020110099100100100001000000011171701600296450100001003003830038300383003830038
102043003723300000699295472510100100100001001000050042771601300183003730037282716287411010020010008200200163003730037111020110099100100100001000000011171701600296450100001003003830038300383003830038
102043003723300120061295472510100100100001001000050042771601300183003730037282717287411010020010008200200163008630037111020110099100100100001000000011171802600296450100001003003830038300873003830038
10204300372330000061295472510100100100001001000050042771601300183003730037282717287411010020010008200200163003730037111020110099100100100001000005590011171701600296460100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100243003723200612954725100101010000101000050427716003001803003730037282863287671001020100002020000300373003711100211091010100001000064021622296290010000103003830038300383003830038
100243003723300612954725100101010000101000050427716003001803003730037282863287671001020100002020000300373003711100211091010100001000064021622296290010000103003830038300383003830038
100243003723300612954725100101010000101000050427716013001803003730037282863287671001020100002020000300373003711100211091010100001000064021622296290010000103003830038300383003830038
1002430037233004002954725100101010000101000050428121613001803003730037282863287671001020100002020000300373003711100211091010100001000064021621296290010000103003830038300383003830038
100243003723200612954725100101010000101000050427716003001803003730037282867287671001020100002020000300373003711100211091010100001000064021622296292010000103003830038300383003830038
100243003723300612954725100101010000101000050427716003001803003730037282863287671001020100002020000300373003711100211091010100001000064021622296290010000103003830038300383003830038
100243003723200612954725100101010000101000050427716013001803003730037282863287671001020100002020000300373003711100211091010100001010064021622296290010000103003830038300383003830038
1002430037233180612954725100101010000101000050427716003001803003730037282863287671001020100002020000300373003711100211091010100001000064021622296290010000103003830038300383003830038
100243003723300612954725100101010000101000050427716003001803003730037282863287671001020100002020000300373003711100211091010100001000064021622296290010000103003830038300383003830038
100243003723300612954725100101010000101000050427716003001803003730037282863287671001020100002020000300373003711100211091010100001000064021622296290010000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  usra v0.8h, v8.8h, #3
  movi v1.16b, 0
  usra v1.8h, v8.8h, #3
  movi v2.16b, 0
  usra v2.8h, v8.8h, #3
  movi v3.16b, 0
  usra v3.8h, v8.8h, #3
  movi v4.16b, 0
  usra v4.8h, v8.8h, #3
  movi v5.16b, 0
  usra v5.8h, v8.8h, #3
  movi v6.16b, 0
  usra v6.8h, v8.8h, #3
  movi v7.16b, 0
  usra v7.8h, v8.8h, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2509

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204200651550712580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000751111011901600200621600001002006620066200662006620066
160204200651550292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000751111011901600200621600001002006620066200662006620066
16020420065155029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111017101600200621600001002006620066200662006620066
1602042006515602925801161008001610080028500640196020045200652006561280128200800282001600562006520065111602011009910010016000010001711111011901600200621600001002006620066200662006620066
160204200651560292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000631111011901600200621600001002006620066200662006620066
160204200651560292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001001781111011901600200621600001002006620066200662006620066
16020420065156029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065155029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065155029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
1602042006515502925801161008001610080028500640196020045200652006561280128200800282001600562006520065111602011009910010016000010001051111011901600200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 07 | 08 | 09 | 0a | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | c2 | c5 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002420054155200100000045258001010800001080000506400001152002720046200463228001020800002016000020046200461116002110910101600001000000000001003782114202111615200431500160000102004720047200472004720047
16002420046156400100000045258001010800001080000506400001152002720050200463228001020800002016000020046200461116002110910101600001000000000001003982117202111416200441500160000102004720047200472004720047
16002420046156100000030045258001010800801080000506400001152002720047200463228001020800002016000020046200461116002110910101600001000000000001003982116202111416200431500160000102004720047200472004720047
16002420046155200100000045258001010800001080000506400001152002720046200463228001020800002016000020046200461116002110910101600001000000000001004082114202111416200431500160000102004720047200472004720047
16002420046155100000000045258001010800001080000506400001152002720046200463228001020800002016000020046200471116002110910101600001000000000001003682115202111614200431500160000102004720047200472004720047
16002420046155000000000045258001010800001080000506400001152002920046200463228001020800002016000020046200461116002110910101600001000000000001003882115202111417200431500160000102004720047200472004720047
16002420046155000000030073258001010800001080000506400001152002720046200463228001020800002016000020046200461116002110910101600001000000000001003882116202111613200431500160000102004720047200482004720047
16002420046155000000000045258001010800001080000506400001152002820046200463228001020800002016000020046200461116002110910101600001000000000001003782114202111516200431500160000102004720047200472004720047
16002420046155000000000045258001010800001080000506400001152002720046200463228001020800002016000020046200461116002110910101600001000000000001003882115202111715200431500160000102004720047200472004720047
16002420046155000000000045258001010800001080000506400001152002720046200463228001020800002016000020046200461116002110910101600001000000000001003882114202111715200431500160000102004720047200472004720047

Test 5: throughput

Count: 16

Code:

  usra v0.8h, v16.8h, #3
  usra v1.8h, v16.8h, #3
  usra v2.8h, v16.8h, #3
  usra v3.8h, v16.8h, #3
  usra v4.8h, v16.8h, #3
  usra v5.8h, v16.8h, #3
  usra v6.8h, v16.8h, #3
  usra v7.8h, v16.8h, #3
  usra v8.8h, v16.8h, #3
  usra v9.8h, v16.8h, #3
  usra v10.8h, v16.8h, #3
  usra v11.8h, v16.8h, #3
  usra v12.8h, v16.8h, #3
  usra v13.8h, v16.8h, #3
  usra v14.8h, v16.8h, #3
  usra v15.8h, v16.8h, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020440039311000000302516010810016000810016002050012801321400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000010011110118016004003601600001004004040040400404004040040
16020440039310000000302516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016004003601600001004004040040400404004040040
16020440039310000000722516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016114063901600001004057040872408234082740772
1602044082231611615211213201730232416119110316166710016174750012930261407064061940983201278020473161765200161689202323362406164081416116020110099100100160000100223008005222103512206114069511600001004097940870409344052240623
1602044092731501313106811440478217416147010416117110016132150012905161405514072740675200816320345160820200161322200322684407184070514116020110099100100160000100002043530111103422179324036601600001004071640727407174072840726
16020440689316081010625280642616011610016001610016002850012801960400294004840049199769199861601282001600382003200764004940048111602011009910010016000010000000022210128123114004601600001004004940049400494005040050
160204400483110000006426160116100160016100160028500128019604002940048400481997610199861601282001600382003200764004840048111602011009910010016000010000000022210128123114004501600001004005040050400504004940050
160204400483100000001592616011610016001610016002850012801960400294004840049199769199861601282001600382003200764004840048111602011009910010016000010000010022210145123114004501600001004004940049400494005040049
16020440048310000000642716011610016001610016002850012801960400294004940048199769199861601282001600382003200764004840048111602011009910010016000010000000022210129123114004501600001004005040049400494004940049
16020440048311000000642616011610016001610016002850012801960400294004940049199769199861601282001600382003200764004840049111602011009910010016000010000000022210128123114004501600001004005040050400504005040050

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 07 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | a9 | ac | b8 | c2 | c5 | branch mispredict (cb) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600244005031110462516001010160000101600005012800001140020400394003919996320019160010201600002032000040039400391116002110910101600001000680000001002231102216211171740036206160000104004040040400404004040040
1600244003931000462516001010160000101600005012800001140020400394003919996320019160010201600002032000040039400391116002110910101600001000760000001002231101116111141740036206160000104004040040400404004040040
1600244003932210462516001010160000101600005012800000140020400394003919996320019160010201600002032000040039400391116002110910101600001000001200001002231101616211171740036155160000104004040040400404004040040
16002440039310007112516001010160388101600005012800001140020400394003919996320019160010201600002032000040039400391116002110910101600001000480000001002231101516111201440036155160000104004040040400404004040040
160024400393100279462516001010160000101600005012800001140020400394003919996320019160010201600002032000040039400391116002210910101600001000390000001002231101516111181640036155160000104004040040400404004040040
160024400393110067891600101016000010160000501280000114002040039400391999632001916001020160000203200004003940039111600211091010160000100080600001002231101516111171740036155160000104004040040400404004040040
16002440039310004032516001010160000101600005012800001140020400394003919996320019160010201600002032000040039400391116002110910101600001000620000001002231101416111191740036155160000104004040040400404004040040
1600244003931100462516001010160000101600005012800001140020400394003919996320019160010201600002032000040039400391116002110910101600001000360000001002231101616211201840036165160000104004040040400404004040040
1600244003931000462516001010160395101600005012800001140020400394003919996320019160010201600002032000040039400391116002110910101600001000001800001002231101816211161640036165160000104004040040400404004040040
160024400393100046251600101016000010160000501280000114002040039400391999632001916001020160000203200004003940039111600211091010160000100070300001002231101716211171640036155160000104004040040400404004040040