Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SQRSHRUN2 (4S)

Test 1: uops

Code:

  sqrshrun2 v0.8h, v1.4s, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10043037230612548251000100010003983131301830373037241532895100010002000303730371110011000673216112630100030383038303830383038
100430372306125482510001000100039831313018303730372415328951000100020003037303711100110003073116112630100030383038303830383038
10043037220612548251000100010003983131301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
10043037220612548251000100010003983131301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
10043037230612548251000100010003983131301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
100430372206125482510001000100039831313018303730372415328951000100020003037303711100110005773116112630100030383038303830383038
100430372306625482510001000100039831313018303730372415328951000100020003037303711100110005473116112630100030383038303830383038
10043037220612548251000100010003983131301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
10043037220612548251000100010003983131301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
100430372206125482510001000100039831313018303730372415328951000100020003037303711100110002173116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  sqrshrun2 v0.8h, v1.4s, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020430037225000053629548251010010010000100100005004277313300183003730037282650328745101002001000020020000300373003711102011009910010010000100006071011611296340100001003003830038300383003830038
1020430037225000072629548251010010010000100100005004277313300183003730037282650328745101002001000020020000300373003711102011009910010010000100000071021611296348100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042813843001830037300372826503228889111882281049722822664304443042191102011009910010010000100212214828903101212992728100001003042030468304193042330417
1020430420228881068704523529485162102071431007214211192715428952630342302733041728295039288171133620010000200200003003730037111020110099100100100001000062730150212981430100001003032330324302303032330276
10204302772276553426421082954844101001001000012210000500427731330018300373003728271015287451010020010000200200003003730037111020110099100100100001000115071011611296340100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313300183003730037282650328745101002001000020020000300373003711102011009910010010000100000071011611296340100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313300183003730037282657328745101002001000020020000300373003711102011009910010010000100006071011611296340100001003003830038300383003830038
102043003722500006129548251010010010000100100006804277313300183003730037282650328745101002001000020020000300373003711102011009910010010000100003071011611296340100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313300183003730037282650328745101002001000020020000300373003711102011009910010010000100006071011611296340100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313300183003730037282650328745101002001000020020000300373003711102011009910010010000100003071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037225000000082295482510010101000010100005042773131300183003730037282873287671001020100002020000300373003711100211091010100001000003006423162229630010000103003830038300383003830038
10024300372240000300061295482510012101000010101495042773130300183003730037282873287671001220100002020000300373003711100211091010100001000002738006404162229630010000103003830038300383003830038
1002430037224000000061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
100243003722400001200103295482510010101000010100006042773131300183003730037282873287671001020100002020000300373003711100211091010100001000010006423162429630210000103003830038300383003830038
10024300372250000000103295482510010101000010100006042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
1002430037225000000061295482510010101000010100005042773130300183003730180282873287671015920100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
1002430037225000000061295482510010101000010100006542773131300183003730037282873287671001020100002020000300373003711100211091010100001000009006402162229630010000103003830038300383003830038
1002430037225000000061295482510010121000012100006042773130300183003730037282873287671001220100002020000300373003711100211091010100001000003006402162229630010000103003830038300383003830038
10024300372250000000612954825100101010000101000060427731303001830037300372828732876710010201000020200003003730037111002110910101000010000030006402162229630210000103003830038300383003830038
1002430037225000000061295482510010101000010100006042773130300183003730037282873287671001020100002020000300373003711100211091010100001000030006402162229630010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  sqrshrun2 v0.8h, v0.4s, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 0e | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372250000612954725101001001000010010000500427716013001830037300372827162874110100200100082002001630037300371110201100991001001000010000010011171701600296460100001003003830038300383003830038
10204300372250000612954725101001001000010010000500427716013001830037300372827162874010100200100082002001630037300371110201100991001001000010000010011171701600296460100001003003830038300383003830038
10204300372250000612954725101001001000010010000500427716013001830037300372827172874010100200100082002001630037300371110201100991001001000010000023011171801600296450100001003003830038300383003830038
10204300372240000612954725101001001000010010000500427716013001830037300372827172874110100200100082002001630037300371110201100991001001000010000013011171801600296460100001003003830038300383003830038
10204300372250000612954725101001001000010610000500427716013001830037300372825262873310100200100002002000030037300371110201100991001001000010000006011172222422296290100001003003830038300383003830038
10204300372250001972954725101001001000010010000500427716013001830037300372825262873310100200100002002000030037300371110201100991001001000010000023011172222422296290100001003003830038300383003830038
10204300372250001972954725101001001000010010000500427716013001830037300372825262873310100200100002002000030037300371110201100991001001000010000013011172222422296290100001003003830038300383003830038
102043003722500019729547251010010010000100100005004277160130018300373003728252628733101002001000020020000300373003711102011009910010010000100000048011172222422296290100001003003830038300383003830038
10204300372250001972954725101001001000010010000500427716013001830037300372825262873310100200100002002000030037300371110201100991001001000010000020011172222422296290100001003003830038300383008530038
10204300372250001972954725101001001000010010000500427716013001830037300372825262873310100200100002002000030037300371110201100991001001000010000000011172222422296290100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722512612954725100101010000101000050427716003001830037300372828632876710010201000020200003003730037111002110910101000010166404164429629010000103003830038300383003830038
100243003722506129547251001010100001010000504277160130018300373003728286328767100102010000202000030037300371110021109101010000102236404163429629010000103003830038300383003830038
10024300372240612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010136403163429629010000103003830038300383003830038
10024300372240612954725100101010000121000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010066403163429629010000103003830038300383003830038
10024300372250612954725100101010000101000050427716003001830037300372828632876710010201000020200003003730037111002110910101000010206403163429629010000103003830038300383003830038
10024300372250612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010136404164429629010000103003830038300383003830038
10024300372250612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010136404163429629010000103003830038300383003830038
10024300372250612954725100101010000101000050427716003001830037300372828632876710010201000020200003003730037111002110910101000010136404163429629010000103003830038300383003830038
10024300372240612954725100101010000101000050427716003001830037300372828632876710010201000020200003003730037111002110910101000010036404163429629010000103003830038300383003830038
100243003722506129547251001010100001010000504277160030018300373003728286328767100102010172202000030037300371110021109101010000102536403163429629010000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sqrshrun2 v0.8h, v8.4s, #3
  movi v1.16b, 0
  sqrshrun2 v1.8h, v8.4s, #3
  movi v2.16b, 0
  sqrshrun2 v2.8h, v8.4s, #3
  movi v3.16b, 0
  sqrshrun2 v3.8h, v8.4s, #3
  movi v4.16b, 0
  sqrshrun2 v4.8h, v8.4s, #3
  movi v5.16b, 0
  sqrshrun2 v5.8h, v8.4s, #3
  movi v6.16b, 0
  sqrshrun2 v6.8h, v8.4s, #3
  movi v7.16b, 0
  sqrshrun2 v7.8h, v8.4s, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 08 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | icache miss (d3) | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020420091151000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000011110122031644200621600001002006620066200662006620066
16020420065151000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001001311110122041655200621600001002006620066200662006620066
16020420065150000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001001011110122051645200621600001002006620066200662006620066
16020420065150000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000011110122051654200621600001002006620066200662006620066
160204200651510002925801161008001610080028500640196020045200652006561280128200800282001600562006520065111602011009910010016000010015611110123041644200621600001002006620066200662006620066
16020420065150000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000011110122041645200621600001002006620066200662006620066
16020420065150000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000011110123051655200621600001002006620066200662006620066
16020420065151000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000011110123041645200621600001002006620066200662006620066
16020420065150000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000011110122041645200621600001002006620066200662006620066
16020420065150000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001001311110122041644200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 09 | 0b | 1e | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002420075150010045298001010800001080000506400001120032020062200513228001020800002016000020051200511116002110910101600001000000021100326225342116520048401160000102005220052200522005220052
1600242006215000004529800101080000108000050640000112003202005120051322800102080000201600002005120051111600211091010160000100000000100293116252116720048201160000102005220052200612005220052
1600242005115000004529800101080000108000050640000112003202005120051322800102080000201600002005120051111600211091010160000100000003100286119252116620048202160000102005220052200522005220052
16002420062150000045298001010800001080000506400001120032020051200513228001020800002016000020051200511116002110910101600001002001015100293115253118520048201160000102005220131201332005220061
16002420051150101504529801141280000108000050640000112003202005120051322800102080000201600002005120051111600211091010160000100000006100293116252116520048201160000102005220052200522006320052
1600242005115000004529800101080000108000050640000112003202005120051322800102080000201600002005120051111600211091010160000100000006100283117252116720048201160000102005220052200522005220052
16002420060150001204533800101080000108000050640000112003202005120051322800102080000201600002005120051111600211091010160000100000000100303116252116820048201160000102005220052200522005220052
1600242005115000004529800101080000108000050640000112003202005120051322800102080000201600002005120051111600211091010160000100000103100293116252115720048201160000102005220052200522005220061
16002420060150002104524480638108063010800005064000011200320200512005132280010208000020160000200512005111160021109101016000010000018036100313118252116520048202160000102005220052200522005220052
16002420060163000018533800101080000108000050640000112003202005120051322800102080000201600002005120051111600211091010160000100000000100303117252115420048411160000102005220052200522005220063

Test 5: throughput

Count: 16

Code:

  sqrshrun2 v0.8h, v16.4s, #3
  sqrshrun2 v1.8h, v16.4s, #3
  sqrshrun2 v2.8h, v16.4s, #3
  sqrshrun2 v3.8h, v16.4s, #3
  sqrshrun2 v4.8h, v16.4s, #3
  sqrshrun2 v5.8h, v16.4s, #3
  sqrshrun2 v6.8h, v16.4s, #3
  sqrshrun2 v7.8h, v16.4s, #3
  sqrshrun2 v8.8h, v16.4s, #3
  sqrshrun2 v9.8h, v16.4s, #3
  sqrshrun2 v10.8h, v16.4s, #3
  sqrshrun2 v11.8h, v16.4s, #3
  sqrshrun2 v12.8h, v16.4s, #3
  sqrshrun2 v13.8h, v16.4s, #3
  sqrshrun2 v14.8h, v16.4s, #3
  sqrshrun2 v15.8h, v16.4s, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602044005930000000302516010810016000810016002050012810121400204003940039199776200171601202001600322003200644003940346111602011009910010016000010000030111101180016004003601600001004004040346401014009140402
160204405593041591323880183421216098710216069310016104851112874801404344029940496200693820207161147200160550202322128405804061111116020110099100100160000100221247212211029103170454056411600001004072640724407214072440679
1602044040730511312158710561368251161369100161264102161239522129050414055040667403582007435203131613302001600002003200004009840048111602011009910010016000010000000111101200224224004501600001004004940049400494004940049
1602044004830000130762716010010016000010016000050012800000400294004840048199716199941601002001600002003200004004840048111602011009910010016000010000000111101200224224004501600001004004940049400494004940049
1602044004830000027016027160100100160000100160000500128000014002940048400481997161999416010020016000020032000040048400481116020110099100100160000100001001111012036323114004501600001004004040040400404004040040
160204400393000000010382516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000111101180016004003601600001004004040040400404004040040
1602044003930000000302516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000111101180016004003601600001004004040040400404004040040
16020440039300000005052516010810016000810016002050012801320400204009040039199776199901601202001600322003200644003940039111602011009910010016000010000000111101180016004003601600001004004040040400404004040040
1602044003930000000302516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039211602011009910010016000010000000111101180016004003601600001004004040040400404004040040
1602044003930000000302516010810016000810016002050012801320400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000111101180016004003601600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 09 | 0b | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | c5 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600244005130000046251600101016000010160000501280000115400200400394003919996320048160010201600002032000040039400391116002110910101600001000150010023162211164222121400361510160000104004040040400404004040040
160024400393000005225160010101600001016000050128000001104002004003940039199963200191600102016000020320000400394003911160021109101016000010000001002316629162211520400363010160000104004040040400404004040040
160024400392990004699160010101600001016000050128000001104002004003940039199963200191600102016000020320000400394003911160021109101016000010023001002316512016222219400363010160000104004040040400404004040040
160024400393000005225160010101600001016000050128000011104002004003940039199963200191600102016000020320000400394003911160021109101016000010000001002516512016212209400363010160000104004040043400404004040040
160024400393000005225160010101600001016000050128000011104002004003940039199963200191600102016014020320000400394003911160021109101016000010000001002513622016212188400363010160000104004040040400404004040040
160024400392990005225160010101600001016000050128000011104002004003940039199963200191600102016000020320000400394003911160021109101016000010000001002516512016421212140036305160000104004040040400404004040040
1600244003930000946251600101016000010160000501280000111040020040039400391999632001916001020160000203200004003940039111600211091010160000100000010025135281621182040036155160000104004040040400404004040040
16002440039300000468016001010160000101600005012800001110400200400394003919996320019160010201600002032000040039400391116002110910101600001000000100231351201621192040036155160000104004040040400404004040040
16002440039300000462516001010160000101600005012800001110400200400394003919996320019160010201600002032000040039400391116002110910101600001000100100251351201621192040036155160000104004040040400404004040040
16002440039299000882516001010160000101600005012800001110400200400394003919996320019160010201600002032000040039400391116002110910101600001000000100231351201621192040036155160000104004040040400404004040040