Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SQRSHRN2 (4S)

Test 1: uops

Code:

  sqrshrn2 v0.8h, v1.4s, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | a9 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10043037230612548251000100010003983131301830373037241532895100010002000303730371110011000160073316222630100030383038303830383038
1004303723061254825100010001000398313130183037303724153289510001000200030373037111001100000073216222630100030383038303830383038
1004303722061254843100010001000398313030183037303724153289510001000200030373037111001100000073216222630100030383038303830383038
1004303723061254825100010001000398313030183037303724153289510001000200030373037111001100000073216222630100030383038303830383038
1004303722061254825100010001000398313030183037303724153289510001000200030373037111001100000073216222630100030383038303831333038
1004303722061254825100010001000398313130183037303724153289510001000200030373037111001100000073216222646100030383038303830383038
1004303723061254825100010001000398313030183037303724153289510001000200030373037111001100000073216222630100030383038303830383038
1004303723061254825100010001000398313130183037303724153289510001000200030373037111001100000073216222672100030383038303830383038
1004303722061254825100010001000398313030183037303724153289510001000200030373037111001100000073216222630100030383038303830383038
1004303722061254825100010001000398313130183037303724153289510001000200030373037111001100000073216222630100030383038303830383038

Test 2: Latency 1->1

Code:

  sqrshrn2 v0.8h, v1.4s, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372250000000232295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037225000000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300863010030038
1020430037225000000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
102043003722401000301032954825101001001000010010000500427731313001830037300372826525287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
1020430037225000000061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
102043003722500000141061295482510100100100001161000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000000000710116112963415100001003003830038300383003830227
102043003722400000008229548251010010010000100100005004277313130018300373003728272122874510100200100002002000030037300371110201100991001001000010000311427830733116112985028100001003074030994308943095430276
10204305172321108910597045774294942821033018610144146111927454301774130270304163045728295372890211185222113302342271630359304148110201100991001001000010044313021457828514105122994231100001003036830370304203041830421
10204304642280119811917045265294662001020814910072157112677184288169130306305143046628302312887811335226113242242266430232303241011020110099100100100001000021042509828894106312982031100001003027530552305623047030407

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722500000906129548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103008630132300383003830038
100243003722500000006129548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
1002430037233000000061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000430906402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722400000006129548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000606402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  sqrshrn2 v0.8h, v0.4s, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372250000300082295472510100100100001041045050042771603001830037300372827162874110100200100082002001630037300371110201100991001001000010000000011171801600296460100001003023230038300383003830038
10204300372240000000804295472510100100100001001000050042771603001830037300372825262873310100200100002002000030037300371110201100991001001000010000000411172222422296290100001003003830038300383003830038
1020430037225000000097295472510100100100001001000050042771603001830037300372825262873310100200100002002000030037300371110201100991001001000010000000011172222422296290100001003003830038300383003830038
1020430037225000000061295472510100100100001001000050042771603001830037300372827172874110100200100082002001630037300371110201100991001001000010000000011171701600296460100001003003830038300383003830038
10204300372240000000536295472510100100100001041000052242771603001830037300372827162874110100200100082002001630037300371110201100991001001000010000106011172222422296290100001003003830038300383003830038
1020430037225000000097295472510100100100001001000050042771603001830037300372825262873310100200100002002000030037300371110201100991001001000010000003011172222422296290100001003003830038300383003830038
1020430037225000000061295472510100100100001001000050042771603001830037300372827172874010100200100082002001630037300371110201100991001001000010000000011171801600296460100001003003830038300383003830038
1020430037225000000061295472510100100100001001000050042771603001830037300372827172874010100200100082002001630037300371110201100991001001000010000000011172223122296290100001003003830038300383008730038
102043003722400004080097295472510100100100001001000050042771603001830037300372825262873310100200100002002000030037300371110201100991001001000010000000011172222422296290100001003023030038300383003830038
1020430037225000000097295472510100100100081001000050042771603001830037300372825262873310100200100002002000030037300371110201100991001001000010000003011172222422296290100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a9 | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024300372250000612954725100101010000101000050427716030018030037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
10024300372240000612954725100101010000101000066427716030018030037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
10024300372250000612954725100101010000101000050427716030018030037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
10024300372250000612954725100101010000101000050427716030018030037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
10024300372250000612954725100101010000101000050427716030018030037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
10024300372250000612954725100101010000101000050427716030018030037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
10024300372250000612954725100101010000101000050427716030018030037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
10024300372250000612954725100101010000101000050427716030018030037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
10024300372250000612954725100101010000101000050427716030018030037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
10024300372250000612954725100101010000101000050427716030018030037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sqrshrn2 v0.8h, v8.4s, #3
  movi v1.16b, 0
  sqrshrn2 v1.8h, v8.4s, #3
  movi v2.16b, 0
  sqrshrn2 v2.8h, v8.4s, #3
  movi v3.16b, 0
  sqrshrn2 v3.8h, v8.4s, #3
  movi v4.16b, 0
  sqrshrn2 v4.8h, v8.4s, #3
  movi v5.16b, 0
  sqrshrn2 v5.8h, v8.4s, #3
  movi v6.16b, 0
  sqrshrn2 v6.8h, v8.4s, #3
  movi v7.16b, 0
  sqrshrn2 v7.8h, v8.4s, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | 74 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602042009015002925801161008001610080028500640196012004520065200650612801282008002820016005620065200651116020110099100100160000100001111012681689200621600001002006620066200662006620066
1602042006515102949802221008001610080028500640196012004520065200650612801282008002820016005620065200651116020110099100100160000100001111012281689200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196012004520065200650612801282008013620016005620065200651116020110099100100160000100001111012231683200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196012004520065200650612801282008002820016005620065200651116020110099100100160000100101111012881628200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196012004520065200650612801282008002820016005620065200651116020110099100100160000100001111012781688200621600001002006620066200662006620066
16020420065151029258011610080016100800285006401960120045200652006506128012820080028200160056200652006511160201100991001001600001000011110125616810200621600001002006620066200662006620066
1602042006515092925801161008001610080028500640196112004520065200650612801282008002820016005620065200651116020110099100100160000100001111012771677200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196012004520065200650612801282008002820016005620065200651116020110099100100160000100001111012731666200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196012004520065200650612801282008002820016005620065200651116020110099100100160000100001111012681688200621600001002014720150200662006620146
1602042006515102925801161008001610080028500640196012004520065200650612801282008002820016005620065200651116020110099100100160000100031111012731687200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024200881500283258001010800001080000506400001152002720046200463228001020800002016000020050200461116002110910101600001000100261331620411442004332160000102004720047200472004720047
1600242004615002332580010108000010800005064000011102002720046200463228001020800002016000020046200461116002110910101600001000100271351620212442004364160000102004720047200472004720047
1600242004615002232580010108000010800005064000001102002720046200463228001020800002016000020046200461116002110910101600001003100271351424211762004332160000102004720047200472004720047
16002420046150017662580010108000010800005064000011102002720046200463228001020800002016000020046200461116002110910101600001000100271361420211642004332160000102004720047200472004720047
1600242004615002202580010108000010800005064000011102002720046200463228001020800002016000020046200461116002110910101600001000100291361724421462004376160000102005120047200512005120051
16002420050150022425801151080000108000050640000011020027200462004632280010208000020160000200462004611160021109101016000010078100291361320211672004364160000102004720047200472004720047
1600242004615002182580010108000010800005064000011102002720046200463228001020800002016000020046200461116002110910101600001000100271361420211472004332160000102004720047200472004720047
1600242005015101842580010108000010800005064000011102002720050200463228001020800002016000020046200461116002110910101600001000100261361320221442004332160000102004720047200472004720047
1600242004615101832580010108000010800005064000011102002720046200463228001020800002016000020046200461116002110910101600001000100271361420211442004332160000102004720047200472004720047
1600242004615001972580010108000010800005064000011102002720046200463228001020800002016000020046200461116002110910101600001000100301362720211662004332160000102004720047200472005120047

Test 5: throughput

Count: 16

Code:

  sqrshrn2 v0.8h, v16.4s, #3
  sqrshrn2 v1.8h, v16.4s, #3
  sqrshrn2 v2.8h, v16.4s, #3
  sqrshrn2 v3.8h, v16.4s, #3
  sqrshrn2 v4.8h, v16.4s, #3
  sqrshrn2 v5.8h, v16.4s, #3
  sqrshrn2 v6.8h, v16.4s, #3
  sqrshrn2 v7.8h, v16.4s, #3
  sqrshrn2 v8.8h, v16.4s, #3
  sqrshrn2 v9.8h, v16.4s, #3
  sqrshrn2 v10.8h, v16.4s, #3
  sqrshrn2 v11.8h, v16.4s, #3
  sqrshrn2 v12.8h, v16.4s, #3
  sqrshrn2 v13.8h, v16.4s, #3
  sqrshrn2 v14.8h, v16.4s, #3
  sqrshrn2 v15.8h, v16.4s, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire (01) | cycle (02) | 03 | 19 | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204400612990051251601081001600081001600205001280132004002040039400391997761999016012020016003220032006440039400391116020110099100100160000100001111011801160040036001600001004004040040400404004040040
1602044003930000695251601081001600081001600205001280132014002040039400391997761999016012020016003220032006440039400391116020110099100100160000100001111011802160040036001600001004004040040400404004040040
160204400393000030251601081001600081001600205001280132014002040039400391997761999016012020016014420032006440039400391116020110099100100160000100031111011801160040036001600001004004040040400404004040040
160204400393000030251601081001600081001600205001280132004002040039400391997761999016012020016003220032006440039400391116020110099100100160000100001111011802160040036001600001004004040040400404004040040
160204400393000030251601081001600081001600205001280132014002040039400391997761999016012020016003820032007640049400481116020110099100100160000100002221012802231140046001600001004005040050400494005040049
160204400493000064261601161001600161001600285001280196014002940048400481997691998616012820016003820032007640048400481116020110099100100160000100002221012903231140046001600001004005040049400494005040049
160204400493000064261601161001600161001600285001280196114002940049400481997691998616012820016003820032007640049400481116020110099100100160000100002221012803231140046001600001004004940049400504004940049
160204400483000064261601161001600161001600285001280196004002940048400491997691998616012820016003820032007640048400491116020110099100100160000100002221012802232140045001600001004004940049400494005040049
160204400482990064261601161001600161001600285001280196004002940048400481997691998616012820016003820032007640048400481116020110099100100160000100002221012803231140046001600001004004940049400494004940049
160204400493000064261601161001600161001600285001280196014002940049400481997691998616012820016003820032007640049400481116020110099100100160000100002221012802231140045001600001004004940049400504004940049

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 19 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | cd | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002440050300004625160010101600001016000050128000011104002004003940039199963200191600102016000020320000400414003911160021109101016000010030010022132191642213640036040120160000104014340040400404004040040
1600244003930011716482516001010160099101600005012800001110400200400394003919996320019160010201600002032000040041400391116002110910101600001000001002213711216212564003602060160000104004040040400404004040040
160024400393000046251600101016000010160000501280000111040020040039400391999632001916001020160000203200004004140039111600211091010160000100300100221361101621111104003602060160000104004040040400404004040040
16002440039300012462516001010160000101600005012800001110400200400394003919996320019160010201600002032000040041400391116002110910101600001000001002213614162115104003604060160000104004040040400404004040040
1600244003929900462516001010160000101602105012824641110400200400394003919996320019160010201600002032081440041400391116002110910101600001000001002213616162116104003612060160000104004040040400404004040040
160024400393000046251600101016000010160000501280000111040020040039400391999632001916001020160000203200004004140039111600211091010160000100000100221361416211744003602060160000104004040040400404004040040
1600244003930006882516001010160000101600005012807881110400200400394003919996320019160010201600002032000040039400391116002110910101600001000001002213615164124640036020120160000104004040040400404004040040
1600244003929900462516001010160000101600005012800001110400200400394003919996320019160010201600002032000040039400391116002110910101600001000001002413617164124640036020120160000104004040040400404004040040
160024400393000052251600101016000010160000501280000111040020040039400391999632001916001020160000203200004003940039111600211091010160000100000100221361516211464003602060160000104004040040400404004040040
16002440039299004625160010101600001016000050128000011104002004003940039199963200191600102016000020320000400394003911160021109101016000010000010022136112162124124003602060160000104004040040400404004040040