Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SQSHRUN2 (8H)

Test 1: uops

Code:

  sqshrun2 v0.16b, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
1004303722061254825100010001000398313301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
1004303722061254825100010001000398313301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
1004303722061254825100010001000398313301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
10043037220156254825100010001000398313301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
1004303722061254825100010001000398313301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
1004303723061254825100010001000398313301830373037241532895100010002000303730371110011000073116112692100030383038303830383038
1004303722061254825100010001000398313301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
1004303723061254825100010001000398313301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
1004303723061254825100010001000398313301830373037241532895100010002000303730371110011000073116112630100030383038303830383038
1004303723061254825100010001000398313301830373037241532895100010002000303730371110011000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  sqshrun2 v0.16b, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722500006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250024306129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
102043003722400006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
102043003722500906129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722400612954825100101010000101000050427731303001803003730037282873287671001020100002020000300373003711100211091010100001000640316442963010000103003830038300383003830038
10024300372251980612954825100101010000101000050427731303001803003730037282873287671001020100002020000300373003711100211091010100001000640416432963010000103003830038300383003830038
100243003722500612954825100101010000101000050427731303001803003730037282873287671001020100002020000300373003711100211091010100001000640416432963010000103003830038300383003830038
100243003722500612954825100101010000101000050427731313001803003730037282873287671001020100002020000300373003711100211091010100001000640416342963010000103003830038300383003830038
100243003722500612954825100101010000101000050427731313001803003730037282873287671001020100002020000300373003711100211091010100001000640416442963010000103003830038300383003830038
100243003722500822954825100101010000101000050427731303001803003730037282873287671001020100002020000300373003711100211091010100001000640416442963010000103003830038300383003830038
100243003722500612954825100101010000101000050427731313001803003730037282873287671001020100002020000300373003711100211091010100001000640416342963010000103003830038300383003830038
100243003722500612954825100101010000101000050427731313001803003730037282873287671001020100002020000300373003711100211091010100001000640316432963010000103003830038300383003830038
1002430037225001032954825100101010000101000050427731303001803003730037283073287671001020100002020000300373003711100211091010100001000640316342963010000103003830038300383003830038
100243003722400612954825100101010000101000050427731303001803003730037282873287671001020100002020000300373003711100211091010100001000640416442963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  sqshrun2 v0.16b, v0.8h, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722500000006129547251010010010000100100005004277160300183003730037282716287411010020010342204200163003730037111020110099100100100001000000003011171701600296460100001003003830038300383003830038
102043003722500000006129547251010010010000100100005004277160300183003730037282717287411010020010008200200163003730037111020110099100100100001000000000011171701600296450100001003003830038300383003830038
102043003722500000006129547251010010010000100100005004277160300183003730037282716287401010020010008200200163007930037111020110099100100100001000000000011171701601296455100001003003830038300383003830038
1020430037225000024352018129547251010010010000104100005004277160300183003730037282526287411010020010008200200163003730037111020110099100100100001000000203011171701600297460100001003003830038300383003830038
1020430037225000151006129547251010010010000100100005004277160300183003730037282717287411010020010008200200163003730037111020110099100100100001000000000011171801600296450100001003003830038300383003830038
102043003722500000006129547251010010010000100100005004277160300183003730037282926287411010020010008200200163003730037111020110099100100100001002200000011171803200296460100001003003830038300383003830038
1020430037225000000010329547251010010010000100100005004277160300183003730037282716287411010020010008200200163003730037111020110099100100100001000000000011171701600296460100001003003830038300383003830038
1020430037224000027006129547251010010010000100100005004277160300183003730037282716287411010020010008200200163003730037111020110099100100100001000000000011171721600296460100001003003830038300383003830038
102043003722500000001460295472510100100100001001000050042771603001830037300372827117287401010020010008200200163003730037111020110099100100100001000000000011171801600296450100001003003830038300383003830038
102043003722500000011032954725101001001000010010000500427716030018300853003728271628741101002001000820020016300373003711102011009910010010000100002012250760111937097103002213100001003051230513305183046730510

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 07 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024300372250061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372240061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sqshrun2 v0.16b, v8.8h, #3
  movi v1.16b, 0
  sqshrun2 v1.16b, v8.8h, #3
  movi v2.16b, 0
  sqshrun2 v2.16b, v8.8h, #3
  movi v3.16b, 0
  sqshrun2 v3.16b, v8.8h, #3
  movi v4.16b, 0
  sqshrun2 v4.16b, v8.8h, #3
  movi v5.16b, 0
  sqshrun2 v5.16b, v8.8h, #3
  movi v6.16b, 0
  sqshrun2 v6.16b, v8.8h, #3
  movi v7.16b, 0
  sqshrun2 v7.16b, v8.8h, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 0b | 19 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d6 | dd | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602042009115000029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100001011110119160200621600001002006620066200662006620066
1602042006515010029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100000011110119160200621600001002006620066200662006620066
1602042006515000029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100000011110119160200621600001002006620066200662006620066
1602042006515000029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100000011110119160200621600001002006620066200662006620066
1602042006515000029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100000011110119160200621600001002006620066200662006620066
160204200651500002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000513911110119160200621600001002006620066200662006620066
16020420065150000124258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100000011110119160200621600001002006620066200662006620066
1602042006515000029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100000011110119160200621600001002006620066200662006620066
1602042006515000029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100000011110119160200621600001002006620066200662006620066
1602042006515000029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100000011110119160200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 09 | 18 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | 74 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002420107161000452780010108000010800005064000011020032200512005103228001020800002016000020051200511116002110910101600001001151002532111252114220048201160000102024620065200742007420052
160024200511610008727800101080000108000050640000100200322005120051032280010208000020160000200512005111160021109101016000010023100273114252114220048201160000102024320065200522005220052
1600242005116100056227800101080000108000050640000100200322005120051032280010208000020160000200602005111160021109101016000010003100533114252114420048201160000102026620074200612006120061
160024200601610004532800101080000108000050640000000200412006020060032280010208000020160000200602006011160021109101016000010030100273114252114220048201160000102022720074200522006120052
16002420060161105117929800101080000108000050640000110200322005120051032280010208000020160000200512005111160021109101016000010023100256112252112420048201160000102020120074200612005220052
160024200511610005129800101080000108000050640000100200322005120051032280010208000020160000200512005111160021109101016000010050100273114252112420057201160000102019320074200522006120061
1600242006016100094129800101080000108000050640000110200412006020060032280010208000020160000200602006011160021109101016000010010100253114252112420048201160000102019020065200532005220052
160024200511570004527800101080000108000050640000110200322005120051032280010208000020160000200512005111160021109101016000010049100293112252114220048201160000102019320065200522005220052
1600242005116100124529800101080000108000050640000100200322005120051532280010208000020160000200512005111160021109101016000010020100303124252114220048201160000102018120074200522005220052
160024200511610004527800101080000108000050640000100200322005120051034380010208000020160000200512005311160021109101016000010003100283114252114420048201160000102020520065200522005220051

Test 5: throughput

Count: 16

Code:

  sqshrun2 v0.16b, v16.8h, #3
  sqshrun2 v1.16b, v16.8h, #3
  sqshrun2 v2.16b, v16.8h, #3
  sqshrun2 v3.16b, v16.8h, #3
  sqshrun2 v4.16b, v16.8h, #3
  sqshrun2 v5.16b, v16.8h, #3
  sqshrun2 v6.16b, v16.8h, #3
  sqshrun2 v7.16b, v16.8h, #3
  sqshrun2 v8.16b, v16.8h, #3
  sqshrun2 v9.16b, v16.8h, #3
  sqshrun2 v10.16b, v16.8h, #3
  sqshrun2 v11.16b, v16.8h, #3
  sqshrun2 v12.16b, v16.8h, #3
  sqshrun2 v13.16b, v16.8h, #3
  sqshrun2 v14.16b, v16.8h, #3
  sqshrun2 v15.16b, v16.8h, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 1e | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5d | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204400593000016512516010810016000810016002050012801320140020400394003919977619990160120200160032200320064400394003911160201100991001001600001000001111011801600400361600001004004040040400404004040040
16020440039300001512516010810016000810016002050012801320140020400394003919977619990160120200160032200320064400394003911160201100991001001600001000001111011801600400361600001004004040040400404004040040
160204400393000011772516010810016000810016002050012801320040020400394003919977619990160120200160032200320064400394003911160201100991001001600001000031111011815000400361600001004004040040400404004040040
160204400392990011982516010810016000810016002050012801320040020400394003919977619990160120200160032200320064400394003911160201100991001001600001000001111011801600400361600001004004040040400404004040040
16020440039300001512516010810016000810016002050012801320040020400904003919977619990160120200160032200320064400394003911160201100991001001600001000001111011801600400361600001004004040040400404004040040
160204400393000011142516010810016000810016002050012801320040020400394003919977619990160120200160032200320064400394003911160201100991001001600001000001111011801600400361600001004004040040400404004040040
160204400393000011352516010810016000810016002050012801320040020400394003919977619990160120200160032200320064400394003911160201100991001001600001000001111011801600400361600001004004040040400404004040040
160204400393000011832516010810016000810016002050012801320040020400394003919977619990160120200160032200320064400394003911160201100991001001600001000031111013501600400361600001004004040040400404004040040
16020440039300165412902516010810016000810016002050012801320040020400394003919977619990160120200160032200320064400394003911160201100991001001600001006018761111011801600400361600001004004040040400404004040040
160204400393000011772516010810016000810016002050012801320040020400394003919977619990160120200160032200320064400394003911160201100991001001600001000001111011801600400361600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c5 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024400513000521251600101016000010160000501280000114002040039400391999632001916001020160000203200004003940039111600211091010160000100000001002432141622223400363010160000104004040040400404004040040
1600244003930004625160010101600001016000050128000001400204003940039199963200191600102016000020320000400394003911160021109101016000010000000100223113162113440036155160000104004040040400404004040040
1600244003930004625160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000000100223113162113340036155160000104014140040400404004040040
1600244003930004625160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000000100223113162114240036155160000104004040040400404004040040
1600244003930004625160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000000100223114162123340036155160000104004040040400404004040040
1600244003929906725160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000000100223213162123240036155160000104004040040400404004040040
1600244003930006725160010101600001016000050128000001400204003940039199963200191600102016000020320000400394003911160021109101016000010000000100243213162112340036155160000104004040040400404004040040
1600244003929904625160010101600001016000050128000001400204003940039199963200191600102016000020320000400394003911160021109101016000010000000100223113162123340036155160000104004040040400404004040040
1600244003930004625160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000000100223112164113240036155160000104004040040400404004040040
1600244003930004625160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003921160021109101016000010000000100226122164113240036155160000104004040040400404004040040