Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm: Overview | Base Instructions | SIMD and FP Instructions

SQSHRN2 (2D)

Test 1: uops

Code:

  sqshrn2 v0.4s, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a0 | a1 | a6 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10043037220000000103254825100010001000398313301830373037241532895100010002000303730371110011000000000073116112630100030383038303830383038
1004308323000000061254825100010001000398313301830373037241532895100010002000303730371110011000000000073116112630100030383038303830383038
10043037230000000103254825100010001000398313301830373037241532895100010002000303730371110011000000000073116112630100030383038303830383038
1004303723000000061254825100010001000398313301830373037241532895100010002000303730371110011000000000073116112630100030383038303830383038
1004303723000000061254825100010001000398313301830373037241532895100010002000303730371110011000000000073116112630100030383038303830383038
1004303723000000084254825100010001000399670301830373037241532895100010002000303730371110011000000000073116112630100030383038303830383038
1004303723000006061254825100010001000398313301830373037241532895100010002000303730371110011000000000073116112630100030383038303830383038
10043037220000000103254825100010001000398313301830373037241532895100010002000303730372110011000000000073116112630100030383038303830383038
1004303723000010061254825100010001000398313301830373037241532895100010002000303730371110011000000000073116112630100030383038303830383038
100430372200001128882254825100010001000398313301830373037241532915100010002000303730371110011000000003073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  sqshrn2 v0.4s, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722500000016829548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
102043003722500000047429548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830087
102043003722500000014529548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
102043003722500000047529548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
102043003722500000063629548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
102043003722500000050629548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
102043003722500000051029548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038
10204300372250009006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002430263225101101207292952144100281310024121044760428138410301620302263027328303728803103132010326202098230133301784110021109101010000100220122723266102412229774210000103013430133301323012130651
10024301802271005110688812829530811005811100081210596774289526003016203017930178282872428841101602010161202032630133301672110021109101010000100000002583268203162229707010000103013330038302093013230085
10024302742271011126728475929503441002810100001010000554278670003005403022730037282911228804104592010000222228430132300372110021109101010000100000006923066102162229668310000103008630038300383013130180
10024300842250000028352990822954825100101010000101000050428002700300900301323013228291328786100382210492242000030178303712110021109101010000100201100066102165229630210000103003830038300383013330038
100243003722400000001032954825100201210000101000055427731300300180300373003728287328767100102010000202000030037300371110021109101010000100000100064002162329700010000103003830038300383003830038
1002430037225001020014529548251001010100001210000504277313003001803013030083282871128767100102010000202000030037300371110021109101010000102000103064002162229630010000103003830038300383003830085
1002430037224000001201032954825100101010000101000050427731300300180300373003728287328786100102010000202000030037300371110021109101010000100000103064002162229630010000103003830038300383003830038
100243003722510000006129548251001010100001010000504277313003001803003730037282872128914116562010000202000030037300371110021109101010000100000000064002162229630010000103003830038300383003830038
10024300372250000000612954825100101010000101000050427731310300180300373003728287328767100102010000202000030037300371110021109101010000100000000064002162229630010000103003830038300383003830038
10024300372250000000612954825100101010000101000050427731303300180300373003728287328767100102010000202000030037300371110021109101010000100000000064032162229630010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  sqshrn2 v0.4s, v0.2d, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 07 | 0a | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020430037225114326129547251010010010000100100005004277160300183003730037282716287401010020010008200200163003730037111020110099100100100001000000011171711611296480100001003003830038300853003830038
1020430037225112946129547251010010010000100100005004277160300183003730037282716287411010020010008200200163003730037111020110099100100100001000000011171711611296470100001003003830038300383003830038
1020430037225114986129547251010010010000100100005004277160300183003730037282716287411010020010008200200163003730037111020110099100100100001000000011171711611296470100001003003830038300383003830038
1020430037225114896129547251010010010000100100005004277160300183003730037282717287401010020010008200200163003730037111020110099100100100001000000011171811611296470100001003003830038300383003830038
1020430037225114986129547251010010010000100100005004277160300183003730037282716287741010020010008200200163003730037111020110099100100100001000300011171711611296480100001003003830038300383003830038
1020430037225114836129547251010010010000100100005004277160300183003730037282716287401010020010008200200163003730037111020110099100100100001000000011171711611296480100001003003830038300383003830038
1020430037225114236129547251010010010000100100005004277160300183003730037282716287401010020010008200200163003730037111020110099100100100001000000011171711611296480100001003003830038300383003830038
102043003722511061295472510100100100001001000050042771603001830037300372827172874010100200100082002001630037300371110201100991001001000010002203011171711611296470100001003003830038300383003830038
1020430037224115196129547251010010010000100100005004277160300183003730037282716287401010020010008200200163003730037111020110099100100100001000000011171811611296480100001003003830038300383003830038
1020430037225114656129547251010010010000100100005004277160300183003730037282717287411010020010008200200163003730037111020110099100100100001000000011171711611296480100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722400099229547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250006129547251001010100001010000504277160030018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250006129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250006129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250006129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250006129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250006129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250006129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103008530038300383003830038
10024300372250006129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
10024300372250006129547251001010100001010000504277160030018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sqshrn2 v0.4s, v8.2d, #3
  movi v1.16b, 0
  sqshrn2 v1.4s, v8.2d, #3
  movi v2.16b, 0
  sqshrn2 v2.4s, v8.2d, #3
  movi v3.16b, 0
  sqshrn2 v3.4s, v8.2d, #3
  movi v4.16b, 0
  sqshrn2 v4.4s, v8.2d, #3
  movi v5.16b, 0
  sqshrn2 v5.4s, v8.2d, #3
  movi v6.16b, 0
  sqshrn2 v6.4s, v8.2d, #3
  movi v7.16b, 0
  sqshrn2 v7.4s, v8.2d, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2510

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602042008815037829258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065156050258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065151029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
160204200651500504258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
1602042006515033929258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
160204200651502129258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401960200452006520065612801282008002820016005620065200651116020110099100100160000100001111011901600200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 50 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a9 | ac | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600242007415000000300450258001010800001080000506400001152002720050200503228001020800002016000020046200461116002110910101600001000000001004183121202111819200430150160000102004720047200512004720047
160024200461510100048601510258001010800001080000506400001152002720046200463228001020800002016000020046200501116002110910101600001000000001004182118202111819200430150160000102004720047200472004720047
160024200461500000048600450258001010800001080000506400001152009820046200463228001020800002016000020046200461116002110910101600001000000001004132118202111919200430150160000102004720047200472004720047
16002420046150000000004502580010108000010800005064000010020027200462004632280010208000020160000200462004611160021109101016000010000002473010191321262152112220203760150160000102018620504204532045420616
16002420368153101668015280660168801151080632108000050645032115200272028720529271508032720805222016126220046205263116002110910101600001000000001004682116202111618200430150160000102004720047200472004720047
1600242004615000000000450258001010800001080000506400001052002720050200463228043220800002016000020046200461116002110910101600001000000001004082119202111918200430150160000102004720047200472004720047
1600242004615000000000450258001010800001080000506400001152002720046200463228001020800002016000020046200461116002110910101600001000000001004482118202112019200430150160000102004720047200472004720047
160024200461510000033300450258001010800001080000506400001052002720046200463228001020800002016000020046200461116002110910101600001000000001004182120202121818200430300160000102005120047200472004720047
16002420050150000000002370258001010800001080000506400001152002720046200463228001020800002016000020046200461116002110910101600001000000001004231119202121919200470150160000102004720051200472004720047
1600242004615000000000450258001010800001080000506400001002002720046200463228001020800002016000020046200461116002110910101600001000000001004082217202111918200430150160000102004720047200472004720047

Test 5: throughput

Count: 16

Code:

  sqshrn2 v0.4s, v16.2d, #3
  sqshrn2 v1.4s, v16.2d, #3
  sqshrn2 v2.4s, v16.2d, #3
  sqshrn2 v3.4s, v16.2d, #3
  sqshrn2 v4.4s, v16.2d, #3
  sqshrn2 v5.4s, v16.2d, #3
  sqshrn2 v6.4s, v16.2d, #3
  sqshrn2 v7.4s, v16.2d, #3
  sqshrn2 v8.4s, v16.2d, #3
  sqshrn2 v9.4s, v16.2d, #3
  sqshrn2 v10.4s, v16.2d, #3
  sqshrn2 v11.4s, v16.2d, #3
  sqshrn2 v12.4s, v16.2d, #3
  sqshrn2 v13.4s, v16.2d, #3
  sqshrn2 v14.4s, v16.2d, #3
  sqshrn2 v15.4s, v16.2d, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 09 | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020440059300000024030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016004003601600001004004040040400404004040040
1602044003930000000030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016004003601600001004004040040400404004040040
1602044003930000000030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016004003601600001004004040040400404004040040
16020440039301000000695251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016004003601600001004004040040400404004040040
1602044003930000000030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016004003601600001004004040040400404004040040
1602044003930000000030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016004003601600001004004040040400404004040040
1602044003930000000030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016004003601600001004004040040400404004040040
16020440039300000027030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016004003601600001004004040040400404004040040
1602044003929900000030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016004003601600001004004040040400404004040040
1602044003930000000030251601081001600081001600205001280132400204003940039199776199901601202001600322003200644003940039111602011009910010016000010000000011110118016004003601600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 04 | 08 | 09 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | c2 | c5 | branch mispredict (cb) | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002440050300010652811600101016000010160000501280000014002040039400391999632001916001020160000203200004003940039111600211091010160000100000000100246229164224240036155160000104004040040400404004040040
1600244003930000063302516001010160000101600005012800000140020400394003919996320019160010201600002032000040039400391116002110910101600001000000001002432221642224400363010160000104004040040400404004040040
160024400393000000522516001010160000101600005012800000040020400394024519996320019160010201600002032000040039400391116002110910101600001000000001002231141621142400363010160000104004040040400404004040040
16002440039299000046251600101016000010160000501280000014002040039400391999632001916001020160000203200004003940039111600211091010160000100000000100223117162218840036155160000104004040040400404004040040
160024400393000000462516001010160000101600005012800000140020400394003919996320019160010201600002032000040039400391116002110910101600001000000001002231121621248400361510160000104004040040400404004040088
16002440039300000046251600101016000010160000501280000114002040039400391999632001916001020160000203200004003940039111600211091010160000100000000100226225162114540036155160000104004040040400404004040040
16002440039300000046251600101016000010160000501280000114002040039400391999632001916001020160000203200004003940039111600211091010160000100000000100223112162114240036155160000104004040040400404004040040
1600244003929900007112516001010160000101600005012800001140020400394024619996320019160010201600002032061640039400391116002110910101600001000000001002431121641124400361510160000104004040040400404004040040
160024400393000000462516001010160000101600005012800001140020400394003919996320019160010201600002032000040039400391116002110910101600001000000001002231141621144400361510160000104004040040400404004040040
160024400393000000462516001010160000101600005012800001140020400394003919996320019160010201600002032000040039400391116002110910101600001000000001002261131642124400361510160000104004040040400404004040040