Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SQSHL (immediate, scalar, S)

Test 1: uops

Code:

  sqshl s0, s0, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10042037150611686251000100010002645211201820372037157131895100010001000203720371110011000073116111786100020382038203820382038
10042037150611686251000100010002645211201820372037157131895100010001000203720371110011000073116111786100020382038203820382038
10042037150611686251000100010002645211201820372037157131895100010001000203720371110011000073116111786100020382038203820382038
10042037150611686251000100010002645211201820372037157131895100010001000203720371110011000073116111786100020382038203820382038
10042037150611686251000100010002645211201820372037157131895100010001000203720371110011000073116111786100020382038203820382038
10042037150611686251000100010002645211201820372037157131895100010001000203720371110011000073116111786100020382038203820382038
10042037150611686251000100010002645211201820372037157131895100010001000203720371110011000073116111786100020382038203820382038
10042037160611686251000100010002645211201820372037157131895100010001000203720371110011000073116111786100020382038203820382038
10042037150611686251000100010002645211201820372037157131895100010001000203720371110011000073116111786100020382038203820382038
10042037150611686251000100010002645211201820372037157131895100010001000203720371110011000073116111786100020382038203820382038

Test 2: Latency 1->2

Code:

  sqshl s0, s0, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a7 | a8 | a9 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150000611968625101001001000010010000500284752120018200372003718421318745101002001000020010000200372003711102011009910010010000100000007101161119791100001002003820038200382003820038
1020420037150000611968625101001001000010010000500284752120018200372003718421318745101002001000020010000200372003711102011009910010010000100000007101161119791100001002003820038200382003820038
1020420037150000611968625101001001000010010000500284752120018200372003718421318745101002001000020010000200372003711102011009910010010000100000007101161119791100001002003820038200382003820038
1020420037150060611968625101001001000010010000500284752120018200372003718421318745101002001000020010000200372003711102011009910010010000100000007101161119791100001002003820038200382003820038
1020420037150000611968625101001001000010010000500284752120018200372003718421318745101002001000020010000200372003711102011009910010010000100000007101161119791100001002003820038200382003820038
1020420037150000611968625101001001000010010000500284752120018200372003718421318745101002001000020010000200372003711102011009910010010000100000007101161119791100001002003820038200382003820038
1020420037150000611968625101001001000010010000500284752120018200372003718421318745101002001000020010000200372003711102011009910010010000100000007101161119791100001002003820038200382003820038
1020420037150000611968625101001001000010010000500284752120018200372003718421318745101002001000020010000200372003711102011009910010010000100000007101161119791100001002003820038200382003820038
1020420037150000611968625101001001000010010000500284752120018200372003718421318745101002001000020010000200372003711102011009910010010000100000007101161119791100001002003820038200382003820038
1020420037150000611968625101001001000010010000500284752120018200372003718421318745101002001000020010000200372003711102011009910010010000100000007101161119791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500000611968625100101010000101000050284752112001820037200371844331876710010201000020100002003720037111002110910101000010000006403162219786010000102003820038200382003820038
10024200371500000611968625100101010000101000050284752112001820037200371844331876710010201000020100002003720037111002110910101000010000006402162219786010000102003820038200382003820038
10024200371500000611968625100101010000101000050284752112001820037200371844331876710010201000020100002003720037111002110910101000010000006402162219786010000102003820038200382003820038
1002420037150003420611968625100101010000101000050284752112001820037200371844331876710010201000020100002003720037111002110910101000010000006402162219786010000102003820038200382003820038
1002420037150002580611968625100101010000101000050284752112001820037200371844331876710010201000020100002003720037111002110910101000010000006402162219786010000102003820038200382003820038
10024200371500028502511968625100101010000101000050284752112001820037200371844331876710010201000020100002003720037111002110910101000010000006402162219786010000102003820038200382003820038
10024200371500000611968625100101010000101000050284752112001820037200371844331876710010201000020100002003720037111002110910101000010000006402162219786010000102003820038200382003820038
10024200371500000611968625100101010000101000050284752112001820037200371844331876710010201000020100002003720037111002110910101000010000006402162219786010000102003820038200382003820038
10024200371500000611968625100101010000101000050284752112001820037200371844331876710010201000020100002003720037111002110910101000010000006402162219786010000102003820038200382003820038
1002420037150003180611968625100101010000101000050284752112001820037200371844331876710010201000020100002003720037111002110910101000010000006402162219786010000102003820038200382003820038

Test 3: throughput

Count: 8

Code:

  sqshl s0, s8, #3
  sqshl s1, s8, #3
  sqshl s2, s8, #3
  sqshl s3, s8, #3
  sqshl s4, s8, #3
  sqshl s5, s8, #3
  sqshl s6, s8, #3
  sqshl s7, s8, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020420038150292580108100800081008002050064013220019200382003899776998980120200800322008003820048200481180201100991001008000010001022251292231120045800001002004820048200482004820049
80204200491506426801161008001610080028500640196200282004820048997610998680128200800382008003820048200491180201100991001008000010001022251281231120045800001002005020049200492005020050
802042004915064278011610080016100800285006401962002820048200489976999868012820080038200800382004820048118020110099100100800001000010222251291231120046800001002005020049200492004920049
80204200481506426801161008001610080028500640196200282004920048997610998680128200800382008003820049200481180201100991001008000010001922251441231120046800001002004820048200482004820048
80204200471502652780100100800001008000050064000020028200472004799716999380100200800002008000020047200471180201100991001008000010000011151202242220044800001002004920050200502004920049
8020420048150642780116100800001008000050064000020028200472004799716999380100200800002008000020047200471180201100991001008000010000011151202242220044800001002004820048200482004820048
8020420047150752780100100800001008000050064000020028200472004799716999380100200800002008000020047200481180201100991001008000010000022251281231120045800001002004920050200502004920050
80204200481506426801161008001610080028500640196200282004820048997699986801282008003820080038200482004911802011009910010080000100003922251281231120045800001002004920049200492004920049
80204200481506426801161008001610080028500640196200282004920048997699986801282008003820080038200482004811802011009910010080000100037022251281231120046800001002004920049200502004920049
80204200481501592680116100800161008002850064019620028200492004899769998680128200800382008003820049200481180201100991001008000010000022251291231120046800001002004920049200502004920049

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2505

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a7 | a8 | a9 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800242005015000901234280010108000010800005064000012001920090200891001412100188030420800982080196200892003811800211091010800001001035020216112003580000102003920039200392003920039
800242003815000003925800101080000108000050640000120019200382003899963100188001020800002080000200382003811800211091010800001000005020116112003580000102034020497204442065020592
800242039716447528792197923181035108056110810685064383612041320545205941008852102298108420810682081167206002059881800211091010800001010251355020116112003580000102003920039200392003920039
80024200381550000210625680575108074510810675064767212049220595204401004349103188098920800002080000200382003811800211091010800001000005020116112003580000102003920039200392003920039
800242003815000603925800101080000108000050640000120019200382003899963100188001020800002080000200382003811800211091010800001000005020116112003580000102003920039200392003920039
8002420038150000094025800101080000108000050640000120019200382003899963100188001020800002080000200382003811800211091010800001000005020216222003580000102003920039200392003920039
8002420038150009070425800101080000108000050640000120019200382003899963100188001020800002080000200382003811800211091010800001000005020116222003580000102003920039200392003920039
800242003815000003925800101080000108000050640000120019200382003899963100188001020800002080000200382003811800211091010800001000005020116112003580000102003920039200392003920039
800242003815000003925800101080000108000050640000120019200382003899963100188001020800002080000200382003811800211091010800001000005020216222003580000102003920039200392003920039
800242003815000003925800101080000108000050640000120019200382003899963100188001020800002080000200382003811800211091010800001000005020216112003580000102003920039200392003920039