Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SRI (vector, 2D)

Test 1: uops

Code:

  sri v0.2d, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 08 | 18 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a0 | a8 | a9 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
1004203716000611687251000100010002646801201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646800201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646801201820372037157231895100010002000203720371110011000000073116111855100020382038203820382038
1004203717000611687251000100010002646800201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203715000611687251000100010002646800201820372037157231895100010002000203720841110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646801201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646801201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646800201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646800201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646800201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  sri v0.2d, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715500000012419687441010010010000100100005002847680020018200372003718425718745105812001000020020000200372003711102011009910010010000100020000071021611198230100001002003820038200382013420038
10204200371560000006119687251010010010000100100005002847680120018200372003718422318854113622001049520020000200372003711102011009910010010000100000000071011621197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002847680020018200372003718422818745101002001016420020000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002847680020018200852003718422318745101002001000020020000200372013121102011009910010010000100000000071011611197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
102042003715600000014819687251010010010023100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100020002055071011611197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002848963020018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
102042003715600000061196872510100100100001001000050028476800200182003720037184221118745101002001000020020000200372003711102011009910010010000100000000173311611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 0a | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100242003715500000001031968725100101010000111000050284768002005420037200371844431876710010201000020200002003720037111002110910101000010000000006404162219785010000102003820038200382003820085
10024200371550000092641241968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038
10024200371550000000611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038
10024200371550000000611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038
10024200371550000000611968725100251010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000003006402162219785010000102003820038200382003820038
100242003716100100002011968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038
10024200371610000000611968725100101010000111045665285152912001820037200371844471876710010201000020200002003720037111002110910101000010000000006402162219785210000102003820038200382003820038
1002420037161000000037171968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038
10024200371550100000611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037311002110910101000010000000006402162219785010000102003820038200382003820038
100242003715500000001491968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038

Test 3: Latency 1->2

Code:

  sri v0.2d, v0.2d, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150000000017419686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000000000011171822400198010100001002003820038200382003820038
1020420037150000000010319686251010010010000100100005002847521200182003720037184287187411010020010008200200002003720037111020110099100100100001000000000011171822422198000100001002003820038200382003820038
1020420037150000000113919686251010010610012100100005002847521200182003720085184287187401010020010008200200162008620085111020110099100100100001000000000011172201600198010100001002003820038200382003820038
1020420037150000012006119686251014510010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000000000011171801600198000100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000000000011171701600198010100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184286187401010020010008200200162003720037111020110099100100100001000000000011171801600198010100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184287187401010020010008200200162003720037111020110099100100100001000000000011171801600198010100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184287187411010020010008200200162003720037111020110099100100100001000000000011171801600198000100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000000000011171801600198000100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184287187401010020010008200200162003720037111020110099100100100001000000000011171701600198000100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | cf | itlb miss (d4) | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100242003715000010319686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001000064008162219856010000102003820038200382003820038
10024200371490006119686251001010100001010000502847521200182013220037184433187671001020100002020000200372017911100211091010100001000364002162219852010000102003820038200382003820038
10024200371500006119686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001000064002162219786010000102003820038200382003820038
10024200371500006119686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001000064002162219786010000102003820038200382003820038
100242003715000010319686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001000064002162219786010000102003820038200382003820038
100242003715000012419686251001010100001010000502847521200182003720037184433188041001020100002020000200372003711100211091010100001000064002162219786010000102003820038200382003820038
10024200371500006119686251001010100001010000502847521200182003720037184433187671001020100002420000200372003711100211091010100001000064002162219786010000102003820038200382003820038
100242003715000010319686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001001248864002162219786010000102003820038200382003820038
100242003715000020819686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001000064002162219786010000102003820038200382003820038
100242003715030018719686251001010100001010000502847521200182003720037184433187671001020101632020000200372003711100211091010100001000364002162219786010000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sri v0.2d, v8.2d, #3
  movi v1.16b, 0
  sri v1.2d, v8.2d, #3
  movi v2.16b, 0
  sri v2.2d, v8.2d, #3
  movi v3.16b, 0
  sri v3.2d, v8.2d, #3
  movi v4.16b, 0
  sri v4.2d, v8.2d, #3
  movi v5.16b, 0
  sri v5.2d, v8.2d, #3
  movi v6.16b, 0
  sri v6.2d, v8.2d, #3
  movi v7.16b, 0
  sri v7.2d, v8.2d, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020420091150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012131612200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012121622200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012141621200621600001002006620066200662006620066
16020420065151292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012031611200621600001002006620066200662006620066
16020420065151292580116100800161008013250064098820044200652006561280128200800282001600562006520065111602011009910010016000010001111012141621200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012151622200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012131621200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012151621200621600001002006620066200662006620066
16020420065151292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012141622200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012141621200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | a9 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002420064150244425800101080000108000050640000112003320052200523218001020800002016000020056200521116002110910101600001000000100283212229212552004915160000102005320053200532005320053
16002420056150014925800101080000108000050640000112003320052200523218001020800002016000020056200521116002110910101600001000000100313111928211442004915160000102005320053200532005320053
1600242005615004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100263111332211432004915160000102005320053200532005320053
1600242005215104425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100263113428211342004915160000102005320053200532005320053
1600242005215104425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100273113528211542004915160000102005320053200532005320053
1600242005215004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100293222828411442004915160000102005320053200532005320053
1600242005215004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100283112228211432004915160000102005320053200532005320053
1600242005215004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001010000100263111828211342004915160000102005320053200532005320053
1600242005215004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100263112128211332004915160000102005320053200532005320053
1600242005215004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100273112028211342004915160000102005320053200532005320053

Test 5: throughput

Count: 16

Code:

  sri v0.2d, v16.2d, #3
  sri v1.2d, v16.2d, #3
  sri v2.2d, v16.2d, #3
  sri v3.2d, v16.2d, #3
  sri v4.2d, v16.2d, #3
  sri v5.2d, v16.2d, #3
  sri v6.2d, v16.2d, #3
  sri v7.2d, v16.2d, #3
  sri v8.2d, v16.2d, #3
  sri v9.2d, v16.2d, #3
  sri v10.2d, v16.2d, #3
  sri v11.2d, v16.2d, #3
  sri v12.2d, v16.2d, #3
  sri v13.2d, v16.2d, #3
  sri v14.2d, v16.2d, #3
  sri v15.2d, v16.2d, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204400603100000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118416114003501600001004003940039400394003940039
160204400382990000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118116104003501600001004003940039400394003940039
160204400383000000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118016004003501600001004003940039400394008940091
160204400382990000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118016004003501600001004003940039400394003940039
16020440038300005070292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118216104003501600001004003940039400394003940039
1602044003830000002192516010810016000810016002050012801321400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118116114003501600001004003940039400394003940039
160204400383000000292516010810016000810016002050012801321400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118116014003501600001004003940039400394003940039
160204400383000000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118116114003501600001004003940039400394003940039
160204400383000000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118016014003501600001004003940039400394003940039
160204400383000000295016010810016000810016002050012801321400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118016114003501600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024400513000000003362516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100003100223115016211202040035155160000104003940039400394003940039
1600244009630100010104452516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000107221410100223114216211212140035155160000104003940039400394003940039
16002440038299000000682516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100000100223114616211211940035155160000104003940039400394003940039
16002440038299000000452516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100000100223113916211221640035155160000104003940039400394003940039
160024400383000000120512516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100100100583114316211232040035155160000104003940039400394014040039
16002440038300001190452516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100000100223113316211242240035155160000104003940039400394003940039
16002440038300000000452516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100100100223113916211232240035155160000104003940039400394003940039
160024400383000000007522516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100100100223115016411142140035155160000104003940039400394003940039
160024400383000000004525160010101600001016000050128000011400190400384003819996320018160010201600002032000040038400381116002110910101600001001001002231192216211222240035155160000104003940039400394003940039
1600244003830000005340452516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100000100223111859211192140035155160000104003940039400394003940039