Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SRI (vector, 16B)

Test 1: uops

Code:

  sri v0.16b, v1.16b, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 0b | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a0 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
1004203715006116872510001000100026468020182037203715723189510001000200020372037111001100000073216111787100020382038203820382038
1004203716106116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203715006116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203715006116872510001000100026468020182037203715723189510001000200020372037111001100000373116111787100020382038203820382038
1004203715006116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203715006116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203715006116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203715006116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203715006116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
1004203715006116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  sri v0.16b, v1.16b, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371500611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000007101161119791100001002003820038200382003820038
102042003715001031968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000007101161119791100001002003820038200382003820038
1020420037150010771968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000007101161119791100001002003820038200382003820038
10204200371500611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000007101161119791100001002003820038200382003820038
10204200371500611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000007101161119791100001002003820038200382003820038
10204200371500611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000007101161119791100001002003820038200382003820038
10204200371500611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000007101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100026307101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100023788307101161119791100001002003820038200382003820038
10204200371500108019687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100026307101161119791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 09 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100242003715000611968725100101010000101000050284768002001820037200371844403187671001020100002020000200372003711100211091010100001046090640316331978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184440318767100102010000202000020037200371110021109101010000104060640316331978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184440318767100102010000202000020037200371110021109101010000100030640316331978510000102003820038200382003820038
10024200371500061196872510010101000010101525028476801200182003720037184440318767100102010000222000020037200371110021109101010000100259330640316331978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476800200182003720037184440318767100102010000202000020037200371110021109101010000101000640316331978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184440318767100102010000202000020037200371110021109101010000102000640316331978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184447318767100102010000202000020037200371110021109101010000105000640316331978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476800200182003720037184440318767100102010000202000020037200371110021109101010000100030640316331978510000102003820038200382003820038
10024200371500061196872510010101002410100005028476800200182003720037184440318767100102010000202000020037200371110021109101010000100060640316331978510000102003820038200382003820038
100242003715000611968725100101010000101000050284768012001820037200371844403187671001020100002020000200372003711100211091010100001027000640316331978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  sri v0.16b, v0.16b, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 0b | 18 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150000012419686251010010010000100100005002847521020018200372003718428071874010100200100082002001620037200371110201100991001001000010000001117171160019800100001002003820038200382003820038
102042003715000006119686251010010010000100100005002847521020018200372003718428061874010100200100082002001620037200371110201100991001001000010000061117180160019800100001002003820038200382003820038
1020420037150000012619686251010010010000100100005002847521120018200372003718428071874010100200100082002001620037200371110201100991001001000010000001117170160019800100001002003820038200382003820038
1020420037150000015019686251010010010000100100005002847521020018200372003718428071874010100200100082002001620037200371110201100991001001000010000031117170160019800100001002003820038200382003820038
10204200371500000429196862510100100100001001000050028475210200182003720037184280718741101002001000820020016200372003711102011009910010010000100010151117180160019800100001002003820038200382003820038
102042003715000006119686251010010010000100100005002847521120018200372003718428071874110100200100082002001620037200371110201100991001001000010001201117180160019800100001002003820038200382003820038
1020420037150000053619686251010010010000100100005002847521120018200372003718428061874110100200100082002001620037200371110201100991001001000010000001117180160019801100001002003820038200382003820038
102042003715000008419686251010010010000100100005002847521120018200372003718428071874010100200100082002001620037200371110201100991001001000010000031117180160019800100001002003820038200382003820038
1020420037150000012619686251010010010000100100005002847521120018200372003718428061874110100200100082002001620037200371110201100991001001000010000031117170160019801100001002003820038200382003820038
1020420037150000012619686251010010010000100100005002847521120018200372003718428071874010100200100082002001620037200371110201100991001001000010000001117170160019801100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002420037150074519686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
10024200371500470196862510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001002400640216221978610000102003820038200382003820038
10024200371500101819686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
1002420037150019319686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
1002420037150061196862510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001001900640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
10024200371500118619686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100030640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sri v0.16b, v8.16b, #3
  movi v1.16b, 0
  sri v1.16b, v8.16b, #3
  movi v2.16b, 0
  sri v2.16b, v8.16b, #3
  movi v3.16b, 0
  sri v3.16b, v8.16b, #3
  movi v4.16b, 0
  sri v4.16b, v8.16b, #3
  movi v5.16b, 0
  sri v5.16b, v8.16b, #3
  movi v6.16b, 0
  sri v6.16b, v8.16b, #3
  movi v7.16b, 0
  sri v7.16b, v8.16b, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2510

retire (01) | cycle (02) | 03 | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | 74 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020420088150186258011610080016100800285006401961200442006520065061280128200800282001600562006520065111602011009910010016000010001111011901600200621600001002006620066200662006620066
1602042006515029258011610080016100800285006401961200442006520065061280128200800282001600562006520065111602011009910010016000010001111011901600200621600001002006620066200662006620066
16020420065150486258011610080016100800285006401960200442006520065061280128200800282001600562006520065111602011009910010016000010001111011901600200621600001002006620066200662006620155
1602042006515029258011610080016100800285006401961200442006520065061280128200800282001600562006520065111602011009910010016000010001111011901600200621600001002006620066200662006620066
1602042006515029258011610080016100800285006401960200442006520065061280128200800282001600562006520065111602011009910010016000010001111011901600200621600001002006620066200662006620066
1602042006515032258011610080016100800285006401961200442006520065061280128200800282001600562006520065111602011009910010016000010001111011901600200621600001002006620066200662006620066
1602042006515092258011610080016100800285006401960200442006520065061280128200800282001600562006520065111602011009910010016000010002221013112311200731600001002015420077200772007720078
16020420076151642880119100800191008003150064022002005620076200760101080131200800312001600622007620077111602011009910010016000010002221013012311200731600001002016620077200772007720077
1602042007715164298011910080019100800315006402201200562007720076091080131200800312001600622007620077111602011009910010016000010002221013112311200731600001002007820077200772007720077
16020420076151127298011910080019100800315006402201200562007620076091080131200800312001600622007720076111602011009910010016000010002221013112311200731600001002007720077200772007720077

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024200681500000003532580010108000010800005064000011520026200452004932180010208000020160000200452004511160021109101016000010200100398112020211181820270150160000102004620046200462005020048
16002420045150000000712580010108000010800005064000011520026200452004532180010208000020160000200452004511160021109101016000010000100438321722221191820195160160000102004620046200462004820046
160024200471500000002162580010108000010800005064000011520028200472004732180010208000020160000200452004511160021109101016000010000100408311720111182120229160160000102004620046200462004620046
16002420045150000000442580010108000010800005064000011520028200452004532180010208000020160000200452004511160021109101016000010000100408511922211181920178150160000102004820046200462004620048
16002420045150000001442580010108000010800005064000001520028200472004532180010208000020160000200452004511160021109101016000010000100418511920111181420174150160000102004620046200462004620048
16002420049150000000442580010108000010800005064000011520028200452004532180010208000020160000200452004511160021109101016000010000100428512120211171220190160160000102004620046200462005220046
160024200471510000002392580010108000010800005064000011520028200452004532180010208000020160000200452004511160021109101016000010120100648511720211141820345150160000102004620046200462004620046
160024200451500000004425800101080000108029650640800115200262004520045321800102080000201600002004720045111600211091010160000100001004011511920211191720197160160000102004620046200482011620046
16002420047150000000442580010108000010800005064000011520028200452004532180010208000020160000200452004511160021109101016000010000100428311820111201320188150160000102004620046200482004820046
160024200451500000003252580010108000010800005064000011520026200452004532180010208000020160000200452004711160021109101016000010000100388311520211191920204150160000102004620046200462004820048

Test 5: throughput

Count: 16

Code:

  sri v0.16b, v16.16b, #3
  sri v1.16b, v16.16b, #3
  sri v2.16b, v16.16b, #3
  sri v3.16b, v16.16b, #3
  sri v4.16b, v16.16b, #3
  sri v5.16b, v16.16b, #3
  sri v6.16b, v16.16b, #3
  sri v7.16b, v16.16b, #3
  sri v8.16b, v16.16b, #3
  sri v9.16b, v16.16b, #3
  sri v10.16b, v16.16b, #3
  sri v11.16b, v16.16b, #3
  sri v12.16b, v16.16b, #3
  sri v13.16b, v16.16b, #3
  sri v14.16b, v16.16b, #3
  sri v15.16b, v16.16b, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a7 | a8 | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602044005930002925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000011110118116400351600001004003940039400394003940039
16020440038299021925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000021110118016400351600001004003940039400394003940039
1602044003830007125160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000011110118016400351600001004003940039400394003940039
1602044003829902925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000011110118016400351600001004003940039400394003940039
1602044003830002925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000011110118016400351600001004003940039400394003940039
1602044003830002925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000011110118016400351600001004003940039400394003940039
1602044003829902925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000011110118016400351600001004003940039400394003940039
1602044003830002925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000011110118016400351600001004003940039400394003940039
1602044003829902925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000011110118116400351600001004003940039400394003940039
1602044003830002925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100000011110118016400351600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | a9 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600244005130000000738251600101016000010160000501280000115400190400384003819996032001816001020160000203200004003840038111600211091010160000100000100228221216225111640035305160000104003940039400394003940039
1600244003830000000452516001010160000101600005012800001154001904003840038199960320018160010201600002032000040038400381116002110910101600001000001002284241642325540035155160000104003940039400394003940039
160024400383000000045251600101016000010160000501280000115400190400384003819996032001816001020160000203200004003840038111600211091010160000100004210022841316424153400351510160000104003940039400394003940039
16002440038300000005125160010101600001016000050128000001540019040038400381999603200181600102016000020320000400384003811160021109101016000010000010022841516226195400351510160000104003940039400394003940039
16002440038300000004525160010101600001016000050128000011540019040038400381999603200181600102016000020320000400384003811160021109101016000010000010022114151642414940035155160000104003940039400394003940039
160024400383000000071025160010101600001016000050128000011540019040038400381999603200181600102016000020320000400384003811160021109101016000010000010022851516421246400351510160000104003940039400394003940039
1600244003829900000273025160010101600001016000050128000001540019040038400381999603200181600102016000020320000400384003811160021109101016000010000010022851316222291040035155160000104003940039400394003940039
16002440038299000004525160010101600001016000050128000011540019040038400381999603200181600102016000020320000400384003811160021109101016000010000010022841616428131240035155160000104003940039400394003940039
16002440038300000004525160010101600001016000050128000011540019340038400381999603200181600102016000020320000400384003811160021109101016000010000010024115151622625640035155160000104003940039400394003940039
1600244003830000000452516001010160000101600005012800000154001904003840038199960320018160010201600002032000040038400381116002110910101600001000001002484181622523540035305160000104003940039400394003940039