Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SRI (vector, 8B)

Test 1: uops

Code:

  sri v0.8b, v1.8b, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10042037160024516872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037150027516872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203716006116872510001000100026468002018203720371574718951000100020002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037150061168725100010001000264680120182037203715723189510001000200020372037111001100001873116111787100020382038203820382038
10042037159346116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100020002037203711100110000095116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  sri v0.8b, v1.8b, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715000124196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476800200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000371011611197910100001002003820038200382003820038
10204200371500082196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
10204200371500061196872510100100100001001000050028489631200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
102042003715000273196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
102042003715000567196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
102042003715000212196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038
102042003715000124196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000071011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500002161968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
1002420037150000841968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
100242003715000151391968725100101110000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
10024200371500001261968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
1002420037150000611968725100101010000101000050284768002001820037200371844431876710162201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
10024200371500001051968725100101010000101000050284768012001820037200371844431876710010201000020200002003720084111002110910101000010106402162219785010000102003820038200382003820038
1002420037150000611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
100242003715000072619687251001010100001010000502847680120018200372003718444318767100102010000202000020037200371110021109101010000100396402162219785010000102003820038200382003820038
1002420037150000611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038
10024200371500001051968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010006402162219785010000102003820038200382003820038

Test 3: Latency 1->2

Code:

  sri v0.8b, v0.8b, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 04 | 07 | 0a | 0b | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715001100016819686251010010010000100100005002847521120018020037200371842861874010100200100082002001620037200371110201100991001001000010000011171811611198050100001002003820038200382003820038
10204200371500110006119686251010010010000100100005002847521020018020037200371842861874010100200100082002001620037200371110201100991001001000010000011171811611198050100001002003820038200382003820038
10204200371500110008219686251010010010000100100005002847521020018020037200371842861874110100200100082002001620037200371110201100991001001000010000011171811611198050100001002003820038200382003820038
10204200371500110006119686251010010010000100100005002847521120018020037200371842871874110100200100082002001620037200371110201100991001001000010000011171811611198040100001002003820038200382003820038
10204200371500110006119686251010010010000100100005002847521020018020037200371842871874110100200100082002001620037200371110201100991001001000010000011174011611198040100001002003820038200382003820038
10204200371500110006119686251010010010000100100005002847521020018020037200371842861874010100200100082002001620037200371110201100991001001000010000011171811611198040100001002003820038200382003820038
102042003715001100021219686251010010010000100100005002847521020018020037200371842861874110100200100082002001620037200371110201100991001001000010000011171711611198050100001002013320038200382003820038
10204200371490110006119686251010010010000100100005002847521020018020037200371842871874110256200100082002001620037200371110201100991001001000010000011171711611198040100001002003820038200382003820038
10204200371490110006119686251010010010000100100005002847521020018020037200371842861874010100200100082002001620037200371110201100991001001000010000011171711611198050100001002003820038200382003820038
10204200371500110006119686251010010010000100100005002847521020018020037200371842861874110100200100082002001620037200371110201100991001001000010000011171711611198040100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500101241968625100101010000101000050284752102001802003720037184433187671001020100002020000200372003711100211091010100001010640216221978610000102003820038200382003820038
100242003715000015671968625100101010000101000050284752102001802003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000611968625100101010000101000050284752102001802003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000841968625100101010000101000050284752102001802003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000821968625100101010000101000050284752102001802003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000611968625100101010000101000050284752102001802003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
10024200371500006121968625100101010000101000050284752102001802003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000611968625100101010000101000050284752102001802003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000611968625100101010000101000050284752102001802003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
100242003715000012691968625100101010000101000050284752102001802003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sri v0.8b, v8.8b, #3
  movi v1.16b, 0
  sri v1.8b, v8.8b, #3
  movi v2.16b, 0
  sri v2.8b, v8.8b, #3
  movi v3.16b, 0
  sri v3.8b, v8.8b, #3
  movi v4.16b, 0
  sri v4.8b, v8.8b, #3
  movi v5.16b, 0
  sri v5.8b, v8.8b, #3
  movi v6.16b, 0
  sri v6.8b, v8.8b, #3
  movi v7.16b, 0
  sri v7.8b, v8.8b, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d6 | dd | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602042008815000000810184258011610080092100800285006401961200442006520065612801282008002820016005620065200651116020110099100100160000100000000111101191602006201600001002006620066201332006620066
1602042017415101000414029258011610080016100800285006401961200442006520065612801282008002820016005620065200651116020110099100100160000100000000111101191602006201600001002006620066200662006620066
16020420065150000000029258011610080016100800285006401961200442006520065612801282008002820016005620065200651116020110099100100160000100000000111101191602006201600001002006620066200662006620066
16020420065150000000029258011610080016100800285006401961200442006520065612801282008002820016005620065200651116020110099100100160000100000000111101191602006201600001002015720066200662006620066
16020420065150000000029258011610080016100800285006401961200442006520065612801282008002820016005620065200651116020110099100100160000100000000111101191602006201600001002006620066200662006620066
16020420065150000000029258011610080016100800285006401961200442006520065612801282008002820016005620065200652116020110099100100160000100000000111101191602006201600001002006620066200662006620066
16020420065150000000029258011610080016100800285006401961200442006520065612801282008002820016005620065200651116020110099100100160000100000000111101191602006201600001002006620066200662006620066
16020420065150000000029258011610080016100800285006401961200442006520065612801282008002820016005620065200651116020110099100100160000100000000111101191602006201600001002006620066200662006620066
16020420065150000000029258011610080016100800285006401961200442006520065612801282008002820016005620065200651116020110099100100160000100000000111101191602006201600001002006620066200662006620066
16020420065151000000029258011610080016100800285006401961200442006520065612801282008002820016005620065200651116020110099100100160000100000000111101191602006201600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | ac | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002420067150044258001010800001080000506400001102002620045200453218001020800002016000020045200451116002110910101600001000100268212202422342004215160000102004620046200462004620046
160024200451500366258001010800001080000506400001152002620045200453218001020800002016000020045200451116002110910101600001000100258413202392322004215160000102004620046200462004620046
16002420045150044258001010800001080000506400001152002620045200453218001020800002016000020045200451116002110910101600001000100258413202342232004230160000102004620046200462004620046
160024200451500652580010108000010800005064000001520030200492014832180010208000020160000200452004511160021109101016000010001002911523242372232004215160000102004620046200462004620046
160024200451500139258001010800001080000506400001152002620113200453218001020800002016000020045200451116002110910101600001000100258412202374232004215160000102004620046200462004620046
16002420045150044258001010800001080000506400000152002620045200453218001020800002016000020045200451116002110910101600001000100268413244404322004215160000102004620046200462004620046
16002420045150044258001010800001080000506400001152002620045200453218001020800002016000020045200451116002110910101600001060100268412202323332004215160000102004620046200462004620046
16002420045150044258001010800001080000506400001152002620045200453218001020800002016000020045200451116002110910101600001000100268412202332232004215160000102004620046200462004620046
16002420045150044258001010800001080000506400001152002620045200453218001020800002016000020045200451116002110910101600001000100268413202314322004215160000102004620046200462004620046
16002420045150044258001010800001080000506400001152002620045200453218001020800002016000020045200451116002110910101600001000100268413202294322004215160000102004620046200462004620046

Test 5: throughput

Count: 16

Code:

  sri v0.8b, v16.8b, #3
  sri v1.8b, v16.8b, #3
  sri v2.8b, v16.8b, #3
  sri v3.8b, v16.8b, #3
  sri v4.8b, v16.8b, #3
  sri v5.8b, v16.8b, #3
  sri v6.8b, v16.8b, #3
  sri v7.8b, v16.8b, #3
  sri v8.8b, v16.8b, #3
  sri v9.8b, v16.8b, #3
  sri v10.8b, v16.8b, #3
  sri v11.8b, v16.8b, #3
  sri v12.8b, v16.8b, #3
  sri v13.8b, v16.8b, #3
  sri v14.8b, v16.8b, #3
  sri v15.8b, v16.8b, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020440057299000000029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100000000011110136016004003501600001004003940039400394003940039
160204400383000000000117251601081001600081001600205001280132140019340038400381997761998916012020016003220032006440038400381116020110099100100160000100000000011110118016004003501600001004003940039400394003940039
16020440038300000000029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100000000011110118016004003501600001004003940039400394003940039
1602044003830000003990029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100000000011110118016004003501600001004003940039400394003940039
1602044003830000004470029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100000000011110118016004003501600001004003940039400394003940039
1602044003830000001230029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100000000011110118016004003501600001004003940039400394003940039
160204400383000000330029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100000000011110118016004003501600001004003940039400394003940039
160204400383000000630029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100000000011110118016004003501600001004003940039400394003940039
1602044003830000002610029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100000000011110118016004003501600001004003940039400394003940039
160204400383000000990029251601081001600081001600205001280132140019040038400381997761998916012020016003220032006440038400381116020110099100100160000100000000011110118016004003501600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 09 | 0a | 0b | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c9 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002440051300000000507251600101016000010160000501280000115400194003840038199960320018160010201600002032000040038400381116002110910101600001000000000100228112223211794003520160160000104003940039400394003940039
1600244003830000000045251600101016000010160000501280000115400194003840038199960320018160010201600002032000040038400381116002110910101600001000000000100228516162111615400352080160000104003940039400394003940039
160024400383000000004525160010101600001016000050128000011540019400894003820005010200451601062016009720320196400384003811160021109101016000010223100001002285112212211412400352080160000104016840118401204017040104
16002440102301110105986625160010101600001016000050128000011540019400384003819996032001816001020160000203200004003840038111600211091010160000100000000010022851122121114134003520160160000104003940039400394003940039
16002440038300000000452516001010160000101600005012800001154001940038400381999603200181600102016000020320000400384003821160021109101016000010000000001002411511344211815400352080160000104003940039400394003940039
1600244003829900000045251600101016000010160000501280000115400194003840038199960320018160010201600002032000040038400381116002110910101600001000001000100228519212211213400352080160000104003940039400394003940039
16002440038300000000452516001010160000101600005012800001154001940038400381999603200181600102016000020320000400384003811160021109101016000010000000001002485182121159400352080160000104003940039400394003940039
16002440038300000000512516008810160000101600005012800001154001940038400381999603200181600102016000020320000400384003811160021109101016000010000000001002211517214111010400352080160000104003940039400394003940039
16002440038300000000452516001010160000101600005012800001154001940038400381999673200181600102016000020320000400384003811160021109101016000010000060001002285192121196400352070160000104003940039400394003940039
1600244003834900000045251600101016000010160000501280000115400194003840038199960320018160010201600002032000040038400381116002110910101600001000000000100228616214111074003520160160000104003940039400394003940039