Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SLI (scalar, D)

Test 1: uops

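Code (only the sli line is in the measured body, cf. "Retires: 1.000" per unroll below; the movi lines appear to be one-time register initialization):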

  sli d0, d1, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
1004203715006116872510001000100026468012022203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037150306116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715008216872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203716106116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
10042037150246116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715008216872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038

Test 2: Latency 1->1

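Code (only the sli line is in the measured loop body; the movi lines appear to be one-time register initialization):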

  sli d0, d1, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150061196872510100100100001001000050028476800200182003720037184223187451010020010000200200002003720037111020110099100100100001000007101161119791100001002003820038200382003820038
1020420037150061196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000007101161119791100001002003820038200382003820038
1020420037150061196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000007101161119791100001002003820038200382003820038
1020420037150061196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000107101161119791100001002003820038200382003820038
1020420037150061196872510100100100001001000050028476801200182003720037184223187451027820010000200200002003720037111020110099100100100001000107101161119791100001002003820038200382003820038
1020420037150061196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000107101161119791100001002003820038200382003820038
10204200371500611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010003007101161119791100001002003820038200382003820038
1020420037150061196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000007101161119791100001002003820038200382003820038
1020420037150061196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000007101161119791100001002003820038200382003820038
1020420037150061196872510100100100001001000050028476801200182003720037184223187451010020010000200200002003720037111020110099100100100001000607101161119791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002420037150961196872510010101000013100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001030640316341978510000102003820038200382003820038
10024200371503361196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001040640316331978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001010640316331978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001010640316331978510000102003820038200382003820038
10024200371500251196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001080640316331978510000102003820038200382003820038
1002420037150061196878010010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001020640316331978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476800200182003720037184443187671001020100002020000200372003711100211091010100001000640316331978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476800200182003720037184443187671001020100002020000200372003711100211091010100001000640316331985010000102003820038200382003820038
1002420037150061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001020640316331978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640316331978510000102003820038200382003820038

Test 3: Latency 1->2

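Code (only the sli line is in the measured loop body; the movi line appears to be one-time register initialization):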

  sli d0, d0, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371500061196862510100100100001001000050028475211200182003720037184280618741101002001000820020016200372003711102011009910010010000100001117222242219787100001002003820038200382003820038
10204200371506197196862510100100100001001000050028475210200182003720037184090618733101002001000020020000200372003711102011009910010010000100001117222242219787100001002003820038200382003820038
10204200371500061196862510100100100001001000050028475210200182003720037184280618741101002001000820020016200372003711102011009910010010000100001117180160019801100001002003820038200382003820038
10204200371500061196862510100100100001001000050028475210200182003720037184280618740101002001000820020016200372003711102011009910010010000100001117180160019800100001002003820038200382003820038
10204200371500061196862510100100100001001000050028475210200182003720037184280718741101002001000820020016200372003711102011009910010010000100131117180160019800100001002003820038200382003820038
10204200371500061196862510100100100001001000050028475210200182003720037184280718740101002001000820020016200372003711102011009910010010000100001117170160019800100001002003820038200382003820038
102042003714900611968625101001001000010010000500284752102001820037200371842806187411010020010008200200162003720037111020110099100100100001000121117180160019801100001002003820038200382003820038
10204200371500061196862510100100100001001000050028475210200182003720037184280618740101002001000820020016200372003711102011009910010010000100001117170160019800100001002003820038200382003820038
10204200371500061196862510100100100001001000050028475210200182003720037184280718740101002001000820020016200372003711102011009910010010000100421201117170160019800100001002003820038200382003820038
10204200371500061196862510100100100001001000050028475210200182003720037184280618740101002001000820020016200372003711102011009910010010000100001117170160019800100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500000072619686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100001640216221978610000102003820038200382003820038
1002420037150000006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
1002420037150000906119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
1002420037150000006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
1002420037150000006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
1002420037156000006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
1002420037150000006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038
1002420037150000006119686251001010100001010000502847521020018200852003718443318767100102010000202000020037200371110021109101010000100090640216221978610000102003820038200382003820038
1002420037150000006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100030640216221978610000102003820038200382003820038
1002420037150000006119686251001010100001010000502847521020018200372003718443318767100102010000202000020037200371110021109101010000100000640216221978610000102003820038200382003820038

Test 4: throughput

Count: 8

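Code (the eight movi/sli pairs form the measured loop body; the final movi v8 line appears to be one-time register initialization):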

  movi v0.16b, 0
  sli d0, d8, #3
  movi v1.16b, 0
  sli d1, d8, #3
  movi v2.16b, 0
  sli d2, d8, #3
  movi v3.16b, 0
  sli d3, d8, #3
  movi v4.16b, 0
  sli d4, d8, #3
  movi v5.16b, 0
  sli d5, d8, #3
  movi v6.16b, 0
  sli d6, d8, #3
  movi v7.16b, 0
  sli d7, d8, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2510

retire (01) | cycle (02) | 03 | 07 | 0a | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020420091150119292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000401111012131633200621600001002006620066200662006620066
16020420065150110292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000001111012141632200621600001002006620066200662006620066
16020420065150110292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000001111012131633200621600001002006620066200662006620066
160204200651501102925801161008001610080028500640196020044200652006511128012820080028200160056200652006511160201100991001001600001002001111012131633200621600001002006620066200662006620066
16020420065150110292580116100800161008002850064019602004420065201326128012820080028200160056200652006511160201100991001001600001000001111012131633200621600001002006620066200662006620066
16020420065150110292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000001111012131633200621600001002006620066200662006620066
16020420065151110292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000001111012131634200621600001002006620066200662006620066
16020420065150110292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000001111012131633200621600001002006620066200662006620066
160204200651501102925803181008001610080028500640196020044200652006561280128200800282001600562006520065111602011009910010016000010000151111012131633200621600001002006620066200662006620066
160204200651501102925801161008001610080028500640196020044200652006561280128200800282001600562006520065111602011009910010016000010004701111012131633200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 07 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cd | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024200651500044258001010800001080000506400001102002620045200453218001020800002016000020045200451116002110910101600001007801004431101120211209200421557160000102004620046200462004620046
16002420045150004425800101080000108000050640000110200262004520045321800102080000201600002004520045111600211091010160000100001003331102020211920200421538160000102004620046200462004620046
16002420045150004425800101080000108000050640000110200262004520045321800102080000201600002004520045111600211091010160000100001004331107202112121200421528160000102004620046200462004620046
1600242004515000442580010108000010800005064000011020026200452004532180010208000020160000200452004911160021109101016000010000100473220720222218200423022160000102004620046200462004620046
16002420045150005025800101080000108000050640000010200262004920049321800102080000201600002004520045111600211091010160000100001004761102020412821200421518160000102004620050200502004620046
160024200451500044258001010800001080000506400001102002620045200453218001020800002016000020045200451116002110910101600001000010044311020202111021200421513160000102004620046200462004620046
160024200451500044258001010800001080000506400001102002620045200453218001020800002016000020045200451116002110910101600001000010044311020202112121200421513160000102004620046200462004620046
160024200491500044258001010800001080000506400001102002620045200453218001020800002016000020045200451116002110910101600001000010044311020202211018200421513160000102004620046200462004620046
16002420045150004425800101080000108000050640000010200262004520049321800102080000201600002004520045111600211091010160000100001004431102020211218200421513160000102004620050200462004620050
160024200491560050258001010800001080000506400001102002620045200453218001020800002016000020045200451116002110910101600001000010044621018202112121200461514160000102004620046200462004620046

Test 5: throughput

Count: 16

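Code (the sixteen sli lines form the measured loop body; the final movi v16 line appears to be one-time register initialization):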

  sli d0, d16, #3
  sli d1, d16, #3
  sli d2, d16, #3
  sli d3, d16, #3
  sli d4, d16, #3
  sli d5, d16, #3
  sli d6, d16, #3
  sli d7, d16, #3
  sli d8, d16, #3
  sli d9, d16, #3
  sli d10, d16, #3
  sli d11, d16, #3
  sli d12, d16, #3
  sli d13, d16, #3
  sli d14, d16, #3
  sli d15, d16, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 09 | 18 | 19 | 1e | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204400583000100006092516010810016000810016002050012801321400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000001111011801600400351600001004003940039400394003940039
16020440038300000000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000001111011801600400351600001004003940039400394003940039
160204400383000000002202516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000021111011801600400351600001004003940039400394003940039
160204400383000000002182516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000001111011801600400351600001004003940039400394003940039
16020440038300100000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000001111011801600400351600001004003940039400394003940236
16020440038300000000712516010810016000810016002050012801321400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000001111011801600400351600001004003940039400394003940039
160204400383000000002202516010810016000810016002050012801321400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000001111011801600400351600001004003940039400394003940039
160204400383000000001132516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000001111011801600400351600001004003940039400394003940039
160204400383000000007572516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000001111011801600400351600001004003940039400394003940039
160204400383000000002202516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000000001111011801600400351600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 0b | 1e | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002440051300000391251600101016000010160000501280000014001940038400381999603200181600102016000020320000400384003811160021109101016000010000010023311101621172040056155160000104003940039400394003940039
160024400383000002172516001010160000101600005012800001040019400384003819996032001816001020160000203200004003840038111600211091010160000100000100236111816211192040061155160000104003940039400394003940039
160024400383000001502516001010160000101600005012800001140019400384003819996032001816001020160000203200004003840038111600211091010160000100000100223111816211182040059155160000104003940039400394003940039
1600244003830000045251600101016000010160000501280000114001940038400381999603200181600102016000020320000400384003811160021109101016000010000010022311616211189400591510160000104003940039400394003940039
16002440038299000452516001010160000101600005012800001140019400384003819996032001816001020160000203200004003840038111600211091010160000100000100223111816211182040059155160000104003940039400394003940039
160024400382990004525160010101600001016000050128000011400194003840038199960320018160010201600002032000040038400381116002110910101600001000001002231161621118740059155160000104003940039400394003940039
16002440038300001108251600101016000010160000501280000114001940038400381999603200181600102016000020320000400384003811160021109101016000010000010022311181621162040062155160000104003940039400394003940039
1600244003830000051251600101016000010160000501280000114001940038400381999603200181600102016000020320000400384003811160021109101016000010000010022311181621182040061155160000104003940039400394003940039
16002440038300000452516001010160000101600005012800001140019400384003819996032001816001020160000203200004003840038111600211091010160000100000100243111816211187400591510160000104008940039400394003940039
160024400383000005125160010101600001016000050128000001400194003840038199960320018160010201600002032000040038400381116002110910101600001000001002462218164221820400583010160000104003940039400394003940039