Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SLI (vector, 16B)

Test 1: uops

Code:

  sli v0.16b, v1.16b, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371606116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  sli v0.16b, v1.16b, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715000000006119687251010010010000100100005002847680200182003720037184223187451010020010000200200002003720037111020110099100100100001000000000071011611197910100001002003820038200382003820038
102042003715000000006119687251010010010000100100005002847680200182003720037184223187451010020010000200200002003720037111020110099100100100001000000000071011611197910100001002003820038200382003820038
102042003715000000006119687441010010610000100100005002847680200182003720037184223187451010020010000200200002003720037111020110099100100100001000000000071011611197910100001002003820038200382003820038
1020420037150000021006119687251010010010000100100005002847680200182003720037184223187451010020010000200200002003720037111020110099100100100001000000000071011611197910100001002003820038200382003820038
102042003715000000006119687251010010010000100100005002847680200182003720037184223187451010020010000200200002003720037111020110099100100100001000000000071011611197910100001002003820038200382003820038
102042003715000000006119687251010010010000100100005002847680200182003720037184223187451010020010000200200002003720037111020110099100100100001000000000071011611197910100001002003820038200382003820038
102042003715000000006119687251010010010000100100005002847680200182003720037184223187451010020010000200200002003720037111020110099100100100001000000000071011611197910100001002003820038200382003820038
102042003715000000006119687251010010010000100100005002847680200182003720037184223187451010020010000200200002003720037111020110099100100100001000000000071011611197910100001002003820038200382003820038
102042003715000000006119687251010010010000100100005002847680200182003720037184223187451010020010000200200002003720037111020110099100100100001000000000071011611197910100001002003820038200382003820038
1020420037150000018006119687251010010010000100100005002847680200182003720037184223187451010020010000200200002003720037111020110099100100100001000000000071011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768012001820037200371844431876710010201017220200002003720037111002110910101000010000640216221978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010000640216221978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  sli v0.16b, v0.16b, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 0b | 18 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150000061196862510100100100001001000050028475210200182003720037184287187411010020010008200200162003720037111020110099100100100001006391117180160019801100001002003820038200382003820038
102042003715000006119686251010010010000100100005002847521120018200372003718428718740101002001000820020016200372003711102011009910010010000100931117170160019801100001002003820038200382003820038
1020420037150000061196862510100100100001001000050028475210200182003720037184286187401010020010008200200162003720037111020110099100100100001004701117170160019801100001002003820038200382003820038
102042003715000006119686251010010010000100100005002847521120018200372003718428718741101002001000820020016200372003711102011009910010010000100261117170160019800100001002003820038200382003820038
1020420037150010061196862510100100100001001000050028475211200182003720037184287187411010020010008200200162003720037111020110099100100100001000661117180160019801100001002003820038200382003820038
102042003715000006119686251010010010000100100005002847521020018200372003718428618740101002001000820020016200372003711102011009910010010000100431117180160019801100001002003820038200382003820038
102042003715000006119686251010010010000100100005002847521020018200372003718428718741101002001000820020016200372003711102011009910010010000100401117180160019801100001002003820038200382003820038
10204200371500000611968625101001001000010010000500284752102001820037200371842861874010100200100082002001620037200371110201100991001001000010023841117180160019801100001002003820038200382003820038
102042003715000006119686251010010010000100100005002847521120018200372003718428718741101002001000820020016200372003711102011009910010010000100420081117170160019801100001002003820038200382003820038
10204200371500000156196862510100100100001001000050028475211200182003720037184286187411010020010008200200162003720037111020110099100100100001000421117180160019801100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002420084151100000103196862510010101000010100005028475210200182003720037184433187671001020100002020000200372003711100211091010100001000000105006403163319786010000102003820038200382003820038
1002420037150000000611968625100101010000101000050284752102001820037200371844331876710010201000020200002003720037111002110910101000010000003006403163319786010000102003820038200382003820038
1002420037150000000821968625100101010000101000050284752102001820037200371844331876710010201000020200002003720037111002110910101000010000000006403163319786010000102003820038200382003820038
1002420037150000000611968625100101010000101000050284752102001820037200371844331876710010201000020200002003720037111002110910101000010000000006403163319786010000102003820038200382003820038
1002420037150000000611968625100101010000101000050284752102001820037200371844331876710010201000020200002003720037111002110910101000010000000006403163319786010000102003820038200382003820038
1002420037150000000611968625100101010000101000050284752102001820037200371844331876710010201000020200002003720037111002110910101000010000503006403163319786010000102003820038200382003820038
1002420037150000000611968625100101010000101000050284752112001820037200371844331876710010201000020200002003720037111002110910101000010000003006403163319786010000102003820038200382003820038
1002420037149000000611968625100101010000101000050284752112001820037200371844331876710010201000020200002003720037111002110910101000010000000006403163319786010000102003820038200382003820038
1002420037150000060611968625100101010000101000050284752112001820037200371844331876710010201000020200002003720037111002110910101000010000000006403163319786010000102003820038200382003820038
10024200371500000006119686251001010100001010000502847521120018200372003718443318767100102010000202000020037200371110021109101010000100002700006403163319786010000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sli v0.16b, v8.16b, #3
  movi v1.16b, 0
  sli v1.16b, v8.16b, #3
  movi v2.16b, 0
  sli v2.16b, v8.16b, #3
  movi v3.16b, 0
  sli v3.16b, v8.16b, #3
  movi v4.16b, 0
  sli v4.16b, v8.16b, #3
  movi v5.16b, 0
  sli v5.16b, v8.16b, #3
  movi v6.16b, 0
  sli v6.16b, v8.16b, #3
  movi v7.16b, 0
  sli v7.16b, v8.16b, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d6 | dd | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204200891510292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001000011110119160200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196020044200652006561280128200800282001600562006520065111602011009910010016000010026311110119160200621600001002006620066200662006620066
160204200651500292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001002011110119160200621600001002006620066200662006620066
160204200651500292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001002011110119160200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196120044201312006561280128200800282001600562006520065111602011009910010016000010001511110119160200621600001002006620066200662006620066
160204200651500292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001000911110119160200621600001002006620066200662006620066
160204200651500292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001005011110119160200621600001002006620066200662006620066
160204200651510292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001003911110119160200621600001002006620066200662006620066
160204200651500502580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001007011110119160200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401961200442006520065612801282008002820016005620065200651116020110099100100160000100169911110119160200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 08 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024200951500044318001010800001080000506400001102003120050200503218001020800002016000020050200501116002110910101600001000001003382210252119920047201160000102005120051200512021220060
16002420059151005027800101080297108000050640000015200402005020050321800102080000201600002005020050111600211091010160000102129100338515252115920047201160000102005120051200512022520051
160024200501500044278001010800001080000506400000152004020059200593218001020800002016000020059200591116002110910101600001001001004011629344229620056402160000102006020060200602020820051
16002420050150004427800101080000108000050640000115200312005020050321800102080000201600002005020050111600211091010160000100100100328515252119620047201160000102005120051200602020620051
16002420059150005027800101080000108000050640000115200312005020050321800102080000201600002005020050111600211091010160000100100100298519252119520047201160000102005120051200512021320051
160024200501500044278001010800001080000506400001152003120050200503218001020800002016000020050200501116002110910101600001004003100298514252119920047201160000102005120051200512018920051
1600242005015000442780010108000010800005064000011520031200502005032180010208000020160000200502005011160021109101016000010000271002885114252118820047201160000102005120051200512020420051
1600242005015000442780010108000010800005064000011520031200502005032180010208000020160000200502005011160021109101016000010040001002885113252118520047201160000102005120051200512020120051
160024200501500044298001010800001080000506400000152004020059200593218001020800002016000020059200501116002110910101600001002001007411626344225920056402160000102006020060200602020120060
16002420059150005029800101080000108000050640000115200402005920059321800102080000201600002005920050111600211091010160000100100100318519252119920047201160000102005120051200512020420051

Test 5: throughput

Count: 16

Code:

  sli v0.16b, v16.16b, #3
  sli v1.16b, v16.16b, #3
  sli v2.16b, v16.16b, #3
  sli v3.16b, v16.16b, #3
  sli v4.16b, v16.16b, #3
  sli v5.16b, v16.16b, #3
  sli v6.16b, v16.16b, #3
  sli v7.16b, v16.16b, #3
  sli v8.16b, v16.16b, #3
  sli v9.16b, v16.16b, #3
  sli v10.16b, v16.16b, #3
  sli v11.16b, v16.16b, #3
  sli v12.16b, v16.16b, #3
  sli v13.16b, v16.16b, #3
  sli v14.16b, v16.16b, #3
  sli v15.16b, v16.16b, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020440060300029251601081001600081001600205001280132140019400384003819977619989160120200160032200320064400384003811160201100991001001600001002121111011811611400351600001004003940039400394003940039
1602044003829907125160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100401111013611611400351600001004003940039400394003940039
1602044003829975292516010810016000810016002050012809361400194003840038199776199891601202001601322003200644003840038111602011009910010016000010001891111011811611400351600001004003940039400394003940039
1602044003830002925160108100160008100160020500128013204001940038400381997761998916012020016003220032006440038400381116020110099100100160000100401111011811611400351600001004003940039400394003940039
160204400383001229251601081001601101001600205001280132140061401404003819995101998916022420016003220032006440038400381116020110099100100160000100101111011811611400351600001004003940039400394003940039
160204400383000114525160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100131111011811611400351600001004003940039400394003940039
1602044003830005025160108100160008100160020500128013204001940038400381997761998916012020016003220032006440038400381116020110099100100160000100561111011811611400351600001004003940039400394003940039
1602044003830002925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100001111011811611400351600001004003940039400394003940039
1602044003829902925160108100160008100160020500128013214001940038400381997761998916012020016003220032006440038400381116020110099100100160000100001111011811711400351600001004003940039400394003940085
1602044003830002925160108100160008100160020500128013204001940038400381997761998916012020016003220032006440038400381116020110099100100160000100001111011811611400351600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002440051300000045251600101016000010160000501280000115400194003840038199963200181600102016000020320000400384003811160021109101016000010000100228313162123440035208160000104003940045400394003940039
160024400383000000452516001010160000101600005012800001154001940038400381999632001816001020160000203200004003840038111600211091010160000100001002211313162113440035208160000104003940039400394003940039
160024400383000000512516001010160000101600005012800001154001940038400381999632001816001020160000203200004003840038111600211091010160000100001002411313162223240035208160000104003940039400394003940039
160024400383000000451041600101016000010160000501280000115400194003840038199963200181600102016000020320000400384003811160021109101016000010000100248413162123440035408160000104003940039400394003940039
16002540038300000045251600101016000010160000501280000115400194003840038199963200181600102016000020320000400384003811160021109101016000010000100228313162114440035208160000104003940039400394003940039
16002440038299000045251600101016000010160000501280000115400194003840038199963200181600102016000020320000400384003811160021109101016000010000100228315162114340035208160000104003940039400394003940039
16002440038300000045251600101016000010160000501280000115400194003840038199963200181600102016000020320000400384003811160021109101016000010000100228314162114440035208160000104003940039400394003940039
16002440038300000045251600101016000010160000501280000115400194003840038199963200181600102016000020320000400384003811160021109101016000010000100228315162115540035208160000104003940039400394003940039
16002440038300000045251600101016000010160000501280000115400194003840038199963200181600102016000020320000400384003811160021109101016000010400100228314162114440035208160000104003940039400394003940039
16002440038300000045251600101016000010160000501280000115400194003840038199963200181600102016000020320000400384003811160021109101016000010000100228315162115340035208160000104003940039400394003940039