Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SLI (vector, 4S)

Test 1: uops

Code:

  sli v0.4s, v1.4s, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
1004203715313216872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203716014716872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468002018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371606116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  sli v0.4s, v1.4s, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 0b | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372008311102011009910010010000100071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100071011611197910100001002003820038200382003820038
10204200371500010919687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100071011611197910100001002003820038200382003820038
10204200371490156119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100071011611197910100001002003820038200382003820038
102042003715000156196872510100100100001001000050028476800200182003720037184223187451010020010000200200002003720037111020110099100100100001000710116111979121100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100071011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 0b | 18 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100242003715000061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
100242003715000061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
100242003714900661196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
100242003715000061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
100242003715000061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
100242003715001061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
100242003715000061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
100242003715000061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
100242003715000061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
100242003715000061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  sli v0.4s, v0.4s, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150000061196862510100100100001001000050028475210200182003720037184286187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
10204200371500000747196862510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
1020420037150000097196862510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
1020420037150000097196862510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
1020420037150000197196862510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
1020420037150000197196662510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001001301117222242219787100001002003820038200382003820038
1020420037150000197196862510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
1020420037150000197196862510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
1020420037150000197196862510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
1020420037150000197196862510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200872003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a7 | a8 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500611968625100101010000101000050284752120018200372003718443318767100102010000202000020037200371110021109101010000100045640216221978610000102003820038200382003820038
1002420037150061196862510010101000010100005028475212001820037200371844331876710010201000020200002003720180111002110910101000010000640216221978610000102003820038200382003820038
1002420037150061196862510010101000010100005028475212001820037200371844331876710010201000020200002003720037111002110910101000010000640216221978610000102003820038200382003820038
1002420037150061196862510010101000010100005028475212001820037200371844331876710010201000020200002003720037111002110910101000010000640216221978610000102003820038200382003820038
10024200371500124196862510010101000010100005028475212001820037200371844331876710010201000020200002003720037111002110910101000010000640216221978610000102003820038200382003820038
1002420037150061196862510010101000010100005028475212001820037200371844331876710010201000020200002003720037111002110910101000010000640216221978610000102003820038200382003820038
1002420037150661196862510010101000010100005028475212001820037200371844331876710010201000020206722003720037111002110910101000010000640216221978610000102003820038200382003820038
10024200371500726196862510010101000010100005028475212001820037200371844331876710010201000020200002003720037111002110910101000010000640216221978610000102003820038200382003820038
1002420037150061196862510010101000010100005028475212001820037200371844331876710010201000020200002003720037111002110910101000010000640216221978610000102003820038200382003820038
1002420037150061196862510010101000010100005028475212001820037200371844331876710010201000020200002003720037111002110910101000010000640216221978610000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sli v0.4s, v8.4s, #3
  movi v1.16b, 0
  sli v1.4s, v8.4s, #3
  movi v2.16b, 0
  sli v2.4s, v8.4s, #3
  movi v3.16b, 0
  sli v3.4s, v8.4s, #3
  movi v4.16b, 0
  sli v4.4s, v8.4s, #3
  movi v5.16b, 0
  sli v5.4s, v8.4s, #3
  movi v6.16b, 0
  sli v6.4s, v8.4s, #3
  movi v7.16b, 0
  sli v7.4s, v8.4s, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204200901500292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001001111012001600200621600001002006620066200662006620066
160204200651510292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001001111012001600200621600001002006620066200662006620066
160204200651500292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001001111011901600200621600001002006620066200662006620157
160204200651500292580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001001111014111600200621600001002006620066200662006620066
160204200651520292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001001111012001600200621600001002006620066200662006620066
1602042006515001342580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001001111011901600200621600001002006620066200662006620066
160204200651500922580116100800161008002850064019612004420065200656128012820080028200160056200652006511160201100991001001600001001111012011600200621600001002006620066200662006620066
160204200651500292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001001111012001611200621600001002006620066200662006620066
160204200651500292580116100800161008002850064019602004420065200656128012820080028200160056200652006511160201100991001001600001001111011901600200621600001002006620066200662006620066
160204200651500292580116100800161008002850064019602004420065200656128022420080028200160056200652006511160201100991001001600001001111011901611200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 04 | 07 | 08 | 0a | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | a9 | ac | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024200621510200000000442580010108000010800005064000011200262004520045321800102080000201600002005120051111600211091010160000100000001015562213426322292720048310160000102005220052200522005220052
160024200511501001000001852580010108000010800005064000001200322005120051321800102080000201600002005120051111600211091010160000100000001013662213026322242920048310160000102005220052200522005220052
160024200511501203000000852580010108000010800005064000001200322005120051321800102080000201600002005120051111600211091010160000100000001012162211826322301920048310160000102005220052200522005220052
1600242005115013030000012372580010108000010800005064000001200322005120051321800102080000201600002026220051111600211091010160000100000001012162212926322233120048310160000102005220052200522005220052
1600242005115012020000001002580010108000010800005064000001200322005120051321800102080000201600002005120051111600211091010160000100000001011862212926322303020048310160000102005220052200522005220052
160024200511501303000000852580010108000010800005064000001200322005120051321800102080000201600002005120051111600211091010160000100000001011462212626322183020048310160000102005220052200522005220052
1600242005115113030000019725800101080000108000050640000012003220051200513218001020800002016000020051200511116002110910101600001000000010110622130263222929200483118160000102005220052200522005220052
160024200511501202000000912580010108000010800005064000001200322005120051321800102080000201600002005120051111600211091010160000100000001012162213026322292920048310160000102005220052200522005220052
1600242005115012030000004652580010108000010800005064000001200322005120051321800102080000201600002005120051111600211091010160000100000001011762211826322302120048310160000102005220052200522005220052
160024200511501303000000852580010108000010800005064000001200322005120051321800102080000201600002005120051111600211091010160000100000001012062212826322293120048310160000102005220052200522005220052

Test 5: throughput

Count: 16

Code:

  sli v0.4s, v16.4s, #3
  sli v1.4s, v16.4s, #3
  sli v2.4s, v16.4s, #3
  sli v3.4s, v16.4s, #3
  sli v4.4s, v16.4s, #3
  sli v5.4s, v16.4s, #3
  sli v6.4s, v16.4s, #3
  sli v7.4s, v16.4s, #3
  sli v8.4s, v16.4s, #3
  sli v9.4s, v16.4s, #3
  sli v10.4s, v16.4s, #3
  sli v11.4s, v16.4s, #3
  sli v12.4s, v16.4s, #3
  sli v13.4s, v16.4s, #3
  sli v14.4s, v16.4s, #3
  sli v15.4s, v16.4s, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 09 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d6 | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020440059299000000712516010810016000810016002050012801321400194003840038199770619989160120200160032200320064400384003811160201100991001001600001000000001111011816400351600001004003940039400394003940039
16020440038300000000292516010810016000810016002050012801320400194003840038199770619989160120200160032200320064400384003811160201100991001001600001000000001111011816400351600001004003940039400394003940039
16020440038300000000292516010810016000810016002050012801320400194003840038199770619989160120200160032200320064400384003811160201100991001001600001000000001111011816400351600001004003940039400394003940039
16020440038300000000712516010810016000810016002050012801321400194003840088199770619989160120200160032200320064400384003811160201100991001001600001000000001111011816400351600001004003940039400394003940039
16020440038299000000292516010810016000810016002050012801321400194003840038199770619989160120200160032200320064402404003811160201100991001001600001000000001111011816400351600001004003940039400394003940039
16020440038300000000292516010810016000810016002050012801321400194003840038199770619989160120200160032200320064400384003811160201100991001001600001000000001111011816400351600001004003940039400394003940039
16020440038300000000292516010810016000810016002050012801321400194003840038199770619989160120200160032200320064400384003811160201100991001001600001000000001111011816400351600001004003940039400394003940039
16020440038300000000522516010810016000810016002050012801321400194003840038199770619989160120200160032200320064400384003811160201100991001001600001000000001111011816400351600001004003940039400394003940039
16020440038300000000292516010810016000810016002050012801320400194003840038199770619989160120200160032200320064400384003811160201100991001001600001000000001111011816400351600001004003940039400394003940039
160204400383001000002192516010810016000810016002050012801320400194003840038199770619989160120200160427200320064400384003811160201100991001001600001000000001111011816400351600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | ac | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024400393000000004525160010101603741016000050128000011540019400384003819996320018160010201600002032000040038400381116002110910101600001000000100223113162114440035208160000104003940039400394003940039
160024400383000000270452516001010160000101600005012800001010400194003840038199963200181600102016000020320000400384003811160021109101016000010000141301002213614162214440035208160000104003940039400394003940039
16002440038300000027045251600101016000010160000501280000100400194003840038199963200181600102016000020320000400384003811160021109101016000010000001002213514162114540035208160000104003940039400394003940039
16002440038300000036045251600101016000010160000501280000100400194003840038199963200181600102016000020320000400384003811160021109101016000010000001002213514162114440035208160000104003940039400394003940039
16002440038300000036045251600101016000010160000501280000111040019400384003819996320018160010201600002032000040038400381116002110910101600001000000100223724162114440035208160000104003940039400394003940039
160024400383000000195045251600101016000010160000501280000111040019400384003819996320018160010201600002032000040038400381116002110910101600001000000100223115164124440035208160000104003940039400394003940039
160024400383000000240452516001010160000101600005012800001010400194003840038199963200181600102016000020320000400384003811160021109101016000010000001002213714162125540035208160000104003940039400394003940039
16002440038300000042015825160010101600001016000050128000011104001940038400381999632001816001020160000203200004003840038111600211091010160000100000010024131131621233400352016160000104003940039400394003940039
160024400383000000150722516001010160000101600005012800001110400194003840038199963200181600102016000020320000400384003811160021109101016000010000001002213715162113340035208160000104003940039400394003940039
16002440038300010000452516001010160000101600005012800001010400194003840038199963200181601222016000020320000400384003811160021109101016000010000001002413713354114340035208160000104003940039400394003940039