Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

INS (element, B)

Test 1: uops

Code:

  ins v0.b[2], v1.b[1]
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
1004203715019216872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000373116111787100020382038203820382038
100420371606116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371606116872510001000100026468012018203720371572318951000100020002037203711100110002073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038
1004203715010016872510001000100026468012018203720371572318951000100020002037203711100110000373116111787100020382038203820382038
100420371506116872510001000100026468012018203720371572318951000100020002037203711100110000373116111787100020382038203820382038
1004203715010316872510001000100026468012018203720371572318951000100020002037203711100110000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  ins v0.b[2], v1.b[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
10204200371505106119687251010010010000100100005002847680020018200372003718422318745101002021000020020000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500426119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100640216221978510000102003820038200382003820038
1002420037150006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100640216221978510000102003820038200382003820038
1002420037149006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100640216221978510000102003820038200382003820038
10024200371500846119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100640216221978510000102003820038200382003820038
100242003715003816119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100640216221978510000102003820038200382003820038
1002420037150006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100640216221978510000102003820038200382003820038
1002420037150006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100640216221978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476801200182003720037184442718767100102010000202000020037200371110021109101010000100640216221978510000102003820038200382003820038
10024200371500126119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100640216221978510000102003820038200382003820038
10024200371500456119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100640216221978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  ins v0.b[2], v0.b[1]
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 07 | 0a | 18 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371501102106119686251010010010000100100005002847521200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117223243319787100001002003820038200382003820038
1020420037150000009719686251010010010000100100005002847521200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117181161119804100001002003820038200382003820038
10204200371501100111819686251010010010000100100005002847521200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117223243319787100001002003820038200382003820038
10204200371501102406119686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000001117171161119804100001002003820038200382003820038
10204200371501141206119686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001003101117171161119804100001002003820038200382003820038
1020420037150110007119686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000001117171161119805100001002003820038200382003820038
1020420037150110006119686251010010010000100100005002847521200182003720037184287187411010020010008200200162003720037111020110099100100100001000001117171161119805100001002003820038200382003820038
1020420037150110006119686251010010010000100100005002847521200182003720037184286187401010020010008200200162003720037111020110099100100100001000001117171161119805100001002003820038200382003820038
1020420037150110006119686251010010010000100100005002847521200182003720037184287187411010020010008200200162003720037111020110099100100100001000001117181161119804100001002003820038200382003820038
1020420037150110006119686251010010010000100100005002847521200182003720037184286187401010020010008200200162003720037111020110099100100100001000020701117181161119805100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 0b | 18 | 1e | 3f | 4e | 50 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500015611968602510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000611968602510010101000010100005028475210200182003720037184433187671001020101672020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000611968602510010101000010100005028475210200182003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
100242003715000369611968602510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000611968602510010101000010100005028475211200182003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000611968602510010101000010100005028475210200182003720037184433187671001020100002020000200372003711100211091010100001003640216221978610000102003820038200382003820038
1002420037150000611968602510010101000010100005028475210200182003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000611968602510010101000010100005028475210200182003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000611968602510010101000012100005028475210200182003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038
1002420037150000611968602510010101000010100005028475210200182003720037184433187671001020100002020000200372003711100211091010100001000640216221978610000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ins v0.b[2], v8.b[1]
  movi v1.16b, 0
  ins v1.b[2], v8.b[1]
  movi v2.16b, 0
  ins v2.b[2], v8.b[1]
  movi v3.16b, 0
  ins v3.b[2], v8.b[1]
  movi v4.16b, 0
  ins v4.b[2], v8.b[1]
  movi v5.16b, 0
  ins v5.b[2], v8.b[1]
  movi v6.16b, 0
  ins v6.b[2], v8.b[1]
  movi v7.16b, 0
  ins v7.b[2], v8.b[1]
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2510

retire (01) | cycle (02) | 03 | 08 | 0b | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020420088150002925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100001111012021612200621600001002006620066200662006620066
160204200651500069425801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100001111012111612200621600001002006620066200662006620066
16020420065153002925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100231111012021621200621600001002006620066200662006620066
16020420065150002925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100001111012011612200621600001002006620066200662006620066
160204200651500029258011610080016100800285006401962004420065200656128012820080028200160056200652006511160201100991001001600001000271111012011612200621600001002006620066200662006620066
16020420065151002925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100401111012021621200621600001002006620066200662006620066
16020420065150002925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100401111012011621200621600001002006620066200662006620066
16020420382150002925801161008001610080028500640196200442006520065612801282008002820016005620065201461116020110099100100160000100001111012021622200621600001002006620066200662006620066
16020420065151002925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100401111012121612200621600001002006620066200662006620066
16020420065150002925801161008001610080028500640196200442006520065612801282008002820016005620065200651116020110099100100160000100401111012011622200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 0b | 18 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | c2 | branch mispredict (cb) | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600242006515000004425800101080000108000050640000112003020045200493218001020800002016000020049200491116002110910101600001007000100313111220211962004215160000102004620322200542004620046
160024200451500000442580010108000010800005064000011200302004520045321800102080000201600002004520045111600211091010160000100130010027311820211482004630160000102005020290200542004620050
160024200451500000442580010108000010800005064000001200262004520045321800102080000201600002004920049111600211091010160000100100010032311420211852004215160000102004620266200662004620378
160024200451500000442580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100000010031311520211882004215160000102004620266200662004620046
160024200451500000442580010108000010800005064000001200262004920049321800102080000201600002004520045111600211091010160000100000010031311420211882004215160000102004620264200662004620046
160024200451500000442580010108000010800005064000011200302004520049321800102080000201600002004520045111600211091010160000100000010033321820211952004215160000102004620280200542004620046
160024200451500000442580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100030010027611624212482004615160000102004620247200542004620046
1600242004915000004425800101080000108000050640000112002620045200453218001020800002016000020045200451116002110910101600001000180010032311420211952004230160000102005020297200542005420046
160024200491500000442580010108000010800005064000011200262004520049321800102080000201600002004520045111600211091010160000100000010032311420211962004215160000102004620282200542004620046
160024200451500000442580010108000010800005064000001200262004520045321800102080000201600002004520045111600211091010160000100000010028311420211852004215160000102004620273200542004620046

Test 5: throughput

Count: 16

Code:

  ins v0.b[2], v16.b[1]
  ins v1.b[2], v16.b[1]
  ins v2.b[2], v16.b[1]
  ins v3.b[2], v16.b[1]
  ins v4.b[2], v16.b[1]
  ins v5.b[2], v16.b[1]
  ins v6.b[2], v16.b[1]
  ins v7.b[2], v16.b[1]
  ins v8.b[2], v16.b[1]
  ins v9.b[2], v16.b[1]
  ins v10.b[2], v16.b[1]
  ins v11.b[2], v16.b[1]
  ins v12.b[2], v16.b[1]
  ins v13.b[2], v16.b[1]
  ins v14.b[2], v16.b[1]
  ins v15.b[2], v16.b[1]
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d0 | d2 | d6 | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204401003000001200710251601081001600081001600205001280132104047740099400381997761998916012020016003220032006440038400381116020110099100100160000100010001111011851160400371600001004003940058400394003940040
16020440492300000511764011702516010810016008210016002050012801320540019400994009919977619989160120200160032200320064400384003811160201100991001001600001000541221111011851160400351600001004003940091401624005840058
1602044015630100000029025160108100160008100160020500128013215400194003840038199772319989160121200160032200320064400384003911160201100991001001600001000240001111011851160400961600001004003940058402774009140102
16020440057299000000290251602471001601471001600225001280132154013740038400572006861998916012120016003220032006440038402651116020110099100100160000100010001111011851160400351600001004003940058401424003940266
160204400383000002700290251601091001600081001600205001280132154001940038400381997761998616012020016003220032006440057400381116020110099100100160000100010109821111011851160400351600001004003940040400394003940039
16020440156300000000290251601821001600091001600205001280132054008040099400991997762009216012020016003220032006440099400991116020110099100100160000100000001111011851160400351600001004010040074401314004040040
160204400993000005101290251601081001600521001600215001280132054003840038400382001262005016012020016003220032006440038400381116020110099100100160000100010001111011851160400351600001004003940040400394003940102
16020440038301000004029307251602471001601471001600225001320130054001940038400382001261998916012020016003220032006440038400381116020110099100100160000100010001111011851160400351600001004003940074401314004040058
16020440039300000001712110251601081001600231001600205001280132104008040038400991997762005016012120016003220032006440038400381116020110099100100160000100000001111011800160400351600001004003940040400394003940139
1602044008730000000029125251601231001600081001600205001280132104001940038400382001261998916012020016003220032006440038400991116020110099100100160000100010001111011800160400961600001004003940058401424010240039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | c5 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600244007430000002471102516001010160000101600005012800001140019040038401122003713320059160010201600002032000040038400381116002110910101600001003000100223112616211252540035208160000104003940058400404003940058
160024400383010000275460251600101016000010160000501280000114001904003840057199970320018160010201607262032000040038400391116002110910101600001001001002231111162122513400354016160000104003940048400934003940040
160024400383000000045025160011101600001016000050128000011400190400384003820138732001816001020160000203200004003840038111600211091010160000104500100223211116211251340035208160000104003940198400884011340113
1600244011229900001969025160010101600191016000050128000011400190400384003819996032001816001020160000203200004003840087111600211091010160000100100100223111216211251140035208160000104003940057400404003940058
1600244003830000000451025160010101600001016000050128000011400380400384021020138732001816001020160000203200004021040038111600211091010160000100000100223111316211112540207208160000104003940047400404004040039
160024400383010000045044160082101600461016000050132000001401910400394003819996032003716001020160000203200004003840038111600211091010160000100000100226111116212251340035207160000104004040091401034003940039
160024400573000000045025160010101600001016000050307893511400190400574003819996732001816001020160000203200004021040038111600211091010160000100000100223111116211251340035208160000104003940058400404006540039
16002440038300000027567025160285101600001016000050403741511400190400574021020138732001816001020160000203200004003840038111600211091010160000100000100223112516211132540035208160000104003940053400794021140039
160024400382990000046025160010101600001016000050128000011402170400384003819996032001816001020160000203200004003840072111600211091010160000100100100223111316211251340035208160000104003940114400394011340039
160024400393000100045025160010101600001016000050128000011400930401124003819996032009216001020160000203200004011240038111600211091010160000100100100223112516211252540054208160000104021140047400654006440192