Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

INS (element, S)

Test 1: uops

Code:

  ins v0.s[2], v1.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)033f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)a8a9accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
100420371661168725100010001000264680201820372037157231895100010002000203720371110011000410073116111787100020382038203820382038
10042037156116872510001000100026468020182037203715723189510001000200020372037111001100000973116111787100020382038203820382038
10042037156116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037156116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037156116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037156116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037166116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037156116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037156116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037156116872510001000100026468020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  ins v0.s[2], v1.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)03181e3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9faccfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1020420037150000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
1020420037150000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
1020420037150000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
10204200371500004411968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
1020420037150100611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
1020420037150000611968725101161001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
1020420037150000611968725101001001000010010000500284768002001820037200371842231876410100200100002002000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
10204200371500004831968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
10204200371500006701968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
10204200371500002511968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010007101161119791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)03191e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8a9acc2cdcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024200371500010319687251001010100001010000502847680200182003720037184443187671001020100002020000200372003711100211091010100001000000640216221978510000102003820038200382003820038
1002420037150006119687251001010100001010000502847680200182003720037184443187671001020100002020000200372003711100211091010100001000000640216221978510000102003820038200382003820038
10024200371500069019687251001010100001010000502847680200182003720037184443187671001020100002020000200372003711100211091010100001000000640216221978510000102003820038200382003820038
10024200371500070919687251001010100001010000502847680200182003720037184443187671001020100002020000200372003711100211091010100001000000640216221978510000102003820038200382003820038
10024200371500012419687251001010100001010000502847680200182003720037184443187671001020100002020000200372003711100211091010100001000000640216221978510000102003820038200382003820038
10024200371501065019687251001010100001010000502847680200182003720037184443187671001020100002020000200372003711100211091010100001000000640216221978510000102003820038200382003820038
10024200371500061196872510010101000010100005028476802001820037200371844431876710010201000020200002003720037111002110910101000010440000640216221978510000102003820038200382003820038
1002420037150008219687251001010100001010000502847680200182003720037184443187671001020100002020000200372003711100211091010100001000000640216221978510000102003820038200382003820038
1002420037150006119687251001010100001010000502847680200182003720037184443187671001020100002020000200372003711100211091010100001000000640216221978510000102003820038200382003820038
1002420037149006119687251001010100001010000502847680200182003720037184443187671001020100002020000200372003711100211091010100001000000640216221978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  ins v0.s[2], v0.s[1]
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)03081e3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8a9acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10204200371500279061196862510100100100001001000050028475211200182003720037184287187411010020010008200200162003720037111020110099100100100001000021151117170160019800100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475211200182003720037184287187411010020010008200200162003720037111020110099100100100001000001117180160019800100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475211200182003720037184286187411010020010008200200162003720037111020110099100100100001000001117170160019800100001002003820038200382003820038
10204200371500519061196862510100100100001001000050028475210200182003720037184286187411010020010008200200162003720037111020110099100100100001000001117180160019801100001002003820038200382003820038
10204200371500462197196862510100100100001001000050028475211200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
102042003715000197196862510100100100001001000050028475211200182003720037184096187331010020010000200200002003720037111020110099100100100001001001117222242219787100001002003820038200382003820038
102042003715000197196862510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
10204200371500378197196862510100100100001001000050028475211200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
102042003715000197196862510116100100001001000050028475211200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
102042003715000197196862510100100100001001000050028475210200182003720037184096187331010020010000200200002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)030b18191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a6a8accfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024203201501771170704437719596181101081410096141121694286039102030620371204171847503718942112312211338242267420417205111011002110910101000010220177138275813220074110000102041520417204662045520371
100242051315201110240264611968625100101010000101000050284752102001820037200371844373187671001020100002020000200372003711100211091010100001000006402162219786010000102003820038200382003820038
100242003715000000611968625100101010000101000050284752112001820037200371844303187671001020100002020000200372003711100211091010100001000006402162219786010000102003820038200382003820038
100242003715000000611968625100101010000101000050284752112001820037200371844303187671001020100002020000200372003711100211091010100001000006402162219786010000102003820038200382003820038
100242003715000000611968625100101010000101000050284752102001820037200371844303187671001020100002020000200372003711100211091010100001000006402162219786010000102003820038200382003820038
100242003715000000611968625100101010000101000050284752102001820037200371844303187671001020100002020000200372003711100211091010100001000006402162219786010000102003820038200382003820038
100242003715000000611968625100101010000101000050284752112001820037200371844303187671001020100002020000200372003711100211091010100001000006402162219786010000102003820038200382003820038
1002420037150000270611968625100101010000101015250284752102001820037200371844303187671001020100002020000200372003711100211091010100001000006402162219786010000102003820038200382003820038
1002420037150000300611968625100101010000101000050284752112001820037200371844303187671001020100002020000200372003711100211091010100001000006402162219786010000102003820038200382003820083
100242003715000000611968625100101010000101000050284752112001820037200371844303187671001020100002020000200372003711100211091010100001000006402162219786010000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ins v0.s[2], v8.s[1]
  movi v1.16b, 0
  ins v1.s[2], v8.s[1]
  movi v2.16b, 0
  ins v2.s[2], v8.s[1]
  movi v3.16b, 0
  ins v3.s[2], v8.s[1]
  movi v4.16b, 0
  ins v4.s[2], v8.s[1]
  movi v5.16b, 0
  ins v5.s[2], v8.s[1]
  movi v6.16b, 0
  ins v6.s[2], v8.s[1]
  movi v7.16b, 0
  ins v7.s[2], v8.s[1]
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01)cycle (02)031e3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa6a8acc5branch mispredict (cb)cdcfd6e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160204200891501322925801161008001610080028500640196012004420065200656128012820080028200160056200652006511160201100991001001600001000001111011916200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196012004420065200656128012820080028200160056200652006511160201100991001001600001000001111011916200621600001002006620066200662006620066
16020520065150782925801161008001610080028500640196012004420065200656128012820080028200160056200652006511160201100991001001600001000001111011916200621600001002006620066200662006620066
16020420065150537292580116100800161008002850064019601200442006520065612801282008002820016005620065200651116020110099100100160000100006081111011916200621600001002006620066200662006620066
1602042006515102925801161008001610080028500640196012004420065200656128012820080028200160056200652006511160201100991001001600001000001111011916200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196012004420065200656128012820080028200160056200652006511160201100991001001600001000001111011916200621600001002006620066200662006620066
160204200651506722925801161008001610080028500640196012004420065200656128012820080028200160056200652006511160201100991001001600001000001111011916200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196012004420065200656128012820080028200160056200652006511160201100991001001600001000001111011916200621600001002006620066200662006620066
1602042006515106925801161008001610080028500640196012004420065200656128012820080028200160056200652006511160201100991001001600001000001111011916200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196012004420065200656128012820080028200160056200652006511160201100991001001600001000001111011916200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01)cycle (02)031e1f3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f6061696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2cfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0eaec? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160024200911503001092780010108000010800005064000011520031200502005032180010208000020160000200502005011160021109101016000010000103010034841182521112720047201160000102005120051200512005120051
16002420050150210442780010108000010800005064000011520271200502005032180010208000020160000200502005011160021109101016000010000000010034831725211101120047201160000102005120051200512005120051
160024200501502520442780010108000010800005064000011520031200502005032180010208000020160000200502005011160021109101016000010000000010029831112521161120047201160000102005120051200512005120051
1600242005015042304427800101080000108000050640000115200312005020050321800102080000201600002005020050111600211091010160000100000000100308311125211111120047201160000102005120051200512005120051
1600242005015000442780010108000010800005064000011520031200502005032180010208000020160000201302005011160021109101016000010000140010034831132521111620047201160000102005120051200512005120051
160024200501503600442780010108000010800005064000011520031200502005032180010208000020160000200502005011160021109101016000010000000010034831825211121220047201160000102005120051200512005120051
160024200571503300442780010108000010800005064000011520031200502005032180010208000020160000200502005011160021109101016000010000000010030831112521161120047201160000102005120051200512006020051
16002420050150180442780010108000010800005064000011520031200502005032180010208000020160000200502005011160021109101016000010000003010035831122521112720047201160000102005120051200512005120051
160024200501509044278001010800001080000506400001152003120050200503218001020800002016000020050200501116002110910101600001000000001003483162521111620047201160000102005120051200512005120051
160024200501500176442780010108000010800005064000011520031200502005032180010208000020160000200502005011160021109101016000010020000010030831112521171220047201160000102005120051200512005120051

Test 5: throughput

Count: 16

Code:

  ins v0.s[2], v16.s[1]
  ins v1.s[2], v16.s[1]
  ins v2.s[2], v16.s[1]
  ins v3.s[2], v16.s[1]
  ins v4.s[2], v16.s[1]
  ins v5.s[2], v16.s[1]
  ins v6.s[2], v16.s[1]
  ins v7.s[2], v16.s[1]
  ins v8.s[2], v16.s[1]
  ins v9.s[2], v16.s[1]
  ins v10.s[2], v16.s[1]
  ins v11.s[2], v16.s[1]
  ins v12.s[2], v16.s[1]
  ins v13.s[2], v16.s[1]
  ins v14.s[2], v16.s[1]
  ins v15.s[2], v16.s[1]
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2504

retire (01)cycle (02)0318191e373f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1602044009530000003002516020610016000810016002250013201324005440073400391997762004916012220016003220032006440075400381116020110099100100160000100431111011801600400811600001004004040088400654004040040
1602044008430000039291492516012310016000810016002050054384904002040039400391997762001516012220016003220032006440039400381116020110099100100160000100031111011801600400701600001004007440040400404004040039
160204400653000000531282516012310016001710016002250055162504005440073400391997762001316012220016003220032006440038400841116020110099100100160000100031111011801600400701600001004007440040400744004040083
1602044003930000045292825160198100160008100160021500132013240020400394003919997620035160122200160032200320064400394007311160201100991001001600001001121111011801600400361600001004004040039400854004040085
16020440084300000030282516013910016000910016002050013201294006540039400571997762003516012220016003220032006440084400581116020110099100100160000100031111011801600400351600001004007440040400744004040058
16020440038300000055702516010810016000810016002250012801324001940065400751998862000616012020016003220032006440073400381116020110099100100160000100501111011801600400361600001004006640039400764003940039
16020440038300000055025160109100160009100160020500551625040020400844005819977620029160120200160032200320064400384008411160201100991001001600001005301111011801600400811600001004007440040400744004040040
160204400393000003956172516020610016000910016002050047981974001940078400841999762003516012220016003220032006440039400731116020110099100100160000100261111011801600400361600001004007940085400404008540088
160204400393000006030025160108100160023100160020500132012940020400394008619982620015160122200160032200320064400394007311160201100991001001600001005201111011801600400811600001004007440056400404008540039
160204400393000004530025160108100160039100160020500491817840020400574003919977620011160122200160032200320064400844003911160201100991001001600001002801111011801600400811600001004004040040400404007440040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01)cycle (02)031e373f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f6061696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8acbranch mispredict (cb)cfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0eaeb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1600244006529900670251600551016000010160000501280000110400194003840098199960320044160010201600002032000040101400381116002110910101600001055001002283111162116640035155160000104008540039400854003940039
160024400383020046025160033101600951016000050128000001540038400394003819997032001816001020160000203200004003940038111600211091010160000101001002285161621144401401510160000104003940040400394003940074
1600244003830002745025160010101600001016000050128000011540020400394003920037032001916001020160000203200004008740064111600211091010160000101180100248524162114440052155160000104004040039400994004040039
160024400383000150453251600381016000110160000501280000115400194003840112199960320044160010201600002032000040038400381116002110910101600001026010022116141621156400363010160000104003940085400394004040039
160024401123000145325160010101600001016000050551610011540065400844003819996032001816001020160000203200004005840084111600211091010160000103500100228514162115440060305160000104003940040400404006440040
1600244003930006845025160066101600011016000050399742411540082400874003919996032001816001020160000203200004006440038111600211091010160000103600100228515162116540035155160000104004040039400994004040039
1600244003930000451052516001110160000101600005012800001154001940038400392002903200181600102016000020320000400844003811160021109101016000010100100228515164124540035155160000104014340039400394011340040
160024400393000061614925160011101600601016000050403741511540019400384003820010032001816001020160000203200004003840057111600211091010160000101300100228515162114340035157160000104003940040400404003940039
1600244007329902345025160010101601041016000050128000011540020400384014319996032001816001020160000203200004005740084111600211091010160000103601002411524164213440081155160000104003940144400404003940091
1600244003830006067025160010101600001016000050131999811540065400844003819996032001816001020160000203200004003840084111600211091010160000106100100228515162115440054155160000104003940085400394008540040