Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

INS (element, H)

Test 1: uops

Code:

  ins v0.h[2], v1.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)03081e3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)a1a8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
10042037150061168725100010001000264680020182037203715723189510001000200020372037111001100000073216111787100020382038203820382038
100420371500251168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037160061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037150061168725100010001000264680020182037203715723189510001000200020372037111001100000673116111787100020382038203820382038
10042037150061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037160084168725100010001000264680120182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
100420371600105168725100010001000264680120182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
100420371500103168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037150061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037160061168725100010001000264680120182037203715723189510001000200020372037111001100000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  ins v0.h[2], v1.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)031e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1020420037150072619687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715008219687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100107101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119824100001002003820038200382003820038
102042003715006619687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100037101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100107101161119791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)030b1e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
100242003715000012619687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
10024200371500006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000014519687251001010100001010000502847680020018200372003718444318767100102010000202000020037200842110021109101010000100000640216221978510000102003820038200382003820038
10024200371500006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000016619687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038201332008620038
1002420037150108819319687251001010100001010000502847680120018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000010319687251001010100001010000502847680120018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000014519687251001010100001010000502847680120018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000014519687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000010319687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  ins v0.h[2], v0.h[1]
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)030b1e3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a7a8a9acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1020420037150000103196862510100100100001001000050028475210200182003720037184280718741101002001000820020016200372003711102011009910010010000100000010011171701600198000100001002003820038200382003820038
10204200371501120557196862510100100100001001000050028475210200182003720037184090618733101002001000020020000200372003711102011009910010010000100000050011172222422197870100001002003820038200382003820038
102042003715000097196862510100100100001001000050028475210200182003720037184090618733101002001000020020000200372003711102011009910010010000100000000011172222422197870100001002003820038200382003820038
1020420037150000202196862510100100100001001000050028475210200182003720037184090618733102562001000820020016200372003711102011009910010010000100000000011171701600198010100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475210200182003720037184280618740101002001000820020016200372003711102011009910010010000100000050011171801600198000100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475210200182003720037184280618741101002001000820020016200372003711102011009910010010000100000000011171801600198000100001002003820038200872003820038
102042003715000082196862510100100100001001000050028475210200182003720037184280718740101002001000820020016200372003711102011009910010010000100000010011171801600198640100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475210200182003720037184280618740101002001000820020016200372003711102011009910010010000100000000011171701600198000100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475210200182003720037184280618759101002001000820020016200372003711102011009910010010000100000000011171801600198010100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475210200182003720037184280718740101002001000820020016200372003711102011009910010010000100000050011171701600198010100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)030b3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
100242003715008919686251001010100001010000502847521020018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820086
1002420083150023519686251001010100001010000502847521020018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443031876710010201000020200002003720037111002110910101000010001000640316331978610000102003820038200382003820038
1002420037150032419686251001010100001010000502847521020018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443031876710010201000020200002003720037111002110910101000010100000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521020018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521020018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102018020038200382003820038
1002420037149036219686251001010100001010000502848235120018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ins v0.h[2], v8.h[1]
  movi v1.16b, 0
  ins v1.h[2], v8.h[1]
  movi v2.16b, 0
  ins v2.h[2], v8.h[1]
  movi v3.16b, 0
  ins v3.h[2], v8.h[1]
  movi v4.16b, 0
  ins v4.h[2], v8.h[1]
  movi v5.16b, 0
  ins v5.h[2], v8.h[1]
  movi v6.16b, 0
  ins v6.h[2], v8.h[1]
  movi v7.16b, 0
  ins v7.h[2], v8.h[1]
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01)cycle (02)03191e3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a8acc2c5branch mispredict (cb)cdcfd5d6e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
16020420077151001132580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119416200621600001002006620066200662006620066
16020420065150001382580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515100292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000922580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01)cycle (02)03080b18191e1f3a3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f60696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2cfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0ea? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160024200691510000000442580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100000000100263114242112332004215160000102004620046200462004620046
160024200451510000000442580010108000010800005064000011200892013220045321800102080000201600002004520045111600211091010160000100000000100273114202112342004215160000102004620046200502004620046
160024200451500000000442580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100000000100273115204111442004215160000102004620046200462004620046
1600242004515000000003702580010108000010800005064000011200262004520045321800102080129201600002004520049111600211091010160000100000000100273114202110542004215160000102004620046200462004620046
160024200451500000000442580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100000000100283115202110452004215160000102004620046200462004620046
160024200451500000000442580010108000010800005064000011200302004520045321800102080000201600002004520045111600211091010160000100000000100253115242113322004215160000102005020050200462004620046
1600242004515000100001512580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100000030100266124244216442005030160000102004620046200462004620046
1600242004515000000006142580010108000010800005064000011200262004520045321800102080000201600002004520049111600211091010160000100000000100283112202110442004215160000102004620046200462004620113
1600242004515000000001572580010108000010800005064000001200262004520049321800102080000201600002004920049111600211091010160000100000000100273214202213432004215160000102004620046200462004620046
160024200451500000000442580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100000000100273114202111432004215160000102004620046200462004620050

Test 5: throughput

Count: 16

Code:

  ins v0.h[2], v16.h[1]
  ins v1.h[2], v16.h[1]
  ins v2.h[2], v16.h[1]
  ins v3.h[2], v16.h[1]
  ins v4.h[2], v16.h[1]
  ins v5.h[2], v16.h[1]
  ins v6.h[2], v16.h[1]
  ins v7.h[2], v16.h[1]
  ins v8.h[2], v16.h[1]
  ins v9.h[2], v16.h[1]
  ins v10.h[2], v16.h[1]
  ins v11.h[2], v16.h[1]
  ins v12.h[2], v16.h[1]
  ins v13.h[2], v16.h[1]
  ins v14.h[2], v16.h[1]
  ins v15.h[2], v16.h[1]
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire (01)cycle (02)031e1f373f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f6061696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a8a9acc5branch mispredict (cb)cdcfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0eaeb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1602044006030000053002516023310016000810016002050012801320054001940038400382003014620016160120200160032200320064400394003911160201100991001001600001000000111101185000160210040035001600001004003940113400394011340039
160204401123000005202516010810016000810016002050030790670154001940038400381997714619989160120200160032200320064400394003811160201100991001001600001000000111101185000160010040035001600001004005840133400394003940039
160204400383010019290251601081001600081001600205001280132005400194013240038199770619989160121200160032200320064400384003811160201100991001001600001000000111101180000160000040035001600001004013340039401334003940039
16020440132300003151168251601831001600081001600205001280132005401134003840057199770619989160120200160032200320064400384003811160201100991001001600001000000111101185000160000040035001600001004011340039400394003940039
160204400382990019473251601081001601331001600205001280132010400384005740038199770620009160120200160032200320064400384003811160201100991001001600001000000111101180000160000040035001600001004003940039400644003940039
16020440038299000390251601991001600081001600205001280132005400194003840132199770620057160120200160032200320064400384006411160201100991001001600001000000111101185000160000040035001600001004005940039400394003940058
160204400383000073290251601081001600171001600205001280132015400194003840039199770620008160120200160032200320064402204006411160201100991001001600001000000111101185000160010040036001600001004003940039400394003940039
16020440038300000290251601081001600081001600205003079067005400194003840132200390619989160120200160032200320064400394003811160201100991001001600001000000111101185000160000040035001600001004005840039400584003940113
1602044003829900133290251601081001602391001600205001280132010400384003840038199770620008160120200160032200320064400384003811160201100991001001600001000000111101180000160000040054001600001004003940056400394003940113
16020440112300003222002516010810016000810016002050012801320104001940038400381997706199891601202001600322003200644003840087111602011009910010016000010000015111101185000480000040035001600001004003940039400394033240039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01)cycle (02)031e373a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f6061696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa7a8branch mispredict (cb)cfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0eaeb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
16002440078300086045702516007010160060101600005012800001154001940038400842001503201041600102016000020320000401264003811160021109101016000010000100228212116211182140081218160000104003940085400394008540039
160024400383000006702516001010160000101600005055161001154002040084400381999603200181600102016000020320000400904003811160021109101016000010000100228212417422221940035208160000104003940040400884006540059
1600244003830006006702516001010160000101600005013199981154001940038400391999603200181600102016000020320000400554003811160021109101016000010000100248212416211181840109208160000104003940085400394008540039
1600244003830030045702516007010160060101600005030789351154004540084400381999703200671600102016000020320000400954003811160021109101016000010000100228211816121201840081208160000104004040039401134003940039
16002440038300010606714925160010101600681016000050128000011540019400754003819996032004516011820160000203200004004340038111600211091010160000100011002282117162111820400352016160000104003940076400584008840085
1600244008730000067392516001110160023101600005012800001154001940038401431999603200181600102016000020320000400904003811160021109101016000010000100228211416211171540035218160000104011340039400394003940039
16002440038300045067392516005510160045101600005012800001154001940058400842001503200641600102016000020320000400394003911160021109101016000010000100228311616222181540054418160000104003940113400394014440040
160024400393000014677025160070101600601016000050128000001540019400384008420015032006416001020160000203200004004140038111600211091010160000100001002283118161111821401092016160000104003940076400394011340113
16002440038300000235705016007810160001101600005012800001154001940038400841999603200641600102016000020320000400594003811160021109101016000010000100228211616212202240081207160000104004040039401134003940113
1600244011230006006702516001010160000101600005055161000154006540038401121999603200181600102016000020320000400724003811160021109101016000010000100228322016122191940035218160000104003940113400394011340039