Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

INS (element, H)

Test 1: uops

Code:

  ins v0.h[2], v1.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10042037150061168725100010001000264680020182037203715723189510001000200020372037111001100000073216111787100020382038203820382038
100420371500251168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037160061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037150061168725100010001000264680020182037203715723189510001000200020372037111001100000673116111787100020382038203820382038
10042037150061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037160084168725100010001000264680120182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
100420371600105168725100010001000264680120182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
100420371500103168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037150061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037160061168725100010001000264680120182037203715723189510001000200020372037111001100000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  ins v0.h[2], v1.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150072619687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715008219687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100107101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119824100001002003820038200382003820038
102042003715006619687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100007101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100037101161119791100001002003820038200382003820038
102042003715006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100107101161119791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100242003715000012619687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
10024200371500006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000014519687251001010100001010000502847680020018200372003718444318767100102010000202000020037200842110021109101010000100000640216221978510000102003820038200382003820038
10024200371500006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000016619687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038201332008620038
1002420037150108819319687251001010100001010000502847680120018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000010319687251001010100001010000502847680120018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000014519687251001010100001010000502847680120018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000014519687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038
100242003715000010319687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000640216221978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  ins v0.h[2], v0.h[1]
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150000103196862510100100100001001000050028475210200182003720037184280718741101002001000820020016200372003711102011009910010010000100000010011171701600198000100001002003820038200382003820038
10204200371501120557196862510100100100001001000050028475210200182003720037184090618733101002001000020020000200372003711102011009910010010000100000050011172222422197870100001002003820038200382003820038
102042003715000097196862510100100100001001000050028475210200182003720037184090618733101002001000020020000200372003711102011009910010010000100000000011172222422197870100001002003820038200382003820038
1020420037150000202196862510100100100001001000050028475210200182003720037184090618733102562001000820020016200372003711102011009910010010000100000000011171701600198010100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475210200182003720037184280618740101002001000820020016200372003711102011009910010010000100000050011171801600198000100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475210200182003720037184280618741101002001000820020016200372003711102011009910010010000100000000011171801600198000100001002003820038200872003820038
102042003715000082196862510100100100001001000050028475210200182003720037184280718740101002001000820020016200372003711102011009910010010000100000010011171801600198640100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475210200182003720037184280618740101002001000820020016200372003711102011009910010010000100000000011171701600198000100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475210200182003720037184280618759101002001000820020016200372003711102011009910010010000100000000011171801600198010100001002003820038200382003820038
102042003715000061196862510100100100001001000050028475210200182003720037184280718740101002001000820020016200372003711102011009910010010000100000050011171701600198010100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100242003715008919686251001010100001010000502847521020018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820086
1002420083150023519686251001010100001010000502847521020018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443031876710010201000020200002003720037111002110910101000010001000640316331978610000102003820038200382003820038
1002420037150032419686251001010100001010000502847521020018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443031876710010201000020200002003720037111002110910101000010100000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521020018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521020018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102018020038200382003820038
1002420037149036219686251001010100001010000502848235120018200372003718443031876710010201000020200002003720037111002110910101000010000000640316331978610000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ins v0.h[2], v8.h[1]
  movi v1.16b, 0
  ins v1.h[2], v8.h[1]
  movi v2.16b, 0
  ins v2.h[2], v8.h[1]
  movi v3.16b, 0
  ins v3.h[2], v8.h[1]
  movi v4.16b, 0
  ins v4.h[2], v8.h[1]
  movi v5.16b, 0
  ins v5.h[2], v8.h[1]
  movi v6.16b, 0
  ins v6.h[2], v8.h[1]
  movi v7.16b, 0
  ins v7.h[2], v8.h[1]
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020420077151001132580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119416200621600001002006620066200662006620066
16020420065150001382580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515100292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000922580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066
1602042006515000292580116100800161008002850064019602004402006520065612801282008002820016005620065200651116020110099100100160000100000011110119016200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024200691510000000442580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100000000100263114242112332004215160000102004620046200462004620046
160024200451510000000442580010108000010800005064000011200892013220045321800102080000201600002004520045111600211091010160000100000000100273114202112342004215160000102004620046200502004620046
160024200451500000000442580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100000000100273115204111442004215160000102004620046200462004620046
1600242004515000000003702580010108000010800005064000011200262004520045321800102080129201600002004520049111600211091010160000100000000100273114202110542004215160000102004620046200462004620046
160024200451500000000442580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100000000100283115202110452004215160000102004620046200462004620046
160024200451500000000442580010108000010800005064000011200302004520045321800102080000201600002004520045111600211091010160000100000000100253115242113322004215160000102005020050200462004620046
1600242004515000100001512580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100000030100266124244216442005030160000102004620046200462004620046
1600242004515000000006142580010108000010800005064000011200262004520045321800102080000201600002004520049111600211091010160000100000000100283112202110442004215160000102004620046200462004620113
1600242004515000000001572580010108000010800005064000001200262004520049321800102080000201600002004920049111600211091010160000100000000100273214202213432004215160000102004620046200462004620046
160024200451500000000442580010108000010800005064000011200262004520045321800102080000201600002004520045111600211091010160000100000000100273114202111432004215160000102004620046200462004620050

Test 5: throughput

Count: 16

Code:

  ins v0.h[2], v16.h[1]
  ins v1.h[2], v16.h[1]
  ins v2.h[2], v16.h[1]
  ins v3.h[2], v16.h[1]
  ins v4.h[2], v16.h[1]
  ins v5.h[2], v16.h[1]
  ins v6.h[2], v16.h[1]
  ins v7.h[2], v16.h[1]
  ins v8.h[2], v16.h[1]
  ins v9.h[2], v16.h[1]
  ins v10.h[2], v16.h[1]
  ins v11.h[2], v16.h[1]
  ins v12.h[2], v16.h[1]
  ins v13.h[2], v16.h[1]
  ins v14.h[2], v16.h[1]
  ins v15.h[2], v16.h[1]
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01) | cycle (02) | 03 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602044006030000053002516023310016000810016002050012801320054001940038400382003014620016160120200160032200320064400394003911160201100991001001600001000000111101185000160210040035001600001004003940113400394011340039
160204401123000005202516010810016000810016002050030790670154001940038400381997714619989160120200160032200320064400394003811160201100991001001600001000000111101185000160010040035001600001004005840133400394003940039
160204400383010019290251601081001600081001600205001280132005400194013240038199770619989160121200160032200320064400384003811160201100991001001600001000000111101180000160000040035001600001004013340039401334003940039
16020440132300003151168251601831001600081001600205001280132005401134003840057199770619989160120200160032200320064400384003811160201100991001001600001000000111101185000160000040035001600001004011340039400394003940039
160204400382990019473251601081001601331001600205001280132010400384005740038199770620009160120200160032200320064400384003811160201100991001001600001000000111101180000160000040035001600001004003940039400644003940039
16020440038299000390251601991001600081001600205001280132005400194003840132199770620057160120200160032200320064400384006411160201100991001001600001000000111101185000160000040035001600001004005940039400394003940058
160204400383000073290251601081001600171001600205001280132015400194003840039199770620008160120200160032200320064402204006411160201100991001001600001000000111101185000160010040036001600001004003940039400394003940039
16020440038300000290251601081001600081001600205003079067005400194003840132200390619989160120200160032200320064400394003811160201100991001001600001000000111101185000160000040035001600001004005840039400584003940113
1602044003829900133290251601081001602391001600205001280132010400384003840038199770620008160120200160032200320064400384003811160201100991001001600001000000111101180000160000040054001600001004003940056400394003940113
16020440112300003222002516010810016000810016002050012801320104001940038400381997706199891601202001600322003200644003840087111602011009910010016000010000015111101185000480000040035001600001004003940039400394033240039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | 1e | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | st unit uop (a7) | l1d cache writeback (a8) | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002440078300086045702516007010160060101600005012800001154001940038400842001503201041600102016000020320000401264003811160021109101016000010000100228212116211182140081218160000104003940085400394008540039
160024400383000006702516001010160000101600005055161001154002040084400381999603200181600102016000020320000400904003811160021109101016000010000100228212417422221940035208160000104003940040400884006540059
1600244003830006006702516001010160000101600005013199981154001940038400391999603200181600102016000020320000400554003811160021109101016000010000100248212416211181840109208160000104003940085400394008540039
1600244003830030045702516007010160060101600005030789351154004540084400381999703200671600102016000020320000400954003811160021109101016000010000100228211816121201840081208160000104004040039401134003940039
16002440038300010606714925160010101600681016000050128000011540019400754003819996032004516011820160000203200004004340038111600211091010160000100011002282117162111820400352016160000104003940076400584008840085
1600244008730000067392516001110160023101600005012800001154001940038401431999603200181600102016000020320000400904003811160021109101016000010000100228211416211171540035218160000104011340039400394003940039
16002440038300045067392516005510160045101600005012800001154001940058400842001503200641600102016000020320000400394003911160021109101016000010000100228311616222181540054418160000104003940113400394014440040
160024400393000014677025160070101600601016000050128000001540019400384008420015032006416001020160000203200004004140038111600211091010160000100001002283118161111821401092016160000104003940076400394011340113
16002440038300000235705016007810160001101600005012800001154001940038400841999603200641600102016000020320000400594003811160021109101016000010000100228211616212202240081207160000104004040039401134003940113
1600244011230006006702516001010160000101600005055161000154006540038401121999603200181600102016000020320000400724003811160021109101016000010000100228322016122191940035218160000104003940113400394011340039