Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDNP (32-bit)

Test 1: uops

Code:

  ldnp w0, w1, [x6]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ld nt uop (e6) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200540330014510137921121925100010001000153893983989631271000200010003943511100110001000010004310390611038613944731161139510381010410001000399399399399399
20044033000441013832111925100010001000151053983949631311000200010003983511100110001000010004310390391038613944731161139110381414710001000399395395395399
20044003000440013832111925100010001000152883983989631341000200010003983511100110001000010004310380391039613944731161139510381414410001000399399399399399
20043983000441013832111925100010001000157803983989631271000200010003983511100110001000010004310380391038613943731161139510381414710001000399399399399399
200439430014400138321211925100010001000150143983989631311000200010003983511100110001000010004310380391039613943731161139110391410710001000399395395399399
2004394300045101379212121625100010001000152883983989631341000200010003943511100110001000010004310380381039613944731161139110381414710001000399399404399399
200439830004600138321211925100010001000152463983989631311000200010003943511100110001000010004310380391039613944731161139510381410410001000395395395399399
2004402300045001379212121625100010001000152884073989631311000200010003983511100110001000010004310380391039613943731161139110391010410001000395395395395395
200439430004500138321116251004113010001528439939896181311000200010003943511100110001000010004310380391038613944731251139110381410710001000399399399399399
2004419301020110138321211925100010001000152843943989631271000200010003983511100110001000010004410380391038613944731161139510381410710001000399399399399399

Test 2: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldnp w0, w1, [x6]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ld nt uop (e6) | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205700515240000000020100700396969859703254010430103100023010010000613888334225049669790700357005463407363695401003020020000602001000070054371140201100991001000030100100001100100014110002040111000010100261027811698081000130003010010000401007005870061700617006170058
502047006052411100000600007003669698596842540108301061000230100100006136463342394496697107005170054633883636954010030200200006020010000700513711402011009910010000301001000001001000001100010531211000011011261016411698091000130006010010000401007004270042700617006170061
502047006052510100000200007004569725597092540104301061000230100100006139423341607496697407005470054634073636954010030200200006020010000700543711402011009910010000301001000001001000001100000340310000100002610164116981710000300030101310000401007003670052700557003670060
502047003552400000000000007003669719596842540104301031000130100100006136523341304496697107005170054634073637114010030200200006020010000700543711402011009910010000301001000001001000001100000530010000101002610164116979810000300031301310000401007005570052700557003670076
502047010052500000000000007002069719597032540104301031000130100100006139303342250496697407005470035634073637144010030200200006020010000700543711402011009910010000301001000001001000001100000100100000010026101641169814100003000310101010000401007005570036700367003670094
502047003552400000000100007003969719597002540100301031000130100100006139303342250496697107005470035634043637144010030200200006020010000700353711402011009910010000301001000001001000110100010591110000011112610178116982310001300031310010000401007006170042700617004270123
50204700605241011000020000700266972559706254010830106100023010010000613646334253849669613700547005463407363695401003020020000602001000070054371140201100991001000030100100000100100000010000070310000100002610164116982010000300000101310000401007005570055700367005570060
5020470051525000000006010070042697115968825401083010310002301001000061364633416074966980070041700606341036368740100302002000060200100007006037114020110099100100003010010000010010002201000113511100000111126101641169823100013000613131310000401007006170061700427004270118
5020470057524111000002000070026697255970925401043010610002301001000061394233425384966980070057700606341036368740100302002000060200100007004137114020110099100100003010010000010010001101000113811100001111126101641169798100003000010131310000401007003670055700557005570107
50204700545240000100000100700206971959684254010430100100013010010000613888334225049669710700547003563407363714401003020020000602001000070054371140201100991001000030100100000100100000010000060121000000000261016411698141000030000001310000401007003670052700557005570056

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ld nt uop (e6) | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002570047525000000001000070032697195969225400103001010001300101000061499433418991049669707004770050634150363718400103002020000600201000070035371140021109101000030010100000101000001100000200100001000252102782269814100003000000910000400107003670048700487003670067
50024700505240000000010000700326971859689254001030010100013001010000615126334189910496695570050700356341503637154001030020200006002010000700503711400211091010000300101000001010000001000003000100000100252102782269817100003000300010000400107005170048700517004870101
5002470050524000000000000070032697145969225400143001310001300101000061510233418991049669677004770035634030363718400103002020000600201000070047371140021109101000030010100000101000000100000103100001100252102782269814100003000360910000400107005170039700517005170113
5002470047524010000001000070035697185968925400143001010001300101000061510233412951049669707004770035634030363682400103002020000600201000070047371140021109101000030010100000101000001100000100100001100252102992269801100003000366610000400107004870048700487004870073
5002470047525000000000010070020697185968925400103001310001300101000061499433418991049669557003570035634180363717400103002020000600201000070050371140021109101000030010100000101000000100000500100001100252102782269801100003000300610000400107003670051700517003670049
50024700475240000000010100700356971959677254001030013100013001010000615126334204610496697070047700356341503636824001030020200006002010000700473711400211091010000300101000001010000001000005900100001100252102782469801100003000360610000400107005170036700487004870037
50024700475240000000010000700206971959689254001030010100013001010000615102334204610496695570050700506341803636824001030020200006002010000700503711400211091010000300101000001010000011000005200100000000252102992269803100003000306010000400107003670092700997005770089
5002470050525000001001000070020697145967725400103001010001300101000061512633420461049669707003570050634180363682400103021520000600201000070050371140021109101000030010100000101000000100000209100001100252102992269817100003000366010000400107003670051700517005170083
50024700355250000000010000700326971459677254001430013100013001010000615102334204610496697070050700476341803637154001030020200006002010000700503711400211091010000300101000001010000011000003900100001100252102992269814100003000399910000400107005170036700487005170094
500247004752500000000100007003269714596772540010300131000030010100006151263341439104966967700477003563403036372140010300202000060020100007004737114002110910100003001010000010100000110000040024100001100252102792269801100003000300910000400107003670048700367004870056

Test 3: Latency 2->3 (with chain penalty)

Chain cycles: 3

Code:

  ldnp w0, w1, [x6]
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 0e | 0f | 18 | 1e | 1f | 22 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ld nt uop (e6) | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205700515240110101070042696985970025401043010310001301001000061393033413041496695507005170051634043637114010030200200006020010000700513711402011009910010000301001000011001000011000330100001100026103641169798100003000610101010000401007005270052700527005270052
50204700515250000100070036697155970025401173010310000301001000061393033452240496697107005170051634043636954010030200200006020010000700513711402011009910010000301001000001001000011000000100000000026101641169814100003000310101010000401007005270052700527005270052
50204700515240000101070036697155968425401043010310001301001000061365233421041496697107005170035634043637114010030200200006020010000700353711402011009910010000301001000001001000011000000100001100026101641169814100003000310101010000401007005270052700527005270055
50204700515240000001070036697155970225401043010310001301001000061393933413040496697107003570051634043637114010030200200006020010000700513711402011009910010000301001000001001000011000004560100000100027111641169814100003000310101010000401007003670052700527005270052
50204700515250100601070119697155970025401043010310000301001000061393033421040496697107005170035634043637114010030200200006020010067700543711402011009910010000301001000001001000011000010100001100126101641169798100003000310101010000401007005270052700527005270052
5020470051525000010107003669698597002540100301031000130100100006139303342104049669550700357005163404363711401003020020000602001000070035371140201100991001000030100100001100100000100000010000110002610164116981410000300030101010000401007005270052700367003670036
5020470035525000010107014269715597002540104301031000130100100006139393342104049669713700517005163404363711401003020020000602001006670107371140201100991001000030100100000100100001100001010000010002610164116979810000300031001010000401007003670036700367005270052
50204700355250000101070123696985970025401043010310001301001000061365233421040496697107005170035633883637114010030200200006020010000700513711402011009910010000301001000001001000001000000100001100026101641169814100003000610101010000401007005270052700527005270052
50204700515250000101070036697155970025401043010310001301001000061365233421041496697107005170051634043636954010030200200006020010000700513711402011009910010000301001000001001000011000000100001100026101641169814100003000310101010000401007005270036700527003670052
50204700515250100100070036697155970025401043010310001301001000061393033421040496697107003570051634043637114010030200200006020010000700513711402011009910010000301001000011001000011000003100001000026101641169814100003000310101010000401007005270052700527005270036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ld nt uop (e6) | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50025700515250000000100000701416972259710254001830016100023001010000615163334239804966977700577004163409363739400103002020000600201000070057371140021109101000030010100001101000121100020001100001101125201641369818100003000610101010000400107003670052700527003670052
5002470054525000000010100070125697155967725400143001310001300101000061514533412950496697170035700516341936371940010300202000060020100007005137114002110910100003001010000010100000110003000010000101002520178136981810000300061001010000400107005270052700527003670052
50024700515250000000001000700996971559693254001030013100013001010000614994334209504966971700517005163419363719400103002020000600201000070051371140021109101000030010100000101000001100000000100000010025203781169801100003000610101010000400107005270052700527005270036
5002470051525000000010000070154697155967725400143001010001300101000061514533420951496697170051700516340336371940010300202000060020100007003537114002110910100003001010000010100000110000000010000101002520178116981910000300061001010000400107003670052700367005270036
50024700515240000000101000701526971559693254001430013100013016410000615145334209504966971700517005163419363719400103002020000600201000070051372140021109101000030010100000101000001100000003100001000025203641169818100003000610101010000400107006270052700527005270052
50024700515250001000101000701296971959693254001030013100013001010000615145334209504966971700517005163419363719400103002020000600201000070051371140021109101000030010100000101000001100000000100000010025203781169818100003000610101010000400107005270054700527003670053
50024702395250001000101000700426981959693764001430013100003001010000615145334209504966971700517005163419363719400103002020000600201000070051371140021109101000030010100000101000001100000000100001010025201781169801100003000610101010000400107005270036700527005270052
50024700515240000000101000700426971959693254001030013100013001010000615145334209514966971700357005163403363719400103002020000600201000070051371140021109101000030010100000101000001100000000100000000025201781169818100003000610101010000400107005470052700527005470055
50024700515250000100101000700426971559693254001430010100013001010000615145334209504966971700517005163421363682400103002020000600201000070051371140021109101000030010100000101000021100000004541100001010025201781169818100183000310101010000400107005270052700527005270052
500247005152400000001301000700426971959695254001430010100003001010000615145334209504966971700357005663424363682400103002020000600201000070035371140021109101000030010100000101000321100010021100001101025201641369823100013000610101010000400107005270052700527005270052

Test 4: throughput

Count: 8

Code:

  ldnp w0, w1, [x6]
  ldnp w0, w1, [x6]
  ldnp w0, w1, [x6]
  ldnp w0, w1, [x6]
  ldnp w0, w1, [x6]
  ldnp w0, w1, [x6]
  ldnp w0, w1, [x6]
  ldnp w0, w1, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ld nt uop (e6) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160205267242010000000410001267142181802580100100800001008000050011705290492365226732267146637366908010020016000020080000267323511802011009910080000100800000100800000398003500038800356103900000511021622267298005799280000801002673326733267332673326733
16020426741202110111021100226799200122580100100800001008000050011701070492362726816267226645366808010020016000020080000267313511802011009910080000100800000100800000080035000388003561353900000511021622267298005799280000801002673326715267332673326715
16020426714200110000021100226789218181225801001008000010080000500117010704923642267222672266303668080100200160000200800002672235118020110099100800001008000001008000003980035000358000061353900000511021622267118005799280000801002673326733267332673326733
1602042671420111010002110022669921818142580100100800001008000050011688980492364226722267226645366808010020016000020080000267073511802011009910080000100800000100800000398003500008000061353900000511021622267298005799280000801002673326733267332673326733
16020426732200110000065000026713218181225801001008000010080000500117010704923642267222672266303668080100200160000200800002672235118020110099100800001008000011008000003980035000358003561353900000511021622267298005899280000801002671526733267152671526715
1602042673220010010006510022671321818122580100100800001008000050011688800492364226722267076645366808010020016000020080000267223511802011009910080000100800000100800000398003500035800356103900000511021622267298005799280000801002671526733267152673326733
1602042671420011000002110022670921818025801001008000010080000500117010704923642267222672266453666580100200160000200800002672235118020110099100800001008000001008000003980000000358000001353900000511021622267298005799280000801002673326733267332673326733
16020426732200100000065000226713018181225801001008000010080000500117010704923642267222671366303666580100200160000200800002672235118020110099100800001008000001008000003980035000358003561383900000511021622267298005799280000801002674526733267342673326734
1602042673221511000006501022671301818025801001008000010080000500117010709823642267222670766453668080100200160000200800002670735118020110099100800001008000011008000003980035000358000061353900000511021622267298005799280000801002673326715267332673326733
16020426732200100010065000226713218012258010010080000100800005001170107049236272672226722664536680801002001600002008000026707351180201100991008000010080000010080000008000000035800000135000000511021622267298005799080000801002673326715267332673326733

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 19 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ld nt uop (e6) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160025267292000104410126740212119258001010800001080000501168880492365102670726731667636711800102016000020800002673135118002110910800001080000010800000800380388003961394350205165526728800381410780000800102670826708267322670826708
160024267272010004400026692211162580010108000010800005011688804923627026731267076676367078001020160000208000026707351180021109108000010800000108000043800390080038600050203165426730800391410780000800102670926732267322672826708
16002427137200000000126716011192580010108000010800005011748874923651026707267316676366878001020160000208000026727351180021109108000010800001108000008000000800006138445020416632670480000140080000800102673226708267322673226729
1600242670720100001012671420019258001010800001080000501168880492365102672726727665336711800102016000020800002673135118002110910800001080000010800004380038008003861394450205165326707800391410080000800102673526708267082673226708
160024267312000004500126692212116258001010800001080000501168627492365102673126707667636687800102016000020800002673535118002110910800001080000010800004380038038800386139050205163526728800381410780000800102672826708267082673226732
1600242673120000001012671221119258001010800001080000501174887492362702670726731667636711800102016000020800002673135118002110910800001080000010800004380038039800396100502031635267288003800780000800102672826728267082673226708
160024267272000004400026716211025800101080000108000050116908549236270267312672766533668780010201600002080000267313511800211091080000108000001080000438003813880039613905020516352672480038010780000800102670826732267322673226708
1600242673120000001012669221120258001010800001080000501174628492364702673126727665336687800102016000020800002673135118002110910800001080000010800004380038041800386039445020516532670480038014780000800102670826708267322672826732
1600242673120000044100266922120192580010108000010800005011688804923651026707267076676367118001020160000208000026731351180021109108000010800000108000043800000418003861005020316532672880038140780000800102670826708267282670826708
1600242673120000000002671620119258001010800001080000501168880492365102670726707667636711800102016000020800002670735118002110910800001080000010800004380000038800386004450205163526704800381410480000800102673226708267082673226728