Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, uxtw, 32-bit)

Test 1: uops

Code:

  ldr w0, [x6, w7, uxtw]
  // Initialization (setup only; not part of the unrolled, measured code above):
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | 92 | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
10054033110110661033882771925100010001000155554034032253261100010002000403851110010100010000101921431059006010396158431927311611400131351000404404404403404
10044033111100670023883771925100010001000155554034032253261100010002000403851110010100010000101919431059006010406159431917311611403131351000404404404404403
10044033101011671023883772025100010001000155554034032253261100010002000404851110010100010000101919431059106010396159431917311611400131351000404404404403404
10044032111000930033883771925100010001000155254034032253260100010002000403851110010100010000102120431059106010406160431907311611400131351000404403404404404
10044033110100670033883771925100010001000154804034032253260100010002000403851110010100010000102020431058106010406158431927311611399131351000404404404404404
10044033100100660023882772025100010001000155064034032263261100010002000403851110010100010000101919431059006110406158431907311611400131351000403404404404404
10044033111100660023882772025100010001000155064024032253260100010002000403851110010100010000102019431059106110396159431917311611399131351000403404404404404
10044033101100660023872771925100010001000154804054032253261100010002000403851110010100010000101919431058006010406160431907311611400131351000404404404403404
10044083100000760023883771925100010001000155244114032283261100010002000403851110010100010000101919431059106110406158431927311611400131351000404403404404404
10044034101000660023882771825100010001000155244024032253261100010002000403851110010100010000101920431059106010406158431907311611400131351000404404404403404

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  // Initialization (setup only; not part of the unrolled, measured code above):
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057005152500101101007003669782597102540104301031000130100100006161753342254149669717009270051646313649524010030200100006020020000700573511402011009910010000301001000001001000001100000001000010100026101711169798300031001010000301007005270052700527005270052
4020470051524000000011170042697885970125401043010610002301001000061606833417691496697770060700416465336494440100302001000060200200007006235114020110099100100003010010000010010000011000000010000101000261017111698143000610101010000301007005870058700587005870042
4020470057525100100021070036697825971025401043010310001301001000061601433422541496697170051700516464736493840100302001000060200200007003535114020110099100100003010010000010010000011000000010000001000261017111697983000010101010000301007005270052700527005270052
4020470051525000000001170042697885971625401083010310001301001000061606833425421496697770057700576463736496040100302001000060200200007006035114020110099100100003010010000110010000011000000010000101000261017111698143000610101010000301007004270058700587005870058
402047005752511100002007003669782597102540104301121000030100100006160143342254149669717005170051646473649464010030200100006020020000700513511402011009910010000301001000001001000210100010211000001012026101711169820300031001310000301007005270036700527005270036
40204700515250000000110700206978259710254010030103100003010010000616014334225414966971700357003564631364954401003020010000602002000070052351140201100991001000030100100000100100012010002001100001111202610171116980430000001010000301007003670052700527005270036
402047003552500001000017002669702597162540108301061000230100100006160783342542049639317005170035646473649544010030200100006020020000700513511402011009910010000301001000001001000131100010011000011111026101711269798300061001010000301007005270036700367003670036
4020470035524000000010170042697885970125401083010610002301001000061607833425420496697770057700416465336496040100302001000060200200007005735114020110099100100003010010000010010000011000000010000001000261017111698143000310101010000301007005570052700367005270036
4020470061526000000011070020697825969525401043010010001301001000061617533414700496697170051700516464736495440100302001000060200200007003535114020110099100100003010010000010010001111000100110000011100261017111698203000310101010000301007003670052700527003670036
402047005152400001101007004069782597102540100301031000130100100006161753341470049669717003570035646313649544010030200100006020020000700353511402011009910010000301001000001001000001100000001000000100026101711169814300031001010000301007005870042700427005870058

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005352410010020101700446978059715254001830016100023001010000619212334235004966976700417005664674036496640010300201000060020200007004135114002110910100003001010000110100031010002001100001101000252017111699023000696010000300107004270054700577005770057
400247005652410110020000700416978059715254001430016100023001010000617036334176904966973700417004164674036496640010300201000060020200007005335114002110910100003001010000010100012010002001100001101000252017111698843000666910000300107005770057700577005770057
400247005652411000020000700416978059701254001430013100023001010000617036334249404966961700417005664674036498140010300201000060020200007005635214002110910100003001010000010100011110001001100001111100252017121698973000399910000300107004270042700547004270054
400247004152410100010001700416978059717254001830013100023001010057616995334249404966976700567004164659036498140010300201000060020200007005335114002110910100003001010000010100023110001011100000101000252017112699043000600010000300107004270057700427005470054
400247005652411010010000700386977759712254001830016100023001010000616995334249404966961700537005664671036498140010300201000060020200007004135114002110910100003001010000010100022110001001100001111000252017111698933000606010000300107005770042700427005770057
400247005652510000070000700266970259715254001430016100023001010000617009334249404966976700537004164674036498140010300201000060020200007005635114002110910100003001010000010100011110001121100001101000252017111698843000666910000300107005770042700427005770054
400247005652411110010000700266970259712254001430016100023001010000616995334235004966961700567005664659036498140010300201000060020200007005635114002110910100003001010000010100011010001001100000101100252017111698953000606010000300107006370049700547005470057
400247005352511100020000700416977759701254001830016100023001010000617009334249404966973700417005364671036498140010300201000060020200007004135114002110910100003001010000010100011110002001100000101200252017111698933000696610000300107005770054700547004270057
400247005352511000120000700386978059712254001430013100023001010000617036334176904966976700567005664674036498140010300201000060020200007005635114002110910100003001010000010100022110002101100001111100252017111698483000696010000300107005470057700577005770057
400247005652411110010001700386970259715254001430016100023001010000617036334176904966976700537005364674036498140010300201000060020200007005635114002110910100003001010000010100021110001001100001101000252017111699103000696910000300107005470054700577005770057

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  // Initialization (setup only; not part of the unrolled, measured code above):
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700475251111170020697815970625401043010010002302511000061617533422061496696770050700506464336495340100302001000060200200007003535114020110099100100003010010000010010004010000701000010000261017111697983000390610000301007004870036700487003670051
40204700475250000170020697815970925401003010310001301001000061600533422061496697070035700356464636495340100302001000060200200007004735114020110099100100003010010000010010000110000001000011000261017111697983000099010000301007005170036700517005170051
40204700475240004170035697645970925401043010310013302691000061600533422060496696770035700506464336495040100302001000060200200007005035114020110099100100003010010000010010000110000001000010000261017111698133000309610000301007005170051700367004870051
40204700505250001170032697645970925401003010310001301001000061601533414701496697070047700476464336495040100302001000060200200007003535114020110099100100003010010000010010000110000031000011000261017111697983000360010000301007005170051700367003670051
40204700355250001170035697355970625401043010310001301001000061601533420620496696770035700476464636495040100302001000060200200007003535114020110099100100003010010000010010000110000001000011000261017111698133000300910000301007005170036700487004870051
40204700505240101170020697645970925401003010310000301001000061617533420620496697070050700356464636495340100302001000060200200007005035114020110099100100003010010000010010000010000101000000000261017111698133000306010000301007005170054700487005170051
40204700505250001170020697645970625401043010310001301001000061600533422060496697070050700506464636495340100302001000060200200007003535114020110099100100003010010000010010000010000001000011000261017111697983000306010000301007005170051700367005170051
40204700505250001170020697815970925401043010310001301001000061601533420621496697070050700476463136495340100302001000060200200007005035114020110099100100003010010000010010000110000001000010000261017111697983000099910000301007003670051700367005170036
40204700355240111170020697815970925401043010310001301001000061601533422061496394370050700356464636495340100302001000060200200007005035114020110099100100003010010000010010000110000001000010000261017111698133000690010000301007005170051700367005170051
40205700355240011170035697645969525401043010310000301001000061617533414700496697070050700476463136495040100302001000060200200007005035114020110099100100003010010000110010000110000031000011000261017111697983000096610000301007003670048700517005170051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | 0e | 0f | 18 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40025700515240100011070020697435971325400143001310001300101000061699133423980496706670035700356465303649604001030020100006002020000700353511400211091010000300101000001010000110000001000010252011717769814300001301310000300107029270163700367003670036
4002470054525000000007003969778597132540014300131000130010100006170683342254049669817005170054646720364960400103002010000600202000070054351140021109101000030010100000101000011000000100000125209719969817300030131010000300107003670055700557005570036
4002470054525000000007003969743597132540014300131000130010100006170683342254049669557005470051646720364979400103002010000600202000070054351140021109101000030010100000101000011000000100001025208716669814300030131010000300107003670052700527005270055
4002470051525001001107002069778596952540014300131000130010100006170183341470049669747003570054646720364979400103002010000600202000070051351140021109101000030010100000101000001000000100001125206716869817300001301310000300107005270055700557005270055
40024700515240000010070020697765971025400143001310001300101000061709033414700496697470035700356467203649764001030020100006002020000700353521400211091010000300101000001010000110000001000000252087179697983000313131310000300107005570036700557003670060
40024700545250000010070039697785971325400103001310000300101000061701833423980496697470054700356465303649794001030020100006002020000700353511400211091010000300101000001010000110000001000011262077188702303000013131310000300107003670380700527005270400
4002470051525000001007003669743597102540014300101000130010100006170183342398049669717003570054646530364979400103002010000600202000070035351140021109101000030010100000101000011000000100001125206716969814300031313010000300107003670055700777005270055
40024700395250000000070020697435971025400143001310000300101000061706833422540496697470054700546467203649794001030020100006002020000700353511400211091010000300101000001010000110000001000001252097168698143000313131010000300107005270052700557003670055
40024700545250000000070020697785971325400103001310000300101000061699133414700496697170054700546466903649604001030020100006002020134700553511400211091010000300101000001010000010000001000010252067168698173000313101310000300107005570060700527005570052
40024700545250000061070039697785971325400143001010001300101000061701833423980496697170054700356467203649604021830020100006002020000700543511400211091010000300101000001010000110000031000011252087110669817300001301310000300107005270052700557005270055

Test 4: throughput

Count: 8

Code:

  ldr w0, [x6, w7, uxtw]
  ldr w0, [x6, w7, uxtw]
  ldr w0, [x6, w7, uxtw]
  ldr w0, [x6, w7, uxtw]
  ldr w0, [x6, w7, uxtw]
  ldr w0, [x6, w7, uxtw]
  ldr w0, [x6, w7, uxtw]
  ldr w0, [x6, w7, uxtw]
  // Initialization (setup only; not part of the 8 unrolled, measured loads above):
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802052672820000011670001267162010258010010080000100800135001166836149236392673126731166556166888011520080024200160048267366411802011009901008000010080000100800212043800581002180000613944001115118016002673400130800001002673826738267382673726715
8020426736200111000100026716210212580100100800001008001550011683801492365626736267141666361668980116200800242001600482673685118020110099010080000100800001008001919438005910160800400139430011151180160026724014144800001002673226708267322673226732
8020426731201000006710002672107719258010010080000100800155001169949149236272673126707166596166798011520080024200160048267317711802011009901008000010080000100800212043800610006180040613943001115118016002672400147800001002670826732267322672826732
802042673120000010670002267212701925801001008000010080016500116829314923668267072673116655616683801132008002420016004826731771180201100990100800001008000010080020224380059110608004061044001115118016002672801407800001002673226708267282672826732
8020426731200000000000126692212119258010010080000100800155001166596149236272670726707166356166598011520080024200160048267315611802011009901008000010080000100800201943800590002180039610440011151180160026724010140800001002673226732267082673227003
80204267412000000044100126716201192580100100800001008001550011671981492365126707267311665961668380115200800242001600482671077118020110099010080000100800001008001919080059111678004061614319011151180160026711013130800001002673726715267372673726715
80204267372001010045100126720011279258010010080000100800155001166596149239492703526707166596166618011420080024200160048268837711802011009901008000010080000100800192008005810160800406119431901115118016002673301305800001002674126737267372673726737
8020426736200101114400012671601019498010010080130100800155001165620149236362674226714166646166668011520080024200160048267368511802011009901008000010080000100800212043800590106480039610440011151180160026728014144800001002673226728267282673226732
802042673120000011211000267220702025801001008000010080015500116683614923651267312671216663616687801152008002420016004826731771180201100990100800001008000010080019204380059000648000061194319111151180160026716013135800001002673726737267152673726715
8020426736200110001200002671621102580100100800001008001350011671981492365726714267361664461666680291200802212001600482674085218020110099010080000100800001008001920438001900060800406159431911115143016002691311300800001002673726738267382674126737

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267222000145122669321812122580010108000010800005011688431049236482670826727166723167088001020800002016000026727721180021109108000010800001108000039800350358000061350050200071675267190106080000102670926709267282670926709
8002426728200000012671321212162580010108000010800005011668960049236282672826708166723167088001020800002016000026727711180021109108000010800001108000039800390398003961393905020008167726719066280000102672326728267092670926723
800242670820000410026712212120258001010800001080000501166750004923628267272672816672316707800102080000201600002670856118002110910800001080000110800000800390358000001353905020005168726705006480000102672926709267282672826723
8002426708200000112670721818122580010108013010800005011676050049236422672826727166723167028001020800002016000026725711180021109108000010800000108000039800390358003960354305020007165526719066280000102672826728267282672826709
8002426727199004502266932121252580010108000010800005011688431049236282672726727166723167078001020800002016000026728711180021109108000010800000108000008000003980000013543050200071678267190106080000102671226729267292670926728
80024267082000045122670701812122580010108000010800005011669931049236282672826722166673167078001020800002016000026708711180021109108000010800000108000039800390398003961039050200071655267250106280000102672326728267282670926729
800242672820000411226707212016258001010800001080000501166886104923628267282672816672316707800102080000201600002672756118002110910800001080000010800003980000035800396139390502000416592670501010080000102672626723267092672926709
80024267082000045002671121818025800101080000108000050116884310492364826708267081667231668880010208000020160000267417111800211091080000108000001080000398003903580039603543050200091677267240010080000102672326728267292672926723
8002426728200004102267070120162580010108000010800005011667500049236422672226709166523167088001020800002016000026727561180021109108000010800000108000039800000080039613900502000716872670501010280000102672326723267092672826729
800242672820000450026712218002580010108000010800005011722400049236472672826727166723167028001020800002016000026727711180021109108000010800000108000039800390358003960350050200071651026725006480000102672926728267282672826729