Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (register, sxtw, 32-bit)

Test 1: uops

Code:

  ldrsh w0, [x6, w7, sxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | 92 | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
10054033000004510338837719251000100010001415939439421632551000100020003947711100101000100001000043103900391039613943007321622395101041000395399399399399
10043943000004400138321119251000100010001520839439822132561000100020003987711100101000100011000043103800391039613944007321622400131351000404404404404404
1004403311000660033883771925100010001000155554044032253261100010002000403851110010100010000102119431059116010406159431907321622400131351000404404403405403
1004403310100650033883772025100010001000155014064032253260100010002000403851110010100010000101919441059106010396160431927321622400131351000404404407404404
1004403310000670023873772025100010001000154804024032253261100010002000405851110010100010000102019431059116010406158431917321622400131351000404403403404404
1004406310100660023882771825100010001000155234034032253262100010002000403851110010100010000101920431058116110406159431917321622400131351000404404404404404
1004403311000671023883772025100010001000155554034032253261100010002000403851110010100010000101920431059116010406158431917321622400131351000404404404403404
1004403311100660033883771925100010001000155264044022253261100010002000402851110010100010000102021431059126110396159431917321622400131351000404403403404404
1004403210000690023883772025100010001000155064034032253262100010002000403851110010100010000101919451059126010396159431907321622400131451000404404405404404
1004403310000660033883771925100010001000155554034062253261100010002000403851110010100010000102119431059106010406159431917321622400131351000404404404404404

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570047524110007002069747597062540104301031000130100100006161753342062149669677004770047646310364938401003020010000602002000070047351140201100991001000030100100001100100001100000010000101261017111698133000366010000301007006770049700537005270048
4020470050524111007005869781597092540100301001000130100100006160053341470049669707003570035646430364938401003020010000605962000070050351140201100991001000030100100001100100000100000010000100261017111698133000306910000301007005370054700517005170051
4020470047524001007003269735596952540104301031000130100100006161753342206049669557004770050646310364953401003020010000602002000070050351140201100991001000030100100001100100000100000010000001261017111698133000396910000301007033170143700587008570039
4020470050524000107003569735597092540104301031000130100100006160053341470049669707004770050646310364953401003020010000602002000070050351140201100991001000030100100001100100001100000010000101261017111698103000390010000301007005170051700487004870048
4020470047525006107003269764597062540100301031000130100100006160053342062149669677004770047646430364953403143020010000602002000070050351140201100991001000030100100001100100001100000010000101261017111698103000366910000301007005270054700517005170051
4020470050525001007003269764597092540100301031000030100100006160053342206049669707004770050646460364950401003020010000602002000070035351140201100991001000030100100001100100000100001010000000261017111698133000366010000301007005370039700517005170036
4020470047525000107002069735596952540100301031000130100100006160153342062149669677004770035646890364938401003020010000602002000070050351140201100991001000030100100000100100001100000010000101261017111698133000096610000301007005070050700367004870048
4020470035525000007003569781596952540104301031000030100100006160053342206149669707004770051646460364963401003039810000602002000070047351140201100991001000030100100001100100001100000010000101261017111698103000066910000301007041570052700657003870048
4020470047524000007003269764596952540104301031000130100100006160053342206049669707005070050646310364938401003020010000602002000070050351140201100991001000030100100001100100000100000010000100261017111698103000396910000301007005470048700517004870036
4020470047524000007003269764597442540100301001000130100100006160053342062049669727004770047646430364950401003020010000602002000070050351140201100991001000030100100001100100000100000010000101261017111698133000390010000301007004870049700367004870051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | 0e | 1e | 22 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005152401070060697755971025400103001310000300101000061701833423980149669557005470054646723649764001030020100006002020000700513511400211091010000300101000001010000110000001000010000252037102269798300030101010000300107005570036700557003670036
40024700545250107004169779596952540014300131000030010101026170773342542014966955700547003564653364979400103002010000600202000070054351140021109101000030010100000101000011000000100001120025202710226979830003010010000300107003670036700527003670055
4002470054524061700206977859695254001430013100003001010000617068334225401496695570051700546467236497940010300201000060020200007005435114002110910100003001010000010100001100000010000010002520271022697983000313101310000300107005570055700567005270055
40024700355250117002069778597152540014300131000030010100006170683341470014966971700547005464653364960400103002010000600202000070054351140021109101000030010100000101000011000000100000000025202710226981430003130010000300107003670055700527005570036
4002470051543010700396974359695254001430013100043001010000617018334239801496697470054701506467236496040010300201000060020200007005435114002110910100003001010000010100001100000010000100002520271022698173000313131310000300107003670055700557005570052
4002470054525011700396974359713254001030013100013001010000616991334225401496697470035700546467236497640010300201000060020200007003535114002110910100003001010000010100001100000010000110002520371022698143000013131310000300107003670055700367003670036
400247005452401170039697785969525400143001310000300101000061706833422540149669747003570035646533649794001030020100006002020000700543511400211091010000300101000001010000110000001000010000252027102269814300030131310000300107005270052700527005570052
40024700355240007002069778597132540010300131000130010100006170183342398014966974700547005164653364979400103002010000600202000070035351140021109101000030010100000101000001000000100001100025202710226981730000001310000300107005570055700557005270055
4002470051525001700206974359695424001430013100013001010000617576334230200496697170054700356465336497940010300201000060020200007005435114002110910100003001010000110100001100000010000010002520271022698143000010101310000300107005570036700557003670036
4002470054525011700206977559713254001430013100013001010000617090334239801496697470054700356467236497940010300201000060020200007005435114002110910100003001010000010100001100000010000110002520371022698173000313131310000300107005570036700557005570055

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570145524100001010700366978959714254010430103100023010010000616014334225449669710700517005164647364954401003020010000602002000070051351140201100991001000030100100001100100001100000001000011261127133698143000310101010000301007005570054700537005270052
402047005152500000101070036697825971025401003010310000301001000061601433422544966971070035700516464736495440100302001000060200200007005135114020110099100100003010010000110010000110000000100001126113713369814300031010010000301007005670054700367003670052
4020470051524000004010700366978259710254010430103100013010010000616014334225449669710700517005164647364954401003020010000602002000070052351140201100991001000030100100001100100031100000001000011261127132698143000310101010000301007005570054700527003670052
4020470051524000001010700206978259710254010430103100003010010000616014334225449669710700517005164647364954401003020010000602002000070051351140201100991001000030100100001100100001100000001000011261137133698143000310101010000301007041270058700527007770052
4020470051525000001000700366978259710254010030103100013010010000616014334225449669710700517005164650364938401003020010000602002000070035351140201100991001000030100100001100100001100000001000011261137132698143000010101010000301007005970090700527005270052
4020470051524000001301070036697825971025401043010310001301001000061601433414704966971070051700516464736495440100302001000060200200007005135114020110099100100003010010000110010000010000003100000126112712369814300001010010000301007003970053700527005270052
402047005152400000100070036697825971025401043010010001301001000061601433422544966971070051700516464736495440100302001000060200200007005135114020110099100100003010010000110010000110000100100001126113713369814300031010010000301007008670083700567005270052
40204700515240000070007003669786597102540104301031000030100100006160143341470496697107005170051646473650134010030200100006020020000700513511402011009910010000301001000011001000011000000010000112611371326981430000100010000301007005470052700527005270052
402047005152400000100070036697825971025401043010310001301001000061601433422544966971070051700516464736495440100302001000060200200007005135114020110099100100003010010000110010000010000000100001126113713369814300031010010000301007041270053700527008470036
40204700515240011010107003669782597102540104301031000030100100006160143342254496697107005170051646313649544010030200100006020020000700513511402011009910010000301001000011001000011000000010000112611271326981430003100010000301007005470053700557005270052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0060

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40025700545240000001107004069775597102540010300131000230010100006169913342254496697170051700356473136496640010300201000060020200007005135114002110910100003001010000110100001100000010000100025209714869823300061001010000300107005270052700607005270036
40024700955240000001107003969778597132540014300131000030010100006169913341470496695570054700356465336496040010300201000060020200007005435114002110910100003001010000010100001100000010000101025203713669823300031301310000300107006170042700427004770061
400247004152411010020070020697785971325400143001310001300101000061699133422544966974700547005464672364960400103002010000600202000070054351140021109101000030010100000101000001000000100001010252057149698233000613131310000300107004270058700617005870042
40024700605251101001107003969743597132540010300131000030010100006170183341470496697470054700356465336497940010300201000060020200007003535114002110910100003001010000010100001100000010000101025204714969804300061001310000300107005870061700617006170061
40024700605251100002007002069743596952540014300131000130010100006170183342398496697170054700546467236496040010300201000060020200007005435114002110910100003001010000010100000100010010000101025208718369804300030131010000300107006170058700427006170042
400247006052411000020070039697785969525400143001310000300101000061701833423984966955700547003564669364979400103002010000600202000070051351140021109101000030010100000101000011000000100001010252077173698263000313131310000300107006170061700617006170061
400247006052511010020070039697815971325400143001310001301541000061701833423984966977700427005164655364976400103002010000600202000070035351140021109101000030010100000101000001000000100000010255487188698233000313131310000300107005870061700587004270061
40024700415251000002007003969743597132540014300131000130010100006170683342398496697470054700546466936496040010300201000060020200007011735114002110910100003001010000010100001100000310000011025208718869896300061010010000300107005870061700617004270058
400247006052511000021070039697785971325400143001310000300101000061701833423984966974700547005464672364960400103002010065600202000070054351140021109101000030010100000101000001000000100001000252087147698043000313131310000300107004270042700427004270061
40024700605241000012107008269778596952540014300131000130010100006170183342398496695570054700356467236497940010300201000060020200007005135114002110910100003001010000010100001100000010000101025207714469823300030101310000300107005870042700617006170061

Test 4: throughput

Count: 8

Code:

  ldrsh w0, [x6, w7, sxtw]
  ldrsh w0, [x6, w7, sxtw]
  ldrsh w0, [x6, w7, sxtw]
  ldrsh w0, [x6, w7, sxtw]
  ldrsh w0, [x6, w7, sxtw]
  ldrsh w0, [x6, w7, sxtw]
  ldrsh w0, [x6, w7, sxtw]
  ldrsh w0, [x6, w7, sxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802052672320000045000126712212121725801001008000010080015500116730314923647267272670716635616679801152008002420016004826707711180201100991008000010080000010080000391268000014398003960354311151182161126704604800001002677526730267122673426723
8020426727200011001012671200181825801001008000010080016500116787514923647267222672716650616659801152008002420016004826707711180201100991008000010080000110080000390800390358000061353911151181161126704062800001002677926739268742672826728
802042672220000041010126696212121625801001008000010080015500116659614923647267072670716635616674801142008002420016004826722721180201100991008000010080000010080000008003513980039603539111511811611267191062800001002679026733267912672526728
802042672720010041010226707012017258010010080000100800165001166596049236272670726707166506166788011520080024200161210267317111802011009910080000100800000100800000080035235800396100111511811611267241064800001002678126729267552673126723
8020426727200000450001267070001225801001008000010080015500117711604923627267222672216650616679801142008002420016004826707711180201100991008000010080000010080000008000003980035613539111511811611267041004800001002677126738267362672826708
8020426715200000001012670721212122580100100800001008001550011678751492381426722267071665061667980115200800242001600482672771118020110099100800001008000001008000039080039008003901394311151181161126704660800001002677526732268252670826708
8020426707200000410101266922001325801001008000010080014500116787504923627267272672716650616659801152008002420016004826707711180201100991008000010080000010080000390800390398003961350111511811611267241064800001002678526717267922670826708
8020426725200001450101267070120142580100100800001008001450011678750492364226722267221663561665980114200800242001600482672272118020110099100800001008000011008000000800390080000610391115118116112671910102800001002673426708267232672326723
8020426722201000450000267122180182580100100800001008001550011771160492364726722267221665061667480115200800242001600482670771118020110099100800001008000001008000039080039008003501353911151181161126719600800001002676426729267442673626723
80204267072000014500022673121201525801001008000010080014500116585604923642267312673116639616807801142008002420016004826722721180201100991008000010080000110080000390800390398003961039111511811611267191004800001002678026730267202672926728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267272001044101267132121216258001010800001080000501166750492364726728267311667231668880010208000020160000267317711800211091080000108000001080000430800390398003961394350202161226841010480000102670926729267292672826728
800242670820011451012669321212162580010108000010800005011672014923647267082672716672316708800102080000201600002672877118002110910800001080000010800004308003903880039613944502011622268451010480000102673226729267292673226732
80024267312000053101267122111625800101080000108000050116675049236482673126708166723167078001020800002016000026727771180021109108000010800000108000043080039038800396004350201161326710010080000102672926709267282672826709
800242672720010450012671321211925800101080000108000050116884349236472672726728166723167078001020800002016000026727561180021109108000010800000108000043080000038800406000502011611267241014780000102672926728267282670926728
8002426727200004500126713012121925800101080000108000050116688649236472673126708166723167078001020800002016000026708771180021109108000010800000108000043080039038800006139445020116112673900480000102672826728267282672926729
8002426728200000101266932001625800101080000108000050116675049236482670826728166723167078001020800002016000026727771180021109108000010800000108000043080039039800006100502021622268321010480000102670926728267282672826729
8002426728200004510126716212116258001010800001080000501168843492364826727267081667231670880010208000020160000267087711800211091080000108000011080000430800380080000603944502011622267241010080000102672826728267282673226729
8002526768200008700026712011219258001010800001080000501168843492364726728267281667231668880010208000020160000267287711800211091080000108000001080000440800380398003860394350201161126825100480000102672926729267322672826728
80024267312000000012671521116258001010800001080000501168843492362826727267281667631670880010208000020160000267277711800211091080000108000001080000008003904180000013944502011611267241014480000102673226732267092673226896
800242673120000450012671600121625800101080000108000050116750149236482672726727166523167078001020800002016000026708771180021109108000010800000108000000800000080000613944502011611268621410480000102672826732267282673226709