Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (unsigned offset, 32-bit)

Test 1: uops

Code:

  ldrsh w0, [x6, #8]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
100539921111006510338321818172510001000100015334139839922132561000100010003998111100110001000110192042105710059103861574219273316223969921000399400401399400
100439931101106510338321818152510001000100015357039939922132571000100010003998111100110001000110202042105700059103861574219173216223969921000399400400399403
100439931010106500338421818152510001000100015334039839922132571000100010003998111100110001000110202042105710059103761574219073216333969921000399400400400400
1004399311100065003384218181625100010001000153750399399221325710001000100039981111001100010001102020421057114158103861574119273216223969921000400399400400399
100439831001006500338421818162510001000100015375039939822132571000100010003998211100110001000110201942105700059103861574219173216223969921000400400400400400
100439931001006500238421818152510001000117815334039939822132571000100010003988111100110001000010201942105710159103961574219273216223959921000400400400399400
100439921101006500238421818162510001000100015362139939922432561000100010003998111100110001000110211942105710059103861574219273216223969921000399400400399400
100439921101006500338421818162510001000100015318039939922132561000100010003998211100110001000010201942105712159103861574219173216223969921000400400399400399
100439931110006500338421818162510001000100015362039939922232561000100010003998111100110001000110212040105700159103861574219073216223969921000400399400400400
100439931110006500338421818162510001000100015334139939922232561000100010003998111100110001000010192142105800159103861574218173216223969921000400400399400399

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6, #8]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057005752501000000010070036697825971025401043010310001301001000061604133423984966971700517005164650364954401003020010000602001000070054351140201100991001000030100100000100100000110000000100001100261027111698173000310101010000301007005670054700557005570036
402047003552600011001000070039697645973925401043010310000301001000061617533423984966971700357005164631364954401003020010000605341000070035351140201100991001000030100100001100100000110000000100001000261017111698173000313131310000301007005570055700557005270036
402047005452500000001000070020697645971325401003010310000301001000061604133423984966974700547005464650364938401003020010000602001000070054351140201100991001000030100100000100100000010000000100001100261017111698953000013131310000301007003670055700557005270055
402047005452500000001010070020697645971325401043010010001301001000061604133423984966971700547005464647364957401003020010000602001000070051351140201100991001000030100100000100100000110000000100001100261017111697983000310101310000301007005770055700377005870055
4020570038525000000010000700396978259710254010430100100013010010000616041334239849669717005170054646473649574010030200100676020010000700513511402011009910010000301001000001001000001100001001000010002610171116979830003010010000301007005270055700527003670055
402047005452500000000000070036697205971825401003010010000301001000061617533414704966974700547005464631364938401003020010000602001000070035351140201100991001000030100100001100100000110000000100000100261017111698173000313101310000301007003670055700557003670055
40204700545240000000001007003969782597132540100301031000130100100006160413342254496697170035700546464712649544010030200100006020010000700353511402011009910010000301001000011001000001100000001000000002610171116981730003013010000301007003670055700367005570055
402047005452500011001000070039697855969525401003010310001301001000061601433414704966974700547005164650364957401003020010000602001000070035351140201100991001000030100100000100100000110000000100001100261017111698173000313101310000301007005570055700367005270055
40204700545250000000100007003969785597132540100301031000030100100006160413342398496697470054700516465036495740100302001000060200100007005435114020110099100100003010010000010010000011000000010000110026101711169817300030101010000301007003670036700527003970055
402047005452400000000000070089698475971425401043011210001301001000061601433422544966974700357003564650364957401003020010000602001000070051351140201100991001000030100100000100100000010000000100001100261017111698173000313101010000301007003670055700527005270055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0060

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570060525111000200007008869866597082540018300161000230010100006170683342398049669717005470035646533649824001030020100006002010000700573511400211091010000300101000001010000011000000310000111112520117101212698043000313101310000300107008870116700587008870042
4002470060524100100100017004569783597252540018300131000230010100006169953341769049669617004170060646593649854001030020100006002010000700603511400211091010000300101000001010001411000200110000111102520771013669823300060131010000300107004270139701357006170061
4002470057524100010100017004269752597212540018300131000230010100006170723341769049669807004170060646783649854001030020100006002010000700573511400211091010000300101000001010001101000200110000110112520221280131269823300061301310000300107006170161701017005970058
40024700575251011002000070045698395971925400143001610001300101000061707233417691496697770064700576467836498240010300201000060020100007006035114002110910100003001010000010100013110002001100001101125201371013136982030006100010000300107005870145701147006270058
400247006052410010020100700456979259726254001830016100023001010000616995334268614966961700417006064678364982400103002010000603461005570060351140021109101000030010100000101000120100020111000011111252011710131169804300061301010000300107005870128700867006070042
40024700605251001001000070045697425972325400183001310002300101000061699533426860496698070060700416467836498240010300201000060020100007006035114002110910100003001010000010100042010001012211000011111252013710131369823300060131310000300107007170109701097006570495
400247004352410110010010170026698225977325400183001610002300101000061704533426861496696170060700576465936498540010300201000060020100007004135114002110910100003001010000010100013110003001100000111125201271051369823300061001310000300107015070079700677004670042
40024700575241010007000170042698165972225400143001610002300101000061699533417690496696170057700416465936496640010300201000060020100007006035114002110910100003001010000010100013110001001100001101225201371013136982330006130010000300107006170120700807006070058
400247006052411010020000700456983259726254001430016100023001010000617072334268604966980700607004164678364966400103002010000600201000070060351140021109101000030010100000101000120100020141000011010252013710111369823300061301310000300107006170061701017009870061
4002470060525100100201007004269783597032540018300161000230010100006170723341769049669777004170057646783649824001030020100006035610000700413511400211091010000300101000001010001211000211110000111112520127101213698043000613101010000300107015270076700617006970061

Test 3: throughput

Count: 8

Code:

  ldrsh w0, [x6, #8]
  ldrsh w0, [x6, #8]
  ldrsh w0, [x6, #8]
  ldrsh w0, [x6, #8]
  ldrsh w0, [x6, #8]
  ldrsh w0, [x6, #8]
  ldrsh w0, [x6, #8]
  ldrsh w0, [x6, #8]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526728200000004588112671221202425801001008000010080000500116723104923627267272672716650316809801002008000020080000267277711802011009910080000100800000100800004380039008003961394305110116132672410104800001002672826728267082672826728
80204267072000000045010267122120162580100100800001008000050011672310492364726727267071665015166858010020080000200800002672777118020110099100800001008000001008000043800390388000061394305110316312670410104800001002673026708267282672826728
80204267272000100045011267122121216258010010080000100800005001166525049236472672726727166503166858010020080000200800002672756118020110099100800001008000001008000043800000398003961394305110216132672410104800001002670826728267282670826728
8020426707200010004401026692201216258010010080000100800005001167127049236472672726727166503166858010020080000200800002672777118020110099100800001008000001008000043800380398000060394305110216132672410104800001002673226728267282670826732
80204267272000000045011267120120162580100100800001008000050011770380492362726727267071663031668580100200800002008000026727771180201100991008000010080000010080000438003903980039613900511041632267240104800001002670826728267322670826728
802042672720000000450112671921212025801001008000010080000500117703804923627267272670716650316685801002008000020080000267277711802011009910080000100800000100800004380060039800396139430511031632267240105800001002670826728267282670826708
80204267272000010044011267122112162580100100800001008000050011672310492364726727267271665031668580100200800002008000026727771180201100991008000010080000010080000438003903980039603900511031632267240104800001002672826728267322670826729
802042672720000000450012671200121625801001008000010080000500116723104923627267272670716650316685801002008000020080000267275611802011009910080000100800000100800004380000008004161394305110416522672410104800001002672826728267282670826728
802042672720000000000126692212121625801001008000010080000500117703814923627267272670716654316685801002008000020080000267077711802011009910080000100800001100800004380039039800396039430511041643267240104800001002673226708267082672826728
802042673120000000450012671221212162580100100800001008000050011657890492364726727267271663031668580100200800002008000026707771180201100991008000010080000010080000438003900800396004305110516422672410104800001002670826728267082672826728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526735200100000410022670721818222580010108000010800005011669930492364226722267081665231670780010208000020800002672271118002110910800001080000010800001939800360103980039613539502001016742671966280000102687426818267272672326731
800242670820000000045001266930181214258001010800001080000501167605049236422672226722166523166888001020800002080000267227111800211091080000108000011080000210800350003580039613539502004162426719106480000102686526846267352675426723
80024267222000000004110126707218181325800101080000108000050116675004923642267272672816652316711800102080000208000026727711180021109108000010800000108001919428005700039801696100502002164426725100280000102689626758267332685926723
80024267222000000004110126693218151625800101080000108000050116675004923642267082670816667316702800102080000208000026722561180021109108000010800000108000019080035000418003501353950200416242670566280000102698626750267362680426728
80024267222000000004500126693018121425800101080000108000050117224004923628267222672216667316702800102080000208000026722711180021109108000010800000108000021080035060358003960353950200416242672560480000102672326802268192674226742
80024267322001010004500126707218181811980010108000010800005011683270492364226708267251667231670780010208000020800002672271118002110910800001080000010800002139800350003580035003539502004164426719106480000102672326801268272673826737
8002426722200000000410012670721818225800101080000108019550116760504923628267222672216667316688800102080000208000026722711180021109108000010800000108000020398003503039800006000502002162426725106480000102672326788268382672926742
800242670820000000041001266932181818258001010800001080000501168843049236422672226722166723167088001020800002080000267277111800211091080000108000001080000213980035020398003501353950200316242672466280000102672326784268282673026728
800242672220000000001012669301218152580010108000010800005011676050492364726722267221666731670280010208000020800002672271118002110910800001080000010800002008000003035800356135395020041642267051010280000102687826763267902673626757
800242670820100000141000267150018552580010108000010800005011676050492364726722267081666731670280010208000020800002672871118002110910800001080000010800002139800350403580035013543502002162426719106280000102672326783268342671426736