Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (unsigned offset, 64-bit)

Test 1: uops

Code:

  ldrsh x0, [x6, #8]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0f1e22233a3f4346494f51schedule uop (52)schedule ldst uop (55)dispatch ldst uop (58)simd uops in schedulers (5a)606d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)f5f6f7f8fd
1005399311006510136621818162510001000100015604139939822132561000100010003818111100110001000110192042105700021103860194219173216223969921000383399400382382
1004381311006500038421818592510001000100015362038138222132571000100010003818111100110001000010202042106610059103701194219173216223969021000400382400382400
100439921000650003842018162510001000100015375138238122132561000100010003998211100110001000010212040105700021103801574219073216223799021000400399400400400
10043813101065002366018012510001000100015318038139922232571000100010003988111100110001000110191942101900121103761194219073216223959921000400383399400383
100438221000210033832180162510001000100015328038239922132571000100010003816411100110001000010212042105700059103860194219173216223960921000400401382410383
10043983111065103384201820251000100010001445603993992213256100010001000399811110011000100001019204210190005910380157019073216223969921000400382382382400
10043993101065002385000162510001000100015320039939922132571000100010003998111100110001000010192042105700059103861574219073216223789921000399382400400400
100439931000650023840181816251000100010001536204083982213242100010001000398641110011000100001020194210191002110006157019173216223959901000399400399400383
10043982100066100383001816251000100010001453303993992213257100010001000399811110011000100001019190105811060103860564219073216223959921000400400399400383
1004398311012100338401818125100010001000153570398400221325710001000100038164111001100010000102020010570002110380057019073216223959921000400399400401400

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh x0, [x6, #8]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f1e233a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
402057005252500201110170045697915971925401043010610002301001000061609533426860496698007006070078646583649634010030200100006020010000700573511402011009910010000301001000011001000111100010111000011111261027111698203000610101010000301007006170061700427006170042
4020470057525111000100700396976459713254010430103100013010010000616041334147004966974070054700546465036495740100302001000060200100007003535114020110099100100003010010000010010000011000000010003101002610171116981730003100010000301007005570036700607003670036
402047005452400000010070020697825971025401043010310001301001000061617533423981496697107005470051646473649574010030200100006020010000700353511402011009910010000301001000001001000000100000001000010000261017111698143000310101310000301007003670055700557005270052
4020470051525000000100700206978559713254010430103100013010010000616175334147014966955070054700356463111651044010030200100006020010000700513511402011009910010000301001000001001000001100010001000000100261017111698173000310101310000301007005570055700527005270055
40204700545240000001007003969785597132540104301031000130255100006160413342398049669740700357005464631364938401003020010000602001000070054351140201100991001000030100100001100100000110000000100001000026101711169817300030101310000301007003670055700367005570036
4020470051524000000100700206978259713254010030103100013010010000616175334239804966955070054700356463136493840100303981000060200100007003535114020110099100100003010010000010010000011000000010000100002610171116979830003100010000301007005570055700557005570055
40204700545240000001007002069785596952540104301001000130100100006161753342254149670300700547005464650364954401003020010000602001000070051351140201100991001000030100100000100100000110000000100001010026101711169814300091013010000301007003670052700527005570052
402047018852400000013107003669764597132540104301031000030100100006161753342398149669740700357005164650364957401003020010000602001000070035351140201100991001000030100100000100100000010000000100001010026101711169798300001013010000301007005570055700557005570052
402047005452400000011070020697855971325401043010010001301001000061617533423981496697407005470054646503649384010030200100006020010000700353511402011009910010000301001000011001000001100010031000010100261017111698173000310101310000301007005570060700557005570036
40204700545250000101107004569764597132540100301031000130100100006160863342398149669710700517005464650364957401003020010000602001000070035351140201100991001000030100100001100100000010000000100001010026101711169817300001001310000301007005570055700557003670052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0057

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)18191e22233a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
4002570057524111002101700426978559716254001830016100023001010000617045334254214966980700607005764678036498540010300201000060020100007006035114002110910100003001010000110100012110001000110000111110252057111698203000310101010000300107005970061700617005870058
4002470057525110002001700456977759719254001830016100023001010000617045334254214966980700607005764675036498540010303471005560020100007006035114002110910100003001010000110100021010001000110000111120252027111698233000313101010000300107006170061700587005870061
40024700415251110021017004269702597162540018300161000230010100006170723342542149669777006070060646750364982400103002010000600201000070057351140021109101000030010100001101000311100020011100001111102520171116980430003010010000300107005870058700617009970058
4002470057524110002001700426970259716254001830013100013001010000617072334268614966977700577005764675036496640010300201000060020100007006035114002110910100003001010000110100021110002002110000111110252017111698233000613101310000300107005970061700587005870058
400247006052510100210070045697845971625400183001610002300101000061707233425421496697770060700606467803649824001030020100006002010000700413511400211091010000300101000001010003301000100111000001111025201712269820300031010010000300107006670058700587005870042
4002470041525111002101700426978159716254001830016100023001010000617072334254214966977700577006064678036498240010300201000060020100007006035114002110910100003001010000010100011110002000110000111110252017111698233000610101010000300107006670042700587005870061
4002470057524110002000700456978359716254001830016100023001010000617045334254214966977700417006064678036498540010300201000060020100007004135114002110910100003001010000110100012110001001110000110100252017111698233000613131010000300107006170061700617006170058
4002470060524100002100700456978459719254001830016100023001010000617045334254214966977700577005764659036498240010300201006560020100007005735114002110910100003001010000010100011110002001410000111110252027111698203000313101110000300107012570065700617005870058
400247005752410000200170042697815976925400183001610002300101000061707233426861496696170041700576467503649664001030020100006002010000700603511400211091010000300101000011010003111000201111000011111025201711169823300061001310000300107006070061700427005870061
4002470060524100002100700456970259716254001830013100013001010000617045334268614966961700417006064675036498240010300201000060020100007004335114002110910100003001010000010100022110002010110000010100252017111698043000610101310000300107006170061700587005870061

Test 3: throughput

Count: 8

Code:

  ldrsh x0, [x6, #8]
  ldrsh x0, [x6, #8]
  ldrsh x0, [x6, #8]
  ldrsh x0, [x6, #8]
  ldrsh x0, [x6, #8]
  ldrsh x0, [x6, #8]
  ldrsh x0, [x6, #8]
  ldrsh x0, [x6, #8]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)18191e22233a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)a5ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
802052672820000100021003267252772025801001008000010080000500116675804923656026737267361665931669480100200800002008000026736851180201100990100800001008000001008002220430800591016180040615901925110116112678913135800001002673826715267372673726737
8020426736200110100670032672137920258010010080000100800005001167224049236570267152673616659316694801002008000020080000267368511802011009901008000010080000010080020214308001900261800406119431905110116112673913135800001002694426747267372673726737
8020426736200110000660022672137720258010010080000100800005001164560049236560267142673616637316695801002008000020080000267368511802011009901008000010080000110080019194308005810160800006159431905110116112673913135800001002673826738267162673726737
80204267362001000006700326721377202580100100800001008000050011671011492365602673626736166373166958010020080000200800002673685118020110099010080000100800000100800191900800591013880000613944005110116112679610107800001002673226728267082672826732
8020426731200000000441012671221119258010010080000100800005001165556049236510267272673116650316689801002008000020080000267317711802011009901008000010080000010080000043080168020080038613944005110116112677814144800001002670826708267282673226732
80204267272000000004500126716201192580100100800001008000050011671271492362702672726727166303166898010020080000200800002673177118020110099010080000100800000100800000430800380003880038613944005110116112671214104800001002670826708267322672826732
80204267322000000004400126716211219258010010080000100800005001167231149236510267272670716650316685801002008000020080000267077711802011009901008000010080000010080000043080039000388003961394400511011611267341407800001002672826708267082672826708
8020426707200000000450012671221016258010010080000100800005001168312149236510267312673116654316689801002008000020080000267318011802011009901008000010080000010080000043080038000388003861044005110116112673414107800001002673226728267282673226728
80204267072000000000000267122112192580100100800001008000050011671271492365102673126731166543166898010020080000200800002673177118020110099010080000100800000100800000430800390003880038613944005110116112673414107800001002673626728267282672826732
80204267312000000000001267162121192580100100800001008000050011655561492365102672726731166503166898010020080000200800002670777118020110099010080000100800000100800000430800380003880038613944005110116112672810100800001002672826728267282672826732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01)cycle (02)030e0f18191e22233a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acafb5b6bbl1d cache miss ld nonspec (bf)branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
8002526737200100041001266962000258001010800001080000501166750149236472672226722166673166888001020800002080000267887131800211091080000108000011080000398003501358000060350050203162226739106280000102670926723267282670926709
800242670820000004100126707012180258001010800001080000501167605149236422672226722166673167028001020800002080000267085611800211091080000108000001080000398000000080000613539050202162226735100080000102682126723267232672326723
80024267082000000451002671320181625800101080000108000050116675014923642267222672216672316688800102080000208000026722711180021109108000010800000108000039800000035800000103905020216222672866080000102672826709267282672326723
80024267082010000000026707000162580010108000010800005011668961492362826722267081667231668880010208000020800002672271118002110910800001080000010800000800350035800350135005020216222672466280000102670926709267092672326709
8002426727200000041100267130012122580010108000010800005011667501492364226708267281666731668880010208000020800002672256118002110910800001080000010800003980039003580039600005020216222670806080000102672926709267092670926709
8002426722200000000012670730180258001010800001080000501172240149238072672226722166523167028001020800002080000267287111800211091080000108000001080000398003500358000061039050202162226724010480000102672826729267232672326723
80024267082000000010026707201202580010108000010800005011667500492362826722267221666731670280010208000020800002672272118002110910800001080000010800000800000008000000353905020216222687500080000102672326727267232672326709
800242670820000004500126712212181625800101080000108000050116675004923642267082672816652316702800102080000208000026722711180021109108000010800000108000039800350039800366035005020216232672166280000102670926729267232672326709
8002426708200100001002671201801225800101080000108000050116675004923642267222672216668316702800102080000208000026708561180021109108000010800000108000039800000035800356035005020216232672406280000102672926709267092672326723
80024267082000000410022669300011258001010800001080000501167605149236282672726727166673167078001020800002080000267227111800211091080000108000001080000398000000398003961043050202162326709610280000102672326709267282672326723