Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (register, 32-bit)

Test 1: uops

Code:

  ldrsh w0, [x6, x7]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 09 | 0e | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
10053943104511379212121625100010001000150180394394216325210001000200039477111001100010000100043103903910006139437321622391101041000395395395395395
10043943004511379212121625100010001000150180394394216325210001000200039477111001100010000100043103903910396139437321622371101041000395395395395395
10043943014511379212121625100010001000149890394394221325210001000200039456111001100010000100043103903910396139437321622391101041000395398395395395
1004394300450137921212162510001000100014989039439421732521000100020003747711100110001000010000103903910396139437321622391101041000395395395395395
100439430045013792121202510001000100015037039439419732521000100020003947711100110001000010004310390010396004373216223910041000395395395395395
10043943104511379212121625100010001000150370394394217325210001000200039456111001100010000100043103903910390139437321622391101041000395395395395395
10043943004510379212121925100010001000150181394394217325210001000200039477111001100010000100043103903910006139437321622391101041000395395395395395
100439420045113792121216251000100010001498913943942173252100010002000394771110011000100001000431039001039613907321622391101041000395395395395395
10043943004511379212016251000100010001501813943942173252100010002000394771110011000100001000431039001039613907321622391101041000395395395395395
10043943014511379212121625100010001000150371394394197325210001000200039477111001100010000100043103914010396139437321622391101041000395395395395395

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700515251000107003969764597102540104301031000230100100516211893341470496697470051700546465036495740100302001000060200200007005435114020110099100100003010010000010010000110000000100001010261017111698173000310101310000301007003670052700557005570055
40204700515240006107003669785597102540100301031000030100100006161753342254496697470054700516464736493840100302001000060200200007005435114020110099100100003010010000010010000110000000100001000261017111697983000310101310000301007005570036700557005570053
40204700515250005800700366978259710254010430103100013010010000616014334147049669717005470054646313649544010030200100006020020000700513511402011009910010000301001000001001000011000000010000001026101711169817300001301310000301007005270055700527003670036
4020470035524000110700396976459710254010430103100003010010000616014334225449669557005170051646473649544010030200100006020020000700513511402011009910010000301001000001001000011000000010000101026101711169817300031310010000301007005570055700527005270036
40204700545250001007003969785597132540104301171000130100101026185833342254496697470035700546463136495440100302001000060200200007005435114020110099100100003010010000010010000110000000100001010261017111698143000310101010000301007003670052700557005570052
40204700515250006107002069785597402540104301001000130100100006160143342254496695570051700356464736493840100302001000060200200007005135114020110099100100003010010000010010000010000000100001010261017111698143000313101010000301007005870057700537005570055
40204700515250000107003669764597102540104301031000130100100006160143341470496697170054700546464736493840100302001000060200200007005135114020110099100100003010010000010010000010000000100001010261017111698143000310101010000301007005270055700527005270036
4020470035525000100700366978259710254010430103100013010010000616041334225498669887005470051646313649544010030200100006020020000700513511402011009910010000301001000001001000001000000010000100026101711169814300031301310000301007005570055700557005270036
402047003552400001070020697855969525401043010310000301001000061604133422544966955700517005164647364938401003020010000602002000070054351140201100991001000030100100000100100001100000001000010102610171116981830003100010000301007005270036700527005570036
402047005152400010107003669785596952540100301001000030100100006161753342398496695570035700356463136495740100302001000060200200007003535114020110099100100003010010000010010000110000000100001010261017111698163000313101010000301007003670036700527005570036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40025700475251000001000700356976059706254001030013100013001010000616982334206204966967700507005064668364975400103002010000600202000070050352140021109101000030010100001101000011000000100001100252017112698133000096910000300107005170051700367005170036
40024700355240001001000700356976059695254001430010100013001010000616982334220604966955700507005064653364975400103002010000600202000070035351140021109101000030010100000101000011000000100001100252017111697983000390910000300107005170051700517003670051
40024700355250000006000700356976059706254001430013100013001010000616982334220604966970700507005064653364975400103002010000600202000070047351140021109101000030010100000101000011000000100001100252017111698133000396910000300107003670036700517005170041
40024700475250000006000700356976059712254001430010100013001010000617068334220604966955700507005064668364960400103002010000600202000070035351140021109101000030010100000101000001000000100001100252017112698133000396910000300107004870051700367005170051
40024700505250000001010700326972859706254002630013100013001010000619359334579404966973700507004764653364960400103002010000600202000070035351140021109101000030010100000101000011000010100001000252017112698103000366910000300107005170051700487003670051
40024700355250000000000700356976059709254001430013100013001010000617068334220604966970700507003564665364975400103002010000600202000070035351140021109101000030010100000101000011000000100001100252027121698133000399910000300107009370050700407005370048
40024700505240000000000700206974359695254001430013100003001010000616982334147004966970700507003564668364960400103002010000600202000070035351140021109101000030010100000101000001000000100000000252027111698133000366910000300107005170051700517005170051
40024700355240000001000700356974359709254001430010100013001010000616982334220604966970700507003564668364960400103002010000600202000070047351140021109101000030010100000101000011000000100001000252017111698133000396910000300107005170036700487004870051
40024700505240000001000700206974859709254001430013100013001010000616982334220604966967700507005064668364960402183002010000600202000070050351140021109101000030010100000101000011000000100001000252017111698133000096910000300107004170036700537003670036
40024700505240000001010700326976059709254001430013100013001010000617068334206214966970700507003564668364975400103002010000600202000070035351140021109101000030010100001101000011000003100001100252017111698133000390910000300107004870051700367003670051

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057004752500117003569781597092540104301031000230100100006161753342062149669707003570050646463649534010030200100006020020000700473511402011009901001000030100100000100100001100000001000011000261027122697983000366610000301007005170048700517004870051
402047005052400117003269735596952540104301031000030100100006161753341470149669677004770047646463649504010030200100006020020000700473511402011009901001000030100100000100100001100000031000011000261027122698103000360910000301007004870048700487005170036
402047005352400107003569781597092540100301031000130100100006160053342206149639427005070047646433649504010030200100006020020000700473511402011009901001000030100100000100100001100000001000011000261027122698103000366010000301007005270048700517006670059
402047005052400117003269781597062540104301031000130100100006160153342062149669707004770047646463649504010030200100006020020000700473511402011009901001000030100100000100100001100000001000010000261027122698103000309610000301007005170051700487003670051
402047004752400107003269735597062540104301031000130100100006160053342062149669677003570050646313649504010030200100006020020000700473511402011009901001000030100100000100100001100000001000011000261027122697983000366910000301007005170051700487004870051
402047004752500117003569735597062540104301031000130100100006160053341470149669707005070047646313649504010030200100006020020000700473511402011009901001000030100100000100100001100000001000001000261027122698113000366610000301007005170051700487004870048
402047005052400117003569735597092540104301031000130100100006161753342206149669677003570047646463649504010030200100006020020000701513511402011009901001000030100100000100100001100000001000011000261027122698103000366610000301007005170051700487004870048
402047003552400107003269735597092540104301031000130100100006160333342206149669677003570035646433649534010030200100006020020000700473511402011009901001000030100100000100100001100000001000011000261027122698003000366610000301007005170048700487004870036
402047003552400107003269737597062540104301031000130100100006160153342206149669677004970047647133649504010030200100006020020000700473511402011009901001000030100100001100100001100000001000011000261027122698103000399010000301007003670048700517005170051
4020470050524115517003269735597062540100301001000130100100006160153342483149669677004770047646433649504010030200100006020020000700473511402011009901001000030100100000100100001100000001000011000261027122698103000300910000301007004870048700367004870036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570051524101011001181003700366978159710254001430013100023001010000616991334225404967081700577005664669365028400103002010000600202000070035351140021109101000030010100001101000001100000010000100252616711915697983000310101310000300107005270036700367005270036
40024700515251010000001003700366974359713254001430013100013001010000616991334147004966977700947007464671364976400103002010000600202000070051351140021109101000030010100000101000000100000010000010252615711721698173000310101010000300107005570052700367005270052
40024700515251010000010003700366977559695254001030010100013001010000616991334225404966971700807007764671364960400103002010000600202000070035351140021109101000030010100000101000001100000010000110252616711817698143000310101010000300107005270052700527003670052
4002470056526101000001000370036697755971325400143001310001300101000061699133422540496697170086700926466936498240010300201000060020200007005135114002110910100003001010000010100000110000001000011025261671191869798300030101010000300107005270052700527003670036
40024700375251010000000003700366974359695254001430010100003001010000617018334225404967064700527005164672364979400103002010000600202000070051351140021109101000030010100000101000000100000310000000252616711613698143001410101010000300107005270036700527005270036
40024700515241010000137100370036697755969525400143001010001300101000061706833422540496706670053700536474436496040010300201000060020200007003535114002110910100003001010000010100002110000101000011025261571191569818300031013010000300107003670052700527005270052
400247003752510100000010037003669743597132540014300131000030010100006170683342254049669717010270096646783649764001030020100006002020000700543511400211091010000300101000001010000011000003100001102526168719196981530010010010000300107024070036700367005570052
4002470054525101000001000370126697755969525400143001310001300101000061706833422540496697170106700946467336497840010300201000060020200007003535114002110910100003001010000010100050110000001000011025261671161769814300000101010000300107005270036700537014170039
40024700355241010000010003700366977659713254001430013100013001010000619285334225404967071703357005164716364960400103002010165600202022070436351140021109101000030010100000101000001100000010000110252617712018698143000310101010000300107005270052700367003670036
400247005152410101100131003700366974359695254001430013100013001010000616991334239804966971700947005564669364978400103002010000600202000070054351140021109101000030010100000101000000100000010000110252617711414698143000310101010000300107005270052700527005570052

Test 4: throughput

Count: 8

Code:

  ldrsh w0, [x6, x7]
  ldrsh w0, [x6, x7]
  ldrsh w0, [x6, x7]
  ldrsh w0, [x6, x7]
  ldrsh w0, [x6, x7]
  ldrsh w0, [x6, x7]
  ldrsh w0, [x6, x7]
  ldrsh w0, [x6, x7]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267152011010000210103267172180162580100100800001008001650011670950492365226733267141665961668480115200800242001600482688981118020110099100800001008000011008001919420800570002180038615742191111511821626711990800001002673326733267342673326734
8020426714201101100021000226717218181625801001008000010080015500116765304923652267322673216642616666801152008002420016004826895811180201100991008000010080000010080019190080019101218000060190191111511801626736992800001002680526715267332673426733
80204267322001110100210002267240181815258010010080000100800165001169757049236532673226732166726166848011620080024200160048269076411802011009910080000100800000100800192042080187101598003861190190111511801626729992800001002671526734267332673326715
80204267142001010000210103267172181816258010010080000100800155001167763049236532673226732166606166668011520080024200160048267328111802011009910080000100800000100800202800800570005980038005742180111511801626711092800001002673326734267332671526734
802042671420010000006500012671731818162580100100800001008001650011671070492365226715267331665961668580114200800242001600482673382118020110099100800001008000001008001921420800190005980038611942190111511801626730002800001002673426733267332673326733
80204267142001000000210100267170018162580100100800001008001550011680610492365226732267331666061668480116200800242001600482673382118020110099100800001008000001008002119420800191105980038615742192111511801626711002800001002673326733267332673326715
802042671420011000006501022669901818162580100100800001008001650011665900492363426733267321666061668480115200800242001600482673281118020110099100800001008000001008002020420800570015980000611941190111511801626737990800001002673326715267332673326715
8020426732200100000065000226699018181525801001008000010080015500116776304923653267322673316660616684801162008002420016004826732811180201100991008000010080000010080020200080057000608003860570191111511801626729990800001002673326716267332671526734
8020426714200101000021010326717201802580100100800001008001550011668360492365326732267141666061668480115200800242001600482673281118020110099100800001008000001008002119420800191025980000615742190111511801626711092800001002673326733267332671526733
8020426714200110000021000326717201802580100100800001008001650011677330492363426733267141664261668480113200800242001600482679481118020110099100800001008000001008002020420800570022180037615742192111511801626711902800001002673326733267342673326734

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cd | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267222000000075002267070181816258001010800001080000501172240149236972672826758166823166888001020800002016000026722711180021109108000010800001108000003980035000398003561350050200161611102672506480000102672926729267092672926728
8002426722200000004510226693201216258001010800001080000501168843149236332673626734166723166888001020800002016000026727711180021109108000010800000108000004380039000398003961043050200121612132671960480000102672326728267092672926709
8002426708201000000002267122121216258001010800001080000501166750149237702673526731166813167028001020800002016000026708561180021109108000010800000108000000800390000800390035430502001216121226725106480000102670926728267282670926709
8002526708201100000002267070012025800101080000108000050116688614923767267162671616672316708800102080000201600002672771118002110910800001080000010800000398003900039800000039430502001016111126705100480000102672826709267282672826728
80024267082000000045002266930121216258001010800001080000501166750149237432673326735166723166888001020800002016000026728711180021109108000010800000108000000800390003580035013943050200121613102672460080000102672826728267092672926729
800242672820000000450022669301212162580010108000010800005011667501492375326734267381671031670780010208000020160000267277711800211091080000108000001080000008003900039800396135005020013161213267051010480000102672826729267292672926728
80024267082000010354100267122012162580010108000010800005011667501492364826728268621667931670880010208000020160000267287711800211091080000108000001080000039800390003580039603600502001016121226705100480000102673026728267292672326709
800242670820000000010126713212016258001010800001080000501166896149237532673426736166933167028001020800002016000026708561180021109108000010800000108000003980039010398000060394305020012161310267051010080000102670926709267092670926729
80024267282000000000022671220016258001010800001080000501167605149237482674126736166623167078001020800002016000026708711180021109108000010800000108000004380000000358003961000502001316101326705106080000102672926709267292672326723
8002426722200000004510226713012121625800101080000108000050116675014923648267082689716663316711800102080000201600002670871118002110910800001080000110800000398003500008003901354305020012161213267251010080000102673326709267312672926723