Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (register, uxtw, 32-bit)

Test 1: uops

Code:

  ldrsh w0, [x6, w7, uxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | 92 | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
100540331110066101388377202510001000100015526140340322632611000100020004028511100101000100001019214310580016010396159431927311611400131351000403404404404404
100440831110067102388377212510001000100015526140240322532611000100020004038511100101000100001020214310580006110406158431907311611400131351000404404404403405
100440331100067002387377202510001000100015480140340222532611000100020004038511100101000100011019194310611026110406159431907311611400131351000408404414403404
100440331101067103366277192510001000100015526140340222532611000100020004028511100101000100001020204410581006110406160431907311611400131351000405405404403404
100440321100066103388377202510001000100015555140240322532611000100020004058511100101000100001020204510610006010396159431907311611400131351000404404403403404
100440331110067003388277202510001000100015480140340322532611000100020004038511100101000100001020204310591016010396158431907311611400131351000404404403404404
100440331010066002388277182510001000100015523140240322532611000100020004038511100101000100001020194310581006110406159431917311611400131351000404404404404403
100440531110067003388277182510001000100015525140340322532611000100020004038511100101000100001019194310601016010406158431927311611400131351000403404404404404
100440331010067102387377202510001000100015480140340322532611000100020004038511100101000100001019214310580006310406158431907311611400131351000404404404404403
100440331110066102367277202510001000100015480140340322532601000100020004038511100101000100001021204310591016110406159431907311611400131351000403403404404404

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570054525101010070036697645975225401003010310001301111000761557633416390496697170051700516470366498440118302301001060260200207005435114020110099100100003010010000010010000001000010100001010111262021612698603000313101310000301007005270036700587005270055
402047005452410100007002069764597522540104301031000130111100076155763342567049670227005470054647036650034011830230100106026020020700543511402011009910010000301001000001001000001100000910000001011126202162169862300030101310000301007005270055700557005570055
4020470054524001000070039697855975225401003010310001301111000761571033416390496695570035700546470066500340118302301001060260200207003535114020110099100100003010010000010010000011000000100001010111262011621698623000313131310000301007005570055700557005570036
402047005452501100007003969785597342540104301001000030111100076155763341639049669727005170051647036650034011830230100106026020020700543511402011009910010000301001000001001000001100000010000101011126202162269862300130131010000301007005570055700367005570036
402047003552500100007002069785597342540104301031000130111100076155763342567049669817005470051647036649844011830230100106026020020700543511402011009910010000301001000001001000001100000010000101011126202161269862300031310010000301007003670055700557005270036
402047005152500000007004069764597522540104301001000030111100076155763341639049669727005470054647036650034011830230100106026020020700513511402011009910010000301001000001001000000100000010000101011126202161269862300031310010000301007003670055700367005570055
4020470051525001000070039697855975225401043010310001301111000761557633425670496697470054700546470366500040118302301001060260200207003535114020110099100100003010010000010010000011000010100001000111262021621699003000313131310000301007005570055700367005570036
4020470035524001000070039697855975225401043010310001301111000761557633416390496697470054700356470366500340118302301001060260200207003535114020110099100100003010010000010010000011000000100000010111262021622698673000313131310000301007005570036700557005570055
4020470054524000010070039697645975225401043010310001301111000761571033424230496697570054700546470066500340118302301001060260200207005435114020110099100100003010010000010010000011000000100001010111262021621698593000013131310000301007005570055700557005570055
4020470035525000000070039698225975825401043010010001301111000761557633416390496697170054700546468466498440118302301001060260200207003535114020110099100100003010010000010010000001000000100001010111262011622698593000313131310000301007003670036700557003670055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005352400001000101017007169729597062540014300131000130010100006170363342489004966976070166700566465921649784039930020101106002020000700413511400211091010000300101000001010007701000100110000110110025200047144698163004490910000300107005470042700577005770057
400247005852510110000200017003869777597152540018300231000230010100006175943343262004966961070056700566467436498140010300201000060020200007005635114002110910100003001010000010100021110001007100001111100252000310224698043000600610000300107005670057700487005470042
400247005652411000000201017004169702597152540014300101000130010100006169823342062004966967070035700356465336497540010300201000060020200007005035214002110910100003001010000010100011110002021310000101000025200027134698103000306910000300107005170036700487005170051
40024700475240000000010100700206974359695254001430013100013001010000616982334220600496697007004770035646683649754001030020100006002020000700503511400211091010000300101000001010002211000302110000110100025200047142698193000690910000300107005470054700547005470057
40024700415251110000010001700416978059701254001830013100023001010000617009334235000496697607005670041646743649814001030020100006002020000700563511400211091010000300101000011010003301000211110000011100025200027142698133000396910000300107003970051700367005670051
40024700505250000000010100700386976059709254001030013100013001010000616982334196700496697007005070050646533649604001030020100006002020000700353511400211091010000300101000001010002211000101110000101000025200047152698133000396910000300107005170051700517005170036
40024700505240000000100000700346974359709254001430013100003001010000616952334206200496697007004770035646683649754001030020100006002020000700353511400211091010000300101000001010002111000200110000111100025200047124698043000696910000300107005770042700547005470057
40024700565241111000020100700416970259701254001830016100023001010000617009334249410496697307005670041646743649814001030020100006002020000700413511400211091010000300101000001010001101000201110000111110025200047144698163000360910000300107005170051700517003670048
40024700505240000000000000700226976059695254001430013100003001010000616973334206200496697007016070035646533649754001030020100006002020000700503511400211091010000300101000001010002311000200110000101000025200047142698133000090910000300107005170048700487003670086
40024700475250000010010100700356973259695254001430013100003001010000616952334206200496696707005070047646743649724001030020100006002020000700353511400211091010000300101000001010002201000201110000101000025200047142698133000396010000300107003670051700517003670036

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570057525111110211700426978859716254010830103100023010010000616077334176914966977070041700576463736496340100302001000060200200007005735114020110099100100003010010000010010001111000100111000011010262017111698203000610101010000301007005870058700587005870058
402047005752411111000070036697645969525401043010310000301001000061601433422541496697707005770057646533649544010030200100006020020000700513511402011009910010000301001000001001000001100000000100001010026101711169814300031001010000301007005270036700527005270036
4020470051525000010110700366976459710254010430103100013010010000616014334225414966971370051700516463136495440100302001000060200200007005135114020110099100100003010010000010010000011000000001000000100261017111698143000310101010000301007005270052700367005270052
4020470051525000000211700426978859716254010830106100023010010000616104334254214966977070057700416465336496040100302001000060200200007005735114020110099100100003010010000010010002211000101041000010100261017111698143000310101010000301007003670052700527005270052
4020470035525000000100700366978259710254010430100100013010010000616175334225414966977070057700576465336496040100302001000060200200007005735114020110099100100003010010000110010000001000000001000010100261017111698143000310101010000301007005270052700527003670052
4020470051524000000010700206978259710254010430100100013010010000616014334225414966977070057700576465336496040100302001000060200200007005735114020110099100100003010010000010010002101000300111000011112261017111698203000610101010000301007005870062700587004270042
40204700575241111101007003669782597102540104301031000130100100006160143342254149669710700357005164631364954401003020010000602002000070051351140201100991001000030100100000100100000110000000121000011112261017111698223000610101010000301007005870058700587004270058
4020470057525111100211700426978859716254010830106100023010010000616068334176914966971070035700516463136493840100302001000060200200007005135114020110099100100003010010000110010000011000000001000011111261017111698043000610101010000301007004270042700587004270058
402047005752511010011070036697645969525401043010010001301001000061617533414701496696107005770041646533649604010030200100006020020000700573511402011009910010000301001000001001000331100020001100001111126101711069804300061001010000301007005870058700427005870058
4020470057524100000100700206976459710254010430112100013010010000616086334225414966971070051700516464736495440100302001000060200200007005135114020110099100100003010010000110010000011000000001000011110261017111697983000310101010000301007005270052700367003670037

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570053524101101000101700386978059712254001830016100023001010000617036334235049669760700567005664674736497840010300201000060020200007014635114002110910100003001010000110100021110001001100001101102520067157698163000390010000300107005770057700427005770057
4002470053525100001100200700386970259701254001830016100023001010000616995334249449669760700537005664674036498140010300201000060020200007011635114002110910100003001010000010100022110002011100001111202520067156698163000666010000300107004270042700597005470042
4002470056525100201000210700416970259715254001430016100013001010000617036334176949669760700537005664659036498140010300201000060020200007011535114002110910100003001010000110100011010002111100000111102520057177698043000696910000300107005470057700577005470057
4002470041525100101000200700266970259715254001430016100023001010000617117334350249669730700417005664674036496640010300201000060020200007015835114002110910100003001010000010100033110002001100001101102520067155698043000390910000300107005770042700427005470042
4002470053525100100000201700416978059715254001430016100023001010000617009334249449669730700567004164659036497840010300201000060020200007013835114002110910100003001010000010100033110002021100000111002520037155698163000690910000300107005770094700437006070054
4002470041526101100000200700416970259715254001830016100013001010000616995334249449669760700567005664674036498140010300201000060020200007013435114002110910100003001010000010100022110002001100000111102520057164698193000699610000300107005770057700577005770057
40024700565251001010002007002669702597152540018300161000130010100006170363342494496697307005670056646740364966400103002010000600202000070120351140021109101000030010100000101000220100010125100001101002520047166698043000396610000300107005770042700427005470060
4002470056524100000000100700416970259715254001830016100023001010000617036334176949669760700577005364674036498140010300201000060020200007013335114002110910100003001010000010100011110002001100001101002520087175698193000696910000300107004270054700427005470057
4002470053525101001000100700416970259715254001430013100023001010000617009334176949669610700567005664674036498140010300201000060020200007010735114002110910100003001010000010100011110002001100000101002520057155698043000699910000300107005470042700427004270057
4002470053525101101000100700416978059712254001830016100013001010000617036334235049669760700417005664671036496640010300201000060020200007011235114002110910100003001010000010100032010002001100001111002520057166698193000690910000300107005770057700577005770042

Test 4: throughput

Count: 8

Code:

  ldrsh w0, [x6, w7, uxtw]
  ldrsh w0, [x6, w7, uxtw]
  ldrsh w0, [x6, w7, uxtw]
  ldrsh w0, [x6, w7, uxtw]
  ldrsh w0, [x6, w7, uxtw]
  ldrsh w0, [x6, w7, uxtw]
  ldrsh w0, [x6, w7, uxtw]
  ldrsh w0, [x6, w7, uxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526732200111112100326703218016258010010080000100800155001167009049236532673226736166426166858011520080024200160048267148211802011009910080000100800001100800202000800192401718000061574219111151181162126729990800001002673626733267342673426734
80204267392001110065002267182181815258010010080000100800155001166836049236342671426734166626166888011620080224200160048267326311802011009910080000100800000100800192042080058181718003861574219011151182162126729992800001002673326733267152673426827
8020426714200100006500226717300162580100100800001008001650011695631492365226733267141666061666680115200800242001600482673281118020110099100800001008000001008001919420800191411598003861574219111151180162126730402800001002673326733267332671526819
8020426732200100006501226717018181525801001008000010080015500116802304923634267322673316642616684801162008002420016004826714641180201100991008000010080000010080019190080058101688003801574219111151182162226730992800001002673326717267332673326733
802042673220011011210012669920182125801001008000010080015500116975704923652267322673216660616685801152008002420016004826732811180201100991008000010080000010080019200080057112598000060574119011151182162126729902800001002671526715267332673326743
8020426734201101006501226717218015258010010080000100800155001167763049236532673226732166606166848011520080024200160048267328111802011009910080000100800000100800202042080057020598003861574219111151182162126729092800001002671526733267332671526738
802042673620011100210022671721818152580100100800001008001550011677631492363426732267331665961668480116200800242001600482671581118020110099100800001008000001008001920420800191542718000061194219111151183162126729992800001002673326733267332673526769
80204267452011111083013267182018162580100100800001008001550011596950492365326726267331665961668580116200800242001600482673281118020110099100800001008000001008001920400800191332598000061574219011151181161226729090800001002671526733267332673426738
8020426734200111006501326699218002580100100800001008001550011695631492365226715267321666061668480115200800242001600482671464118020110099100800001008000001008001919008005812159800386057019111151180161226745992800001002673326715267332671526738
80204267362001100065003266992181815258010010080000100800125001167078049236522671426733166606166848011620080024200160048267326411802011009910080000100800001100800201941080058121598003861574119211151182161226711992800001002673326733267332673326735

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 09 | 0e | 19 | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cf | d0 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252672820010020710012671320116258001010800001080000501168843149236480267362685116683316707800102080000201600002673177118002110910800001080000110800004380039039800396139440050200061624267051010480000102672926709267292672926728
800242672720000060100126713211216258001010800001080000501166750149236470267372686916683316707800102080000201600002672756118002110910800001080000010800004380040038800396039430050200041642267051010480000102672826729267092670926728
8002426731200000690001267122111925800101080000108000050116675014923647026737268161667731671180010208000020160000267277711800211091080000108000001080000438003803980000613943005020004164226728010480000102672926728267282672926728
8002426708200000558100026713011219258001010800001080000551174647149236470267362683916680316717800102080000201600002673180118002110910800001080000010800004380000038800396139430050200041624267051410480000102673226732267322670926732
8002426731200000516100126712211219258001010800001080000501167124149236510267362670816672316807800102080000201600002670877118002110910800001080000010800004380038038800006139430050200021624267241010480000102672826709267282673226732
800242672820000017100012671620019258001010800001080000501167501149236280267362682616680316707800102080000201600002673177118002110910800001080000010800004380000038800396138440050200041642267051010480000102673226732267322672826729
800242672820100015000126693212121925800101080000108000050116712414923651026739268971678131670880010208000020160000267288111800211091080000108000001080000438003903980038613944005020002164226724010080000102672826728267092673226732
8002426731200000601001267132111925800101080000108000050116688614923628026737268551668231668880010208000020160000267318111800211091080000108000001080000080038038800386139440050200021624267281010480000102672826709267322673226732
8002426731200000601001267122120192580010108000010800005011672011492364702673726868166773167118001020800002016000026727771180021109108000010800000108000008000003980038610430050200021662267251010480000102672826709267282673226728
80024267272000006910012671321212025800101080000108000050116750114923647326818268421668131669680010208000020160000267278011800211091080000108000001080000080054008000061043005020006164626724100480000102673226732267322673226728