Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSW (register, sxtw)

Test 1: uops

Code:

  ldrsw x0, [x6, w7, sxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e22233a3f4346494f51schedule uop (52)schedule ldst uop (55)dispatch ldst uop (58)simd uops in schedulers (5a)606d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)f5f6f7f8fd
10053743000006500138421818152510001000100014060037439121232571000100020004088111100110001000010000010390000103561570192731161137110641000375375375395390
100439430000041102374012121225100010001000140600394394197325210001000200039456111001100010000100003910000000103900043007311611371101041000395375375390395
10043743000004500237920120251000100010001503703943942173252100010002000374711110011000100001000039103900039100060394300731161137110041000395395375375375
1004394300010450023742000251000100010001406013943942173252100010002000381711110011000100001000039103500042100060354300731161139110041000395375395375375
100437430000045002374212121225100010001000149890394394216325210001000200038972111001100010000100000103900039103561353900731161139110601000395395395375395
10043893000004100035900121625100010001000150370374394197325310001000200039456111001100010000100003910390003910350000007311611371101041000395395390395395
10043743000004500037421212025100010001000148381374374197325210001000200039471111001100010000100000100000035103561043007311611371101041000395395375395395
10043943000004500137921201625100010001000149890394394197325210001000200039471111001100010000100003910390003910000039000731161139010041000395395375395379
1004374300000450003742121202510001000100015037039439419732521000100020003947111100110001000010000391035000351000003500073116113916621000375375395375392
100439430001100023592001625100010001000140600394374216323210001000200039471111001100010000100000100000001039000000731161137110041000390395375375375

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01)cycle (02)03mmu table walk data (08)09l2 tlb miss data (0b)0e0f191e22233f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5l1d cache miss ld nonspec (bf)branch cond mispred nonspec (c5)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
40205700475250100001069107002069735597092540104301031000130100100006160153342062496696770047700476464336495040100302001000060200200007004735114020110099100100003010010000110010000001000000100001100261027122698133000360910000301007005170048700517005170036
4020470035525000000846107003269781597092540104301031000030100100006161753342206496697070050700506463136495640100302001000060200200007004735114020110099100100003010010000110010000001000010100000100261027122698133000396910000301007005170051700367005170051
4020470050525000000861107003669764597112540149301001000130100100006160053342206496695570035700506464636500840100302001000061180200007005035114020110099100100003010010000110010000011000000100001100261027122697983000399910000301007003670051700517005170051
4020470050524000000841107003569781597092540100301031000130100100006161753342446496696770050700356464336495340100302001000060200200007005035114020110099100100003010010000010010000011000010100001000261027122698133000006010000301007005170051700517003670051
40204700505240000008501070035697355970925401003010310001301001000061601533422064966967700507005064643366495140100302001000060200200007005035114020110099100100003010010000010010000011000000100001101261027122698133000366610000301007005170036700487003670036
4020470054524000000880007003269781596952540100301031000030100100006160053342206496697070050700506464636495540100302001000060200200007004735114020110099100100003010010000010010000011000000100000100261027122698103000390010000301007005170051700517005170051
4020470050525000000865107003269735596952540100301031000130100100006161753341470496697070047700506464636495540100302001000060200200007005035114020110099100100003010010000110010000001000000100001000261027122697983000390910000301007005170048700517005170051
4020470050524000000849107003269735597062540100301031000130100100006160053342206496697070047700356464336495540100302001000060200200007003535114020110099100100003010010000010010000011000000100001100261027122698133000300910000301007004870036700487003670036
4020470035525000000892007003569781597062540104301001000030100100006160053342206496695670050700396463136495140100302001000060200200007004935114020110099100100003010010000010010000011000000100000100261027122697983000396010000301007005170051700367004870051
40204700505250000008020070035697815969525401043010010000301001000061601533422064966970700507005064631366495640100302001000060200200007005035114020110099100100003010010000010010000011000003100001100261017122697983000306610000301007003670051700397005170051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01)cycle (02)03mmu table walk data (08)090e0f1e1f22233f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)5f60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5l1d cache miss ld nonspec (bf)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
4002570050524001010107004069728597082540010300131000230010100006169523341470004966967700477004764665364973400103002010000600202000070047351140021109101000030010100000101000001100000310012110252027132698103000360610000300107003670036700487004870036
40024700475240000480107007169743596962540014300131000130010100006169523341470004966967700477004764665364972400103002010000600202000070047351140021109101000030010100000101000001100000010000110252027132698103000306610000300107004870048700487004870048
4002470047525000010107005969728597062540014300101000030010100006169523342062004966967700477004764665364972400103002010000600202000070047351140021109101000030010100000101000001100000310012110252037132698103000366610000300107003870048700487004870048
4002470035525000000107003269728597062540014300131000030010100006169523342062004966967700357004764653364972400103002010000600202000070047351140021109101000030010100000101000001100000010000110252037132698103000366610000300107004870048700487004870036
4002470047525000060007003269743596952540014300131000130010100006169523342062004966967700357004764665364972400103002010000600202000070047351140021109101000030010100000101000001100000010000110252037132698103000366610000300107003670048700487004870036
4002470047525000010107003269728597062540014300131000030010100006169523342062004967334700477004764665364960400103002010000600202000070047351140021109101000030010100000101000000100000010000110252037132698103000066610000300107005270048700487003670048
4002470035524001000007005569735597092540014300131000130010100006170683342062004966967700477004764665364972400103002010000600202000070047351140021109101000030010100000101000001100001899110000110252027123698103000360610000300107004870048700487004870048
40024700355250000400107003269731597062540014300101000030010100006169523342062104966967700477004764665364960400103002010000600202000070035351140021109101000030010100000101000001100000010000110252027122698173000366610000300107004870053700487003670048
4002470047525001110107003269728596952540014300131000130010100006169523342062014966955700477003564653364972400103002010000600202000070035351140021109101000030010100000101000001100000010000110252027122698103000366610000300107003670036700367004870048
400247005252510005650107002069728597062540010300131000130010100006169523342062004966967700477004764653364972400103002010000600202000070048355140021109101000030010100001101000001100000010000110252027132698103000366610000300107003670048700487004870048

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0060

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f181e22243a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
40205700575250000010200070042697925977625401043010310002301001000061607833425424966961070057700576466036496740100302001000060200200007004136114020110099100100003010010000110010000011000000010000100000026101711169824300060101310000301007004270058700997006970042
40204700415241101000200070045697735971925401083010610002301001000061609533417694966980070060700606465736496340100302001000060200200007006035114020110099100100003010010000010010001101000100110000110110026101711169823300061301310000301007006170058701407009270061
402047006052411110002100700426980259716254010830106100023010010000616095334268649669610700417004164663364963401003020010000602002000070041351140201100991001000030100100000100100012110003011100001111100261017111698233000613131010000301007006170134700667004970061
40204700605241101000100170042698605986525401083010610002301001000061609533426864966980070060700606465636496840100302001000060200200007018335214020110099100100003010010000010010001111000100178100001111000261017111698643000613131010000301007004270114700747008970042
40204700605251111000200070042698155971925401293010310004301001000061609533426869867054070041700606465636496340100302001000060200200007006036114020110099100100003010010000110010001211000201110000011100026453641169823300061313010000301007006270058700587006170061
402047006052511000002000700266979159716254013330106100023010010000616510334268649669800700417005764663364944401003020010000602002000070060351140201100991001000030100100001100100023010002014100001101000261017111698043000313101010000301007006170062700427006170042
40204700415251101000210070042697915970125401083010310001301251000061651033426864966961070097700416465636494440100302001000060200200007006035114020110099100100003010010000110010000001000000010000100000026101711169823300061313010000301007006670062700617006170042
402047006052410011002000700456979159701254010830106100013010010000616095334176949669800700607006064637364963401003020010000602002000070060351140201100991001000030100100001100100012010001001100001101000261017111698233000300010000301007006070062700617006170061
402047006052510100002100700456970859716254010430106100023010010000616078334268649669800700577005764653364970401253020010000602002000070060351140201100991001000030100100001100100011010003101100000101000261017111698043000613101310000301007004370059700587006170058
40204700415241011000200070026697085971925401083010310002301251000061643433425424966977070060700416465336494440100302001000060200200007004135114020110099100100003010010000110010002111000301110000111100026101711169823300030131310000301007005870061700427004270062

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0060

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e1f22233a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)5f60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)d9daddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
40025700575251011000020101700556978459716254001430016100023001010000617072334268600496697770057700606465936498540010300201000060020200007006035114002110910100003001010000110100012110002001100001111025209710022698043000313101310000300107004270042700617004270061
4002470057524111000002000170026697845970125400183001610001300101000061699533426860049669617006070057646783649664001030020100006002020000700573511400211091010000300101000011010001311000100110000011112520371006269823300490101310000300107006170061700587004270042
40024700785241012000041000170026697025971925400183001310001300101000061707233425420149669807005770057646593649824001030020100006002020000700573511400211091010000300101000001010001211000100110000011112520271002369823300061301010000300107006170061700587006170061
4002470060525111100002010170150697845971925400183001610001300101000061707233417690149669777004170057646783649664001030020100006002020000700603511400211091010000300101000001010001101000201110000011102520671006269823300061301310000300107004270061700617006170061
40024700415251001000020101700456978459716254001430016100023001010000616995334176900496697770041700416467836496640010300201000060020200007006035114002110910100003001010000010100021110001024100001101225202710066698233000613101310000300107006170061700617006170042
40024700415251000000020101700436970259719254001830016100023001010000616995334268610496698070057700416467836498540010300201000060020200007004135114002110910100003001010000010100023110002001100000111025206710022698233000610101310000300107004270061700587004270061
4002470060525101100007010170045697025970125400143001310001300101000061707233417690049669777006070060646783649664001030020100006002020000700603511400211091010000300101000011010001111000300110000111102520271003669823300061313010000300107004270042700617005870061
40024700605241100000010101700266978159719254001830016100023001010000617045334176900496697770057700416467536498240010300201000060020200007004135114002110910100003001010000010100021010002004100001101025202710022698233000613131010000300107006170061700617006170061
400247006052410010000201017004569784597192540018300161000230010100006170723342542004966980700607005764678365037400103002010000600202000070057351140021109101000030010100000101000111100030011000011110252027100226982330006100010000300107006170042700427006170061
40024700415241000000020001700426978459719254001830016100023001010000616995334268600496697770060700416465936496640010300201000060020200007005735114002110910100003001010000010100012110002001100001111025202710022698233000313131010000300107006170044700617006170058

Test 4: throughput

Count: 8

Code:

  ldrsw x0, [x6, w7, sxtw]
  ldrsw x0, [x6, w7, sxtw]
  ldrsw x0, [x6, w7, sxtw]
  ldrsw x0, [x6, w7, sxtw]
  ldrsw x0, [x6, w7, sxtw]
  ldrsw x0, [x6, w7, sxtw]
  ldrsw x0, [x6, w7, sxtw]
  ldrsw x0, [x6, w7, sxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f18191e22243a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
802052673220011100006610326717218187825801001008000010080015500115919114923652267322673316660616684801152008002420016004826732821180201100991008000010080000110080021194280057100598003861574219111151182163426730992800001002673726715268132671526734
80204267332001100100650022671720181525801001008000010080015500116776304923652267322673216660616666801162008002420016004826732811180201100991008000010080000110080019204280057000598003861564219211151183163426730992800001002673926736267392674226733
802042673220111100006600226699018181725801001008000010080016500116709504923652267322673216659616684801152008002420016004826732821180201100991008000010080000110080020204280057001598003861574219011151183163426729992800001002673926737267212674226733
802042673320010100006500326699218182325801001008000010080016500116659004923652267142673216660616684801172008002420016004826732811180201100991008000010080000110080019204280058100598003860564219111151183162426730992800001002673426733267332686926740
8020426732200101000065003266992180952580100100800001008001450011694181492365226733267331666061666680115200800242001600482673364118020110099100800001008000011008002020080019100598003800194219011151464164526711992800001002673326740268592673526733
802042673320011000006510326717218181925801001008000010080015500116710704923653267322673316660616684801162008002420016004826732821180201100991008000010080000110080019204280058101598003761574219111151184164326729992800001002673826734267392674326733
802042673220010000006510226699218017258010010080000100800155001168190049236532673226733166606166878011420080024200160048267328211802011009910080000100800001100800202042800570010800366135390011151183164326704064800001002673226726267292673226728
80204267222000000000411012670721218122580100100800001008001450011678750492362726722267221663561665980115200800242001600482672271118020110099100800001008000001008000003980035000358003560354300111511821644267191064800001002672926727267292673226708
80204267222000000000450002670721218142580100100800001008001550011678750492364226722267071665061667980114200800242001600482672271118020110099100800001008000011008000003980039030038800356135390011151182162426719662800001002672726728268722672526723
80204267221990001100010126707218012258010010080000100800165001177165049236422672726722166506166798011420080024200160048267225611802011009910080000100800001100800000398003500035800356135430011151183164326719662800001002673426711267912672826723

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01)cycle (02)03090e0f1e22233a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)5f60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)a5ld unit uop (a6)l1d cache writeback (a8)acafb5b6bbl1d cache miss ld nonspec (bf)cfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
8002526724200000411012670721818122580010108000010800005011676050149236420267222670816667316702800102080000201600002680471118002110910800001080000108000003908003500800356135395020716222671966280000102672326709267232672326723
80024267222000004110126707018181225800101080000108000050116760500492362802672226722166673167028001020800002016000026785711180021109108000010800001080000039080035035800356135395020216632671966280000102672326723267232672326723
8002426722200010411012670721818112580010108000010800005011676050049236420267222672216667316702800102080000201600002673671118002110910800001080000108000003908003500800356135395020216322670566280000102672326723267232672326723
80024267222000004110126707218181102580010108000010800005011676050049236420267232672216667316702800102080000201600002672271118002110910800001080000108000003908003503580035610395020216262671966280000102672326723267232672326723
80024267222000004110026707218181125800101080000108000050116675000492364502672226722166673167028001020800002016000026787711180021109108000010800001080000039080035035800006135395052216222671966280000102672326723267232670926723
80024267222000004100126707218181125800101080000108000050116675000492364202672226722166673167028001020800002016000026792711180021109108000010800001080000039080035035800356135395020616262671966280000102672326723267232672326723
80024267222000004110026707218181225800101080000108000050116699300492364202672226722166673167028001020800002016000026835721180021109108000010800001080000039080035035800356135395020216362671966280000102670926723267232672326709
80024267222000004110126707218181225800101080000108000050116760500492364202672226722166683167028001020800002016000026965711180021109108000010800001080000039080035035800356135395020616262672166280000102672326723267232672326729
80024267222000004110126707218181225800101080000108000050116760500492364202672226722166673167028001020800002016000026722711180021109108000010800001080000039080035035800350135395020616322672066080000102672326723267232672326723
8002426722200010411012670721801249801401080000108000050116760500492364202672226722166673167028001020800002016000026722711180021109108000010800001080000039080035135800356135395020616222672166280000102672326723267232672326723