Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (register, lsl, 64-bit)

Test 1: uops

Code:

  ldrsh x0, [x6, x7, lsl #1]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
100538930045001379200225100010001000150371374394217325210001000200037456111001100010000100039103903910356103973116113916621000390395375390395
1004394300410003740121212251000100010001483803943942173252100010002000394711110011000100001000010390391039613543731161138610001000394390393375395
10043742014500237421200251000100010001501813743942163252100010002000394561110011000100001000391039035103561394373116113866641000378390395395396
1004394311450013592180162510001000100014060139439419732471000100020003947211100110001000010000103503510390104373116113860601000395375395375375
10043743010000359212002510001000100015207138937421732331000100020003747111100110001000010003910000421039003943731161137110041000375375379395395
10043743000000374201212510001000100014989037437421732471000100020003895611100110001000010003910390010336135073116113860641000375395395395390
1004391300010237421800251000100010001406013743942173232100010002000389721110011000100001000391000139103561354373116113910041000390397395395375
10043943114500037421818122510001000100014838139439421732321000100020003947111100110001000010000103900103561350731161137110041000375375395395375
100438930042102379200122510001000100014989037437421732471000100020003745611100110001000010000100025110396100731161138810001000375375375375390
1004394310451013800151202510001000100014838137439421732471000100020003947111100110001000010003910350351035600073116113910621000398377379375375

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh x0, [x6, x7, lsl #1]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570051525000012101700426970259718254010830106100023010010000616068334176914966977070036700546464736495440100302001000060200200007005135114020110099100100003010010000010010000011000000011000010111026101711169820300031010010000301007005870042700587005870058
40204700575251100010007003669782596952540104301031000130100100006160143341470149669710700907005364650364954401003020010000602002000070051351140201100991001000030100100000100100000110000000010000000100261017111697983000310101010000301007015370036700527003670052
40204700355250000060007003669782596952540104301001000130100100006161753342254149669740701127008864647364954401003020010000602002000070035351140201100991001000030100100000100100000010000000010000101000261017111698143000310101010000301007005270052700527005270052
402047003552500011010070020697825971025401043010310001301001000061617533422541496697107008970051646473649384010030200100006020020000700513511402011009910010000301001000001001000001100000103100000010002610171116981430003010010000301007005270052700527005270052
4020470051524000001000700366978259710254010430103100013010010000616014334225414966971070104700556464836493840100302001000060200200007005135114020110099100100003010010000010010001201000100211000011110026101711169820300061001010000301007005870042700617004270058
40204700605241010011007003669764596952540104301031000030100100006160143342254149669710700717005664631364954401003020010000602002000070051351140201100991001000030100100000100100000110000000010000101000261017111698143000310101010000301007003670052700527003670052
40204700515240000100007003669782597102540104301001000130100100006160143342398149669710700917005664647364938401003020010000602002000070035351140201100991001000030100100000100100000110000000010000101040261017111697983000310101010000301007005270052700367005270052
402047005152500000010070036697645971025401043010310001301001000061601433422541496695507010770054646473649544010030200100006020020000700513511402011009910010000301001000001001000000100000000100001010002610171116981430003001010000301007003670052700367003670052
40204700515240000021017004269788597162540108301031000230100100006160683341769149669770700867005864653364960401003020010000602002000070057351140201100991001000030100100000100100011110002100110000110130261017111698203000610101010000301007006170058700427005870058
4020470041524110002000700426978859716254010430106100023010010000616068334254214966971070088700536463136495440100302001000060200200007005135114020110099100100003010010000110010000011000000001000000100026101711169814300031010010000301007005270036700367005270052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005352411101002101700416977759715254001830013100023001010000616995334249414966995700537005364671364966400103002010000600202000070041351140021109101000030010100000101000211100020111000011112252037123698163000660610000300107004270042700547005470054
400247005352510001102100700266977759712254001830016100023001010000617009334235004966973700417004164671364966400103002010000600202000070053351140021109101000030010100000101000220100010011000611010252027123698243000666010000300107005470054700427004270057
400247004152410010007001702486977759712254001830016100023001010000617009334176904966973700537005364659364978400103002010000600202000070064351140021109101000030010100000101000121100012111000001110252027144698043000690610000300107005470054700547004270042
400247004152410010002000700386977759715254001830013100023001010000617009334235004966982700537004164671364966400103002010000600202000070056351140021109101000030010100000101000231100010111000011110252037142698043000666010000300107005470054700547004270054
400247005352411000002001700386977759712254001830016100023001010000617036334249414966995700417005664671364981400103002010000600202000070041351140021109101000030010100000101000111100010111000011011252037123698163000666610000300107005470054700577004270042
400247005652510100001001700386977759712254001830016100023001010000616995334176904966975700567005364659364978400103002010000600202000070053771140021109101000030010100000101000211100011111000011010252027142698043000606610000300107005470042700547005470057
400247005352511010001100700266977759712254001830013100023001010000617009334235004967027700417005664671364966400103002010000600202000070056351140021109101000030010100000101000120100010111000001110252027122698043000600010000300107005470054700547004270057
400247004152511000032001700416978059712254001830016100023001010000617009334235014967012700417005364671364966400103002010000600202000070041351140021109101000030010100000101000131100020211000001111252047133698043000666010000300107005470054700427005770054
400247005652411110002001700416977759715254001830016100023001010000617009334176904967040700537005364674364978400103002010000600202000070041351140021109101000030010100000101000111100020011000001110252047135698163000660610000300107005470054700427005470042
400247005352411010002101700266970259712254001430016100013001010000616995334176904966973700537005364659364979400103002010000600202000070041351140021109101000030010100000101000121100020011000011110252027142698163000660610000300107005470042700547005470042

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh x0, [x6, x7, lsl #1]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0060

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570051524001000210700456979159719254010830103100023010010000616078334268604966983070060700416465636493840100302001000060200200007005435114020110099100100003010010000110010001201000200011000011111026101711169820300060101310000301007004270061700427005870061
4020470060524110211100700456970259719254015330106100023010010000616095334283004967016070060700606465636496340100302001000060200200007006035114020110099100100003010010000010010001211000200011000011112026101711169823300061301310000301007004270061700617006170061
40204700415431101002007004569791597192540108301061000230100100006160783342686149669820700607006064656364963401003020010000602002000070060351140201100991001000030100100001100100012110003000110000110120261017111698043000613131310000301007004270061700617004270061
40204700415241001002007004569791597192540120301031000230100100006160783342542049669800700607006064656364944401003020010000602002000070060351140201100991001000030100100000100100011010002002110000111120261017111698233000613131310000301007006170061700427006270061
4020470041524110000201700456979159719254010830106100023010010000616068334268614966982070041700606465636496340100302001000060200200007006035114020110099100100003010010000010010002111000100041000011111026101711169804300031010010000301007006170042700427005870042
402047006052410010010117004269788597012540108301031000230100100006160953342686149669780700577004164653364960401003020010000602002000070062351140201100991001000030100100000100100032110002000110000111100261017111698233000613101310000301007004270061700617004270042
4020470060525110000210700266979159719254010830106100023010010000616095334176914966978070041700606465636497340100302001000060200200007004135114020210099100100003010010000010010002111000200111000011111026101711169820300031301310000301007006170042700617004270042
4020470060525100100110700456978859716254010430103100023010010000616068334176914966967070042700606463736494440100302001000060200200007006035114020110099100100003010010000010010001201000100011000011011026101711169804300031013010000301007004270061700617006170061
4020470060524110100100700456979159716254010830103100023010010000616078334268614966977070057700576465336496340100302001000060200200007006035114020110099100100003010010000010010003211000200011000011010026101711169804300060131310000301007006170061700617004270061
402047004152511010010070045697885970125401043010310001301001000061607833425420496696307006070060646563649634010030200100006020020000700603511402011009910010000301001000001001000320100010104100001111102610171116980430006130010000301007005570061700617006170061

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cd | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570057524000101070036697755971025400203001310002300101000061699133422540496697170051700516465336497640010300201000060020200007005135114002110910100003001010000110100001100007701810000111002522027122698143000510101010000300107003670052700527005270142
40024700575250001010700206977559710254001630013100013001010000616991334289004966971700517005164669364976400103002010000600202000070051351140021109101000030010100000101000011000051001000011000252002712269814300030101010000300107005270052700527005270141
400247003552500000007003669775596952540014300131000130012100006169913342254049669557005170051646693649764001030020100006002020000700513511400211091010000300101000001010000110000770010000000002520027122698143000310101010000300107005270036700527005270052
400247005252500010107003669775596952540014300131000030010100006169913341470049669557005170051646723649764001030020100006002020000700353511400211091010000300101000001010000110000820010000110002520027122698143000310101010000300107005270052700527003670055
400247009052500000007003669775597102540014300131000130012100006170683342254049669717005170051646533649764001030020100006002020000700513511400211091010000300101000001010000110000740010000110002520027122698143000310101010000300107005270052700527005270119
400247005152400010107003669757597102540014300151000130012100006169913342254049669717005170051646693649774001230020100006002020000700353511400211091010000300101000001010000110000740010000100002520027123698143000310101010000300107005270052700527005270133
400247005252400010107002069775597102540014300101000130010100006169913342254049669717005170051646533649764001030020100006002020000700513511400211091010000300101000001010000110000720010000000002522037122698143000310101010000300107005270036700527005270137
400247005152400010107003669775597102540014300131000130010100006169913342254049669717005170051646693649764001030020100006002020000700353511400211091010000300101000001010000110000750010000000002522027124698143000310101010000300107005270052700527005370036
40024700545250001000700366974359710254001430013100003001010000616991334225404966971700517005164671364976400103002010000600202000070051351140021109101000030010100000101000011000080010000110002520027122698143000010101010000300107005470054700527005370073
4002470053525001265176107003869901597105040016300351000130443100516191613342254149669717005170051646723649604001030020100006002020000700363511400211091010000300101000001010000010000540010003110002522027143698203000310101010000300107005570054700527005570052

Test 4: throughput

Count: 8

Code:

  ldrsh x0, [x6, x7, lsl #1]
  ldrsh x0, [x6, x7, lsl #1]
  ldrsh x0, [x6, x7, lsl #1]
  ldrsh x0, [x6, x7, lsl #1]
  ldrsh x0, [x6, x7, lsl #1]
  ldrsh x0, [x6, x7, lsl #1]
  ldrsh x0, [x6, x7, lsl #1]
  ldrsh x0, [x6, x7, lsl #1]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267232001117802267072181812258010010080000100800145001166596049236452673326730166506166748011420080024200160048267277111802011009910080000100800001008000039080039042800006135390111511821622267040000800001002670826723267282672326708
802042672220001045112670701818025801001008000010080015500116585604923633267292672216653616659801152008002420016004826707711180201100991008000010080000100800000080039008000061353901115118216222671901004800001002672326728267232670826708
8020426707200100000266922181811258010010080000100800165001167875049236532673726722166356166798011520080024200160048267077111802011009910080000100800001008000039080039035800396039430111511811622267190604800001002672826708267232672826723
80204267222000004110267132120162580100100800001008001550011673030492363326716267281665061667480115200800242001600482672771118020110099100800001008000010080000008003903980000010041115118216222672401064800001002672826728267232670826723
802042672220000001126707000122580100100800001008001450011665960492364826720267281665861667480115200800242001600482670771118020110099100800001008000010080000390800390388003561393901115118116122671906104800001002672826728267232672826723
8020426722200000410226707218181625801001008000010080014500116787504923649267362673116650616674801142008002420016004826722561180201100991008000010080000100800004308003903980035612390111511821621267040062800001002670826723267282672326728
80204267272001000112671220120258010010080000100800155001167875149236422684726743168236166748011420080024200160048267225611802011009910080000100800001008000039080000038800006135390111511811612267190062800001002672326723267232670826708
80204267222000004101267072018025801001008000010080015500116787504923647267352672216650616679801142008002420016004826727561180201100991008000010080000100800003908003903980040010001115118216222670406102800001002670826728267082672326728
8020426722200000410126707001212258010010080000100800155001166596149236342671526725166506166598011520080024200160048267277111802011009910080000100800001008000039080035035800396135001115118116122671901062800001002672326708267232670826728
802042673020000053112670720016258010010080000100800165001167303149236562673626731166356166598011420080024200160048267227211802011009910080000100800001008000000800000358000061354301115118216122670400104800001002672826728267082672326728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d0 | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252673720011001006703267223773258001010800001080000501168472010492365726736267151668131671780010208000020160000267378611800211090108000010800000108002019448001910060800416158441900502101171616172671111713080000102674526740267522673726716
800242673720111110107302267222072025800101080000108000050116721901049236572673626736166603167178001020800002016000026715851180021109010800001080000010800191943800591006180040615843192050210114161017267341030580000102671526737267372673726738
800242673720011020002102267223771925800101080000108000050117397501049236562673626736166813166948001020800002016000026714851180021109010800001080000010800192008001910161800396019431920502101151617142673310313580000102673926737267372671626737
8002426737200111110067032672107720258001010800001080000501167219010492365626736267361668231671780010208000020160000267366411800211090108000010800000108001920448005900160800416159431900502101161617162673410313580000102673926737267382673826737
800242673620011120006600266993001925800101080000108000050117397501049236352671526736166813167168001020800002016000026715851180021109010800001080000010800192043800190016180040611901900502101101617102673311713580000102673826744268422673826738
800242671520011021006602267003772258001010800001080000501166522010492365726715267371668131671780010208000020160000267368511800211090108000010800001108002019438006200021800000159431900502131171614172671111413580000102671626738267152673826716
8002426736201111200066022670030720258001010800001080000501166277010492363426737267151668231671780010208000020160000267368511800211090108000010800001108002019438001910121800390059431920502101161615172673310513580000102671626738267382673726737
80024267152001112000660226721077192580010108000010800005011682860104923657267362673716660316716800102080000201600002673685118002110901080000108000011080020210800590012180040615901920502101171617172673310313080000102671626738267382673726716
8002426736200110210021022670027019258001010800001080000501167791010492365626736267151668131669480010208000020160000267158511800211090108000010800001108001921438005910261800406058431920502101916179267331030580000102671626738267382673826715
80024267362001101100700226722079192580010108000010800005011739750104923656267372673616681316716800102080000201600002673785118002110901080000108000001080019214380059101618004061580191050210117161714267331040080000102674726738267502671626715