Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (register, sxtw, 64-bit)

Test 1: uops

Code:

  ldrsb x0, [x6, w7, sxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03090e0f1e1f22233a3f4346494f51schedule uop (52)schedule ldst uop (55)dispatch ldst uop (58)simd uops in schedulers (5a)6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d cache miss ld (a3)a5ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acafb5b6bbl1d cache miss ld nonspec (bf)cfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)f5f6f7f8fd
1005389300141010137421818122510001000100014774389389212324710001000200038971111001100010000100039010350035103561353973116113866621000393390390390390
1004389301141000137421818122510001000100014838389389212324710001000200038971111001100010000100039010350035103561353973116113866621000390390392390390
1004389300041010137421818122510001000100014838389389212324710001000200038971111001100010000100039010350035103561353973116113866621000390390390390390
1004389300141000137421818112510001000100014838389389212324710001000200038971111001100010000100039010350035103561353973116113866621000390390390390390
1004389300141010137421818122510001000100014838389389212324710001000200038971111001100010000100039010350035103561353973116113866621000390392390390390
1004389300141010137421818112510001000100014844389389212324710001000200038971111001100010000100039010350035103561353973116113886621000392390406391390
1004389300045010137421818122510001000100014838389391212324710001000200039171111001100010000100039010350035103561353973116113866621000390392390390390
1004389300147010137421818112510001000100014838389389212324710001000200038971111001100010000100039010350035103561353973116113866621000390390390390392
1004389300141010137421818122510001000100014838389391212324710001000200039171111001100010000100039010350035103561353973116113866621000390390390390390
1004389300141010137621818122510001000100014844389391212324710001000200039171111001100010001100039010350035103561353973116113886621000392390390390390

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk instruction (07)mmu table walk data (08)09l2 tlb miss instruction (0a)l2 tlb miss data (0b)0e0f18191e22243f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)66696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5l1d cache miss ld nonspec (bf)c2branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
4020570057529101101110011070039697745971325401003010310001301001000061617533423986649669747003770054646503649384010030200100006020020000700353511402011009910010000301001000001001000001100001610000110026101711169817300001301010000301007005670057701467005970036
402047003552400000000001007004269782597132540104301191000130100100006161753342398049669747003670149646503649394010030200100006053020000700383511402011009910010000301001000011001000001100000010000012026101741169817300001313010000301007005370036700367005570055
4020470091524000000000000170036697855969525401003010310001301001000061617533422540496697470054700516465036495740304302001000060200200007005435114020110099100100003010010000010010000011000010100001100261017111698173000013131310000301007005270058700527005270036
4020470054525000000000010170039697885971325401323010310001301001000061604133423980496697470054700546465036495740100302001000060200200007005435114020110099100100003010010000110010003011000000100001100261017111697983000013101310000301007004070057700527005270036
4020470057525000000000010070039697645971325401043010310000301001000061601433442420496697470051700546465036502140100302001000060200200007005435114020110099100100003010010000010010000001000002254100001100261017111698173000013131310000301007005570058700557005570055
402047003552500000000000007005869792597134840104301001000130100100006160143342398049669747003570035646313649384010030200100006020020000700353511402011009910010000301001000001001000001100000010000000026101711169814300001301010000301007039970052700687005370053
40204700355250000000000100700366976459695254010430100100013010010000616014334239804966971700547005764651364957401003020010000602002000070056351140201100991001000030100100000100100000010000001000011002610171116981430003013010000301007005970055700527005570055
402047003552500000000001007002069782596952540104301031000130100100006160413342254049669557005170054646503649574010030200100006020020000700353511402011009910010000301001000011001000001100000010000110026101711169817300061301310000301007011970053700557005270036
402047003552400000010000017002069785597155140104301031000130100100006160143341470049670687005470054646313649574010030200100006020020000700353511402011009910010000301001000001001000001100001224110000110026101711169817300031301310000301007005870036700607005570055
402047003552400000011001007002069764597102540100301031000130100100006160413342254049669717005470054646503649544010030200100006020020000700513511402011009910010000301001000001001000001100000010000000026161711169818300031001010000301007003770057700557005570052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f1e223a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cdcfd5map dispatch bubble (d6)ddfetch restart (de)dfe0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
4002570053524000011100700266970259715254001430016100023001010000617036334249414966967070047700476466836496040010300201000060020200007003535114002110910100003001010000110100000110000003100001101002525171112698043000690910000300107005470057700427005770057
40024700535241011101007003969728596952540014300131000130010100006170683342206049669700700507003564665364975400103002010000600202000070035351140021109101000030010100000101000001100003100100000000002525171112697983000300910000300107005170051700497005470051
4002470037524000000100700206974359793254001830016100023001010000617009334249414966961070053700566467436496640010300201000060020200007005335114002110910100003001010000010100021110014100100001010002525171212698133000096610000300107004870051700487005470057
4002470041525101000100700266970259743254001030013100013001010000617068334220604966970070078701256465836497540010300201000060020200007005335114002110910100003001010000010100000110002011100000111202525171112698043000696610000300107004270054700427005770048
4002470047525000000100700206974359796254001830013100023001010000617036334176914966961070056700566467436498140010300201000060020200007005035114002110910100003001010000010100000010001001100001101002525171112698043000696910000300107005770042700577004270049
4002470048525000000000700356976059795254001430010100003001010000616982334220604966955070050700356466836497240010300201000060020200007003535114002110910100003001010000010100000110000000100001010002525171122698133000096910000300107003670036700367005770051
40024700395240000001007003569743597072540014300131000130010100006170683342110049669730700507005164723364975400103002010000600202000070050351140021109101000030010100001101000001100000054100121010002525171112698133000396910000300107005470054701507025370053
4002470222524000000100700206972859706254001030013100003001010000616982334220604966967070050700506465336497540010300201000060020200007005077114002110910100003001010000010100000110000003100000010002525171112698153000090010000300107005170036700517005470233
40024700475260001003010700366976059715254001430016100023001010000617009334181704966961070041700416467136496640010300201000060020200007005335114002110910100003001010000010100012110000303100001010002525171112698133000309910000300107005170051700487005770059
4002470041524100011210700286976559701254001030013100013001010000616982334220604966955070050700356465336497240010306821000060020204407005035114002110910100003001010000110100000110001101100001111002525171112698193000600610000300107004270042700427005470048

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f1e1f22233a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
40205700575250000002000070036697825971025401043010310002301001000061601433422541496697107005170035646473649544010030200100006020020000700513511402011009910010000301001000001001000001100000001000011100026101711069814300031001010000301007005270052700527005270052
4020470051525000000190100700366978259695254010430103100013010010000616014334147014966971070051700516464736495440100302001000060200200007005135114020110099100100003010010000010010000011000000010000101000261017111698143000310101010000301007005370052700527005270052
40204700355240000001010070036697825971025401043010310001301001005761685333422541496697107005170035646473649544010030398100006020020000700513511402011009910010000301001000001001000000100000001000010100026101711169814300031010010000301007003670052700527005270052
402047005152400000010100700366978259710254010430103100013010010000616175334225404966971070051700356464736495440100302001000060200200007005135114020110099100100003010010000010010000011000000010000101000261017111698143000310101010000301007005270052700527005270052
402047005152500001010100700206978259695254010430103100013010010000616175334225414966971070051700516464736493840100302001000060200200007005135114020110099100100003010010000110010000011000000010000101000261017111698143000310101010000301007005270052700527005370052
402047005152400000010100700366976459710254010430103100013010010000616175334147014966971070051700516464736495440100302001000060200200007003535114020110099100100003010010000010010000011000000010000101000261017111698703000310101010000301007005270052700527005270052
402047005152400000000100700366978259695254010430103100013010010000616175334225404966971070051701606464836495440100302001000060200200007005135114020110099100100003010010000010010000011000000010000101000261017111698143000310101010000301007005470052700527005270052
4020470051524000000101007003669782597102540104301131000030251100006160143342686149669710700517005164647364954401003020010000602002000070051351140201100991001000030100100000100100000110000300100001010002610171116981430000010010000301007005270052700367005270052
402047005152500000010100700366978259695254010430100100013010010000616014334225404966955070051700516463136495440100302001000060200200007005135114020110099100100003010010000010010000011000000010000101000261017111698143000310101010000301007005270052700527005270052
402047005152500000090100700366978259710254010430103100013010010000616175334225414966955070051700516464736495440100302001000060200200007005135114020110099100100003010010000010010000011000000010000101000261017111698143000310101010000301007005270052700367005270052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01)cycle (02)03mmu table walk instruction (07)090e0f1e223f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5l1d cache miss ld nonspec (bf)branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
400257004752501007070020697285982525400143001310001300101000061695233420620496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100000010000110252017111698103000366610000300107004870048700487005470048
400247004752500011170032698375970725400143001310004300101000061695233420621496696770047700476466536497240010300201000060020200007004835214002110910100003001010000010100001100008010000111252017111698103000366610000300107004870036700487004870048
400247004952400001070032697285978025400143001310001300101000061695233420621496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100000010000110252017111698103000366610000300107003670048700487004870048
400247004752400001170032697285978625400143001310001300101000061695233420621496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100141010000110252017111698103000366010000300107004870048700487017170048
400247004752500001170032697285972325400143001310001300101000061695233420620496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100000010000110252017111698193000366610000300107004870048700487004870048
400247003552500001070032697285980425400143001310001300101000061706833420620496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100003010000110252017111698103000366610000300107004870048700487004870048
400247004752400101170032697285980125400143001310001300101000061695233420620496696770047700856466536497240010300201000060020200007004735114002110910100003001010000010100001100000010000110252017111698103000366610000300107004870048700487005470049
400247004752400001170032697285970625400143001310001300101000061706833420620496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100000010000110252017111698103000066610000300107004870048700487004870048
400247004752500001170032697285979025400143001310001300101000061695233420620496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100001010000110252017111698103000366610000300107004870036700487004870048
400247004752500001070032697285980025400143001310001300101000061695233420620496696770047700476466536497240010300201000060020200007005035114002110910100003001010000110100001100008010000110252017111698103000366610000300107004870048700367004870048

Test 4: throughput

Count: 8

Code:

  ldrsb x0, [x6, w7, sxtw]
  ldrsb x0, [x6, w7, sxtw]
  ldrsb x0, [x6, w7, sxtw]
  ldrsb x0, [x6, w7, sxtw]
  ldrsb x0, [x6, w7, sxtw]
  ldrsb x0, [x6, w7, sxtw]
  ldrsb x0, [x6, w7, sxtw]
  ldrsb x0, [x6, w7, sxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01)cycle (02)030e0f1e22243a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6061696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5b6bbl1d cache miss ld nonspec (bf)branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd0map dispatch bubble (d6)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
80205267322001041001267072120142580100100800001008001550011678751349236472672226707166506166748011420080024200160048267077111802011009910080000100800001100800000398003503680035613539111511831626719662800001002673026735267082672326723
80204267072000041000267073181816258010010080000100800155001166888134923642267222672716655616674801152008002420016004826707561180201100991008000010080000010080000039800360080000013539111511831626719602800001002672026723267082672326723
802042670720000010026707201816258010010080000100800145001166596134923647267222672216655616659801142008002420016004826722561180201100991008000010080000010080000039800000358003561039111511831626719604800001002672026727267082672926723
80204267222000000002671220012258010010080000100800145001159814134923642269022686716635616659801152008002420016004826722561180201100991008000010080000110080000008003503580035013539111511831626719060800001002672126709267082672326723
802042672220010411012671520121225801001008000010080015500116787513492362726727267071665061665980115200800242001600482672771118020110099100800001008000001008000003980035035800396135391115118316267041062800001002673026710267282670826708
802042672720000000126707218016258010010080000100800145001167875134923627267222672216650616674801142008002420016004826727711180201100991008000010080000010080000039800350358000061350111511831626724662800001002671926723267082672326708
80204267072000041001267072181812258010010080000100800155001167875134923642267222672216650616674801152008002420016004826707561180201100991008000010080000010080000008003900800396135391115118316267191000800001002673726724267282670826708
802042670720000000026707201202580100100800001008001450011673031349236472672226707166556166598011420080024200160048267225611802011009910080000100800001100800000398003500800356000111511831626719662800001002671226782267322672826723
802042670720000411012670721818132580100100800001008001550011678751349236422670726707166556166598011420080024200160048267077111802011009910080000100800000100800000398003903880035003543111511831626704060800001002673526708267282672326723
80204267222000141000267072181202580100100800001008001450011678751349236272670726727166586166598011420080024200160048267227111802011009910080000100800000100800000080039035800396100111511831626719060800001002671426863267302671626723

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01)cycle (02)0309l2 tlb miss data (0b)0e0f191e22233a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)a5ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acafb5b6bbl1d cache miss ld nonspec (bf)cfd5map dispatch bubble (d6)daddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
800252672320010000411012670721818142580010108000010800005011666914923642267222670816667316702800102080000201600002672271118002110910800001080000108002103908003500680000613639502011601126719066280000102672326723267232672326734
800242672820000000411012669301818225800101080000108000050116760549236422672226722166673167028001020800002016000026722561180021109108000010800001080000039080036003880035603539502011601126719066280000102672426844267302674126723
8002426722200000004110126707218188825800101080000108000050116760549236492672226725166523167028001020800002016000026722711180021109108000010800001080000039080035033580035003539502011601126719066280000102682226713267092672926723
80024267222000000041101267072181872580010108000010800005011676054923628267222672216652316702800102080000201600002672256118002110910800001080000108000003908003500358000060039502011601126705266080000102684226729267192672926709
80024267082000000000012669300181425800101080000108000050116760549236422672226722166523167028001020800002016000026708561180021109108000010800001080000039080027003580035613539502011601126719006080000102683026714267252672926723
80024267082000000041001267072181815258001010800001080000501167605492362826722267221666831670280010208000020160000267225611800211091080000108000010800000008003504041800350100502011601126719066280000102672326723267232672326725
800242672220000000010126707218181225800101080000108000050116772649236452672726710166693167028001020800002016000026722711180021109108000010800001080000039080035003580035613539502011601126719066080000102673026723267872672326723
80024267222000000041001267072181819258001010800001080000501167605492364226708267221666731670580010208000020160000267227211800211091080000108000010800000390800350015880035613539502011601126719066280000102680026731267232673126723
80024267222000010053000267070181810225800101080000108000050116760549236312672726726166713167028001020800002016000026722711180021109108000010800001080000039080035003580035603539502011601126705060280000102672326723267232672326725
800242671020000000121012671221818922580010108000010800005011676054923642267222672216668316702800102080000201600002672271118002110910800001080000108000003908003500358000061039502011601126719066280000102672826723267092671226726