Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRB (register, sxtw)

Test 1: uops

Code:

  ldrb w0, [x6, w7, sxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005403300441000379210162510001000100014989398394217325510001000200037456111001100010001100043100032010386139437311611391131071000406375407399399
10043943004400013830101925100010001000152083943982213232100010002000398561110011000100011000441038038100001007311611371101401000381375415375401
100439820053100038320119251000100010001520837439822132561000100020003985611100110001000010000100000103861007311611371131041000399375399399399
10043742000000138320119251000100010001526739939822132561000100020003985611100110001000110004310380391038613907311611395101051000399395399399395
1004398300440001383211025100010001000152743983981973256100010002000398771110011000100011000010000381038603844731161137113001000399399399375375
100439820000001383211192510001000100015208398398221325610001000200037477111001100010000100001038138103861007311611391101071000375378399399399
1004398200010013830112025100010001000152743983981973232100010002000398561110011000100001000010380010386139447311611395101471000399399399399399
100437430144000137921102510001000100015208374374221325610001000200039877111001100010000100043100003810386138437311611371101071000402399375375395
10043943000000035921002510001000100014060398374221325610001000200039477111001100010000100043103800103800394473116113710071000375399399399375
100437430044100138320102510001000100014060398374199325610001000200039877111001100010000100044100003910386139437311611395131471000399399399399399

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrb w0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0057

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570057525111001107003669782597132540104301031000230100100006160413341470149670137043170067646813649604010030200100006020020000700413511402011009910010000301001000001001000420100021011000011111261017111698203000613101310000301007005870061700617006170061
402047005752511101200700456979159716254010830106100013010010000616078334268614967030700607005764656364963401003020010000602002000070060351140201100991001000030100100000100100011010002801100000101026101711169823300060131010000301007004270058700587006170058
402047006052511000200700266979159719254010830103100023010010000616095334176914966966700587005764656364963401003020010000602002000070060351140201100991001000030100100001100100021110002001100001110026101711169817300031001310000301007006170135700627004970061
4020470041524110001007003669785597132540100301001000030100100006160413342254149669937003570035646503649574010030200100006020020000700353511402011009910010000301001000001001000001100000001000010100261017111698143000313101310000301007005270055700367003670036
402047003552500010101700456979159719254010830106100013010010000616095334268614967006700577005764653364963401003020010000602002000070041351140201100991001000030100100000100100012010002021100001101026101711169823300060131310000301007004270061700427004270061
402047004152410000210700426979159716254010430106100023010010000616068334176914966989700837006264653365016401003020010000602002000070041351140201100991001000030100100000100100023110003011100000111026101711169823300030101310000301007004670046700587006470042
402047004252511000110700426979159719254010830106100023010010000616095334268614967010700417006064656364963401003020010000602002000070060351140201100991001000030100100000100100012110001001100001101026101711169820300061313010000301007005870061700587005870058
4020470060524111002007002669791597012540104301061000130100100006160683341769149669867004170060646533649444010030200100006020020000700603511402011009910010000301001000001001000111100010211000001111261017111698043000313131310000301007004270061700617006670042
40204700605241110020170042697915971925401043010310002301001000061607833426861496700670060700416465636496340100302001000060200200007004135114020110099100100003010010000010010003111000100110000111102610171116982030006001310000301007006170058700617006170058
402047006052510110200700426970259719254010430106100023010010000616095334176914967013700417006064656364963401003020010000602002000070041351140201100991001000030100100000100100022110002021100001111026101711169820300061301010000301007005870061700427006170042

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257004752510111070032697285970625400143001310002300101000061695233420621496696770047700476466536497240010300201000060020200007004735114002110910100003001010000110100001100000010000110252037123698103000366610000300107004870048700487004870048
400247004752400011070032697285970625400143001310001300101000061706833420621496696770047700356466536497240010300201000060020200007004735114002110910100003001010000010100001100000010000110252027122698103000366610000300107004870048700367004870048
400247004752400001070032697285970625400143001310001300101005161929833420621496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100001610000110252027123698103000366610000300107004870048700487004870048
4002470047525010431070032697285970625400143001310001300101000061695233420620496696770035700476466536497240010300201000060020200007004735114002110910100003001010000010100001100000010000110252027122698103000366610000300107004870036700487004870048
400247004752500011070032697285970625400103001310001300101000061695233420620496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100000010000110252037123698103000366610000300107004870048700487004870048
400247004752500011070032697285970625400143001310001301611000061695233420620496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100000010000110252027122698103000366610000300107004870048700487004870051
400247004752501111070032697285970625400143001310001300101000061695233420621496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100000100000010000110252027122698103000366610000300107004870048700487004870048
40024700475240002101070020697285970725400143001310001300101000061695233420620496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100000310000110252027122698103000366610000300107004870048700487004870048
400247004752400011070032697285970625400143001310001300101000061695233414700496696770047700476466536497240010300201000060020200007004735114002110910100003001010000110100001100000010000110252027132698103000366610000300107004870048700487004870048
400247004752400010070032697285970625400143001310001300101000061695233420620496696770047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100000010000110252027122698103000366610000300107004870048700487004870048

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrb w0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | 09 | 0f | 18 | 19 | 1e | 22 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570051525110011700206978259695254010430103100023010010000616014334225449669717005170054646473649544010030200100006020020000700513511402011009910010000301001000011001000011000000100001126101711169798300031010010000301007005270052700527005270052
4020470051525010011700206978259710254010430103100013010010000616014334225449669897005170051646313649384010030200100006020020000700513511402011009910010000301001000001001000011000000100001126101171169800300030101010000301007005270052700527005270052
4020470051525010061700366978259710254010030103100013010010000616014334225449669717005170051646473649384010030200100006020020000700513511402011009910010000301001000001001000011000000100001126101711169815300031010010000301007005270036700527005270052
4020470035524010011700206978259710254010030103100013010010000616175334225449669717005170051646473649384010030200100006020020000700513511402011009910010000301001000001001000001000000100001126101711169814300031001010000301007005270036700527005270052
40204700515240100117003669764597102540104301031000130100100006160143342254496697170035700516464736495440100302001000060200200007003535114020110099100100003010010000010010000110000001000011261017111698143000310101010000301007005270052700527005270052
402047005152401000070036697825971025401003010310001301001000061601433423024966973700357005164631364954401003020010000602002000070051351140201100991001000030100100000100100001100000010000112610171116981430000010010000301007003670052700527005270052
40204700515250100007003669764597102540104301031000130100100006161753341470496695570051700516464736493840100302001000060200200007003535114020110099100100003010010000010010000110000001000011261017111698143000310101010000301007003670052700527005270052
40204700515250100117004369782597102540104301001000130100100006161753342254496697170054700556468036495440100302001000060200200007005135114020110099100100003010010000010010000110000001000010261017111697983000310101010000301007005270052700527005270036
40204700515430100117003669782596952540104301001000130100100006160143342254496697170051700356463136493840100302001000060200200007003535114020110099100100003010010000010010000110000001000011261017111698143000310101010000301007005270036700527005270052
40204700515240100117003669764597102540104301031000130100100006160143342254496697170051700516464736493840100302001000060200200007005135114020110099100100003010010000010010000110000001000001261017111697983000010101010000301007005270036700367005270052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40025700475250100121000070032697285970625400143001310001300101000061695233420620496696707004770047646650364984400103002010000600202000070047351140021109101000030010100000101000011000000100001102520127111698103000366610000300107003670048700487004870048
40024700475240100001100070032697285970625400143001310000300101000061695233420620496696707004770047646530365001400103002010000600202000070047351140021109101000030010100000101000011000000100000002520037112698103000366610000300107004870048700487004870048
40024700475240000001100070032697285970625400143001310001300101000061695233420620496696707004770035647110365000400103002010000600202000070047351140021109101000030010100000101000011000000100001102520017111698103000306610000300107004870048700487004870048
40024700475240000006100070032697285970650400273002510001300101000061706833420621496696707004770047646650364973400103002010000600202000070047351140021109101000030010100000101000011000000100001102520017121698103000366610000300107004870048700487004870036
400247003552500000016100070032697285970625400143001010000300101000061695233414700496696707004770047646530365027400103002010000600202000070047351140021109101000030010100000101000011000000100001102520017111698103000306010000300107003670048700487004870048
40024700475250000001100070032697285970625400143001310001300101000061695233414700496696707004770047646680365000400103002010000600202000070047351140021109101000030010100000101000011000000100001002520017111698103000366610000300107004870036700487004870036
40024700355250000001100070032697435970625400143001310001300101000061695233420620496696707004770047646530365027400103002010000600202000070047351140021109101000030010100000101000001000000100001102520017111698103000360610000300107004870036700487004870048
40024700475250000001100070032697285970625400143001310001300101000061695233420621496696707004770047646650364994400103002010000600202000070047351140021109101000030010100000101000011000000100001102520017112698103000366610000300107004870048700487004870048
40024700355240000001100070020697285970625400273001310000300101000061706833422060496696707004770047646650364973400103002010000600202000070047351140021109101000030010100000101000011000010100001102520017111698103000366610000300107005670048700487004970048
40024700475240000001100070032697825970625400143001310001300101000061706833420620496696707004770035646560365007400103002010000600202000070047351140021109101000030010100001101000001000000100001102520017111698103000366610000300107004870048700487005170048

Test 4: throughput

Count: 8

Code:

  ldrb w0, [x6, w7, sxtw]
  ldrb w0, [x6, w7, sxtw]
  ldrb w0, [x6, w7, sxtw]
  ldrb w0, [x6, w7, sxtw]
  ldrb w0, [x6, w7, sxtw]
  ldrb w0, [x6, w7, sxtw]
  ldrb w0, [x6, w7, sxtw]
  ldrb w0, [x6, w7, sxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526737200011451012671221212162580100100800001008001550011673034923631267332671716635616659801152008002420016004826707771180201100991008000010080000110080000043080000003980000610011151182160026724010100800001002672826728267282672826728
80204267072000010101267120012025801001008000010080015500116585649236522673626729166556166798011520080024200160048267277711802011009910080000100800000100800000430800000039800000139431115118016002670400104800001002670826728267082672826708
80204267272000014800126692012121625801001008000010080015500116659649236272673426727166356166798011520080024200160048267075611802011009910080000100800000100800000008000000080039013801115118016112672900104800001002670826728267282670826708
80204267072000014500126712212121625801001008000010080015500116585649236552670726727166356166798011520080024200160048267075611802011009910080000100800000100800000430800390039800396058431115118016012670400104800001002672826708267282672826708
802042672720000100012669221016258010010080000100800165001166596492363626716267271665561667980115200800242001600482672756118020110099100800001008000001008000004308003900080039600431115118016002670400147800001002672826728267322673226732
802042672720000100002671220121625801001008000010080014500116730349236342670726727166356166798011520080024200160048267277711802011009910080000100800000100800000430800390008003960394311151180160026724010100800001002672826728267282672826728
8020426727200001451012671221212162580100100800001008001550011665964923635267272672716635616679801152008002420016004826727771180201100991008000010080000010080000043080039003980039010431115118016002672401000800001002672826728267282670826728
802042670720000145001267122001625801001008000010080015500116730349239522679726709166556166798011520080024200160048267277711802011009910080000100800000100800000430800000039800396000111511811600267240000800001002670826728267282672826728
8020426727200001450012671201212162580100100800001008001650011658564923778267852672916656616679801152008002420016004826727561180201100991008000010080000010080000000800390039800390039011151180161126705010104800001002672826709267092672826728
802042671120000169011266932121216258010010080000100800215001166714492381726715267141664810166668012120080030200160060267317711802011009910080000100800000100800000430800390039800396139022251281232126724010104800001002672826709267092683126728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267232000000000004100002669301212025800101080000108000050116689614923628267272672816672316707800102080000201600002672771118002110910800001080000010800000398003500428000061350050202716261926725106080000102670926728267282672826728
8002426727200000001000001002669300012258001010800001080000501166750149236482670826708166723167078001020800002016000026727721180021109108000010800001108000003980039003580039613543050202816152826725010480000102670926728267092672826709
80024267272000000000004501022669321201625800101080000108000050116675004923628267082672816652316707800102080000201600002672771118002110910800001080000010800000080039003980039613943050202716152826719010080000102672326709267232672326723
8002426722200000000000410102267132012112580010108000010800005011688430492362826728267081667231670780010208000020160000267275611800211091080000108000001080000039800000039800396035430502027162828267241010480000102672826728267292671126728
8002426708200000000000450102267072012025800101080000108000050116760504923628267082670816672316707800102080000201600002674671118002110910800001080000010800000398000000080000613943050202916292926705010080000102672926709267092672326729
80024267082000000000004500002669320002580010108000010800005011688430492362826757267351667231668880010208000020160000267225611800211091080000108000001080000039800390008003561350050202816282926725100480000102670926723267282670926728
800242670820000000000045000226712212121625800101080000108000050116884304923642267282670816652316702800102080000201600002672256118002110910800001080000010800000080000000800396004305020281614262672400480000102672826723267292672926729
8002426708200000000000450102266932120112580010108000010800005011688430492362826728267271665231670780010208000020160000267225611800211091080000108000001080000039800391008003561043050202816282726724010280000102672326728267232672326728
80024267272000000000005700002671321212025800101080000108000050116689604923648267272670816672316710800102080000201600002673571118002110910800001080000010800000438000000080000003943050201516271626724100280000102672326723267282672826709
80024267272000000000000000226693212120258001010800001080000501166750149236482672726728166523167088001020800002016000026727711180021109108000010800000108000000800390035800350035390502030161428267341010280000102670926728267282672926729