Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (register, uxtw, 64-bit)

Test 1: uops

Code:

  ldrsb x0, [x6, w7, uxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)030f1e22233a3f4346494f51schedule uop (52)schedule ldst uop (55)dispatch ldst uop (58)simd uops in schedulers (5a)60616d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)l1d tlb access (a0)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5b6bbl1d cache miss ld nonspec (bf)c2cfd0d5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)f5f6f7f8fd
100540321451013792121202510001000100015037103973942173252100010002000394771110011000100010004310390391039613943073011622391101041000395395397395395
1004394204510137921212162510001000100015018003743942173232100010002000394771110011000100010004310391391039613943073911611391101001000375395376399395
1004394314510137921212162510001000100015208003943942173252100010002000394771110011000100010004310390391039600430730116113910041000395375395395395
100439431451013832121216251000100010001501800394394217325210001000200039477111001100010001000431039039100061394307301161139110041000395395395395395
1004394314510035920120251000100010001406000394394218323210001000200039456111001100010001000431039001039603943073011611391101041000395395375395397
100439431000137921212162510001000100015037193743942173232100010002000374771110011000100010004310390421039613943073911611391101041000395395395395395
1004394314510137901212162510001000100014060003943942173232100010002000394771110011000100010004310000421039603943273911611391101041000395395395395375
1004394214500137920121625100010001000150371939439421732521000100020003745711100110001000100001039039103961390073011611391101041000395395395395375
10043943145101379212120251000100010001498900394394217325210001000200037477111001100010001000431039001000603943073011611391101041000395395395395395
10043943045011380212121625100010001000140601939437421732521000100020003947711100110001000100043103904310396100073011611391101041000395395375375395

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0055

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f1e223f43494d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
40205700555250010112170046116979259708254011230109100033010010000616104334215849669817006170061646573650644010030200100006020020000700433511402011009910010000301001000001001000002100010011000111100261017111698183000360610000301007004470056700447005670056
40204700555250000012170046116979259712254011230109100033010010000616104334273449669817006170061646573650114010030200100006020020000700553511402011009910010000301001000001001000002100010011000101100261017111698243000606610000301007005070062700627006270062
40204700615241010003070034106979259708254011230109100033010010000616104334273449669817004970049646573650414010030200100006020020000700433511402011009910010000301001000001001000002100010011000111100261017111698243000906610000301007006270050700627006270062
40204700615251101003170040116978659714254010830106100013010010000615971334244649669637005570055646513650564010030200100006020020000700493511402011009910010000301001000001001000212100020021000112110261017111698183000666610000301007009970125700597005670044
40204700435240000007070046116978059720254011230109100083010010000616104334273449669697004970061646453650094010030200100006020020000700557711402011009910010000301001000001001000002100010011000111100261017111698243000966610000301007006270062700627006270050
40204700615251110003170040116971059714254010430106100023010010000616050334244649669757005570055646513650294010030200100006020020000700433511402011009910010000301001000001001000002100010011000111100261017111698063000666610000301007005670124701117005470056
40204700555240000002170040116978659714254010830106100023010010000615971334186749669757005570043646513650524010030200100006020020000700613511402011009910010000301001000001001000112100020021000112010261017111698183000306610000301007004470056700567004470056
40204700435250000002170046106978059720254011230109100033010010000615996334273449669817006170061646573649784010030200100006020020000700553511402011009910010000301001000001001000002100010011000111100261017111698243000966010000301007006270062700507005070050
40204700495251011002170040116978659714254010830103100013010010000616050334244649669757005570055646513650444010030200100006020020000700613511402011009910010000301001000001001000122100041221000112111261017111698183000366010000301007004470056700567004470044
40204700555250000001070046116979259720254011230106100033010010000616104334273449669817004970061646573649584010030200100006020020000700433511402011009910010000301001000011001000002100010011000111100261017111698243000606610000301007006270050700507006270062

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01)cycle (02)03mmu table walk data (08)0e0f181e1f223a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5l1d cache miss ld nonspec (bf)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
4002570047524000010007002069744597132540014300131000230010100006170683341470049669557004770047646653649604001030020100006002020000700473511400211091010000300101000010100001100000010000100252037135698103000360010000300107004670416700537004970036
4002470035524000010007003269730597062540014300131000130010100006170683342062149669677004770047646533649604001030020100006040820000700473511400211091010000300101000010100001100000010000110252037155697983000366610000300107007170038700557004870048
4002470047524000010107003269729597062540014300101000130010100006170683341470049669557004770035646653649724001030020100006002020000700473511400211091010000300101000010100001100000010000110252057154697983000006610000300107007970050700567003670048
4002470035525000010107003269733596952540014300131000130010100006169523342350149669677004770047646533649724001030020100006002020000700473511400211091010000300101000010100001100000010000010252027133698103000006610000300107005670048700487004870036
4002470047524000000107003269736597062540014300131000130010100006169523341470149669557004770047646533649724001030020100006002020000700473521400211091010000300101000010100000100000010000010252047153697983000300610000300107005570053700707004870041
4002470047525000000107003269732597062540014300131000030010100006170683342062149669677004770047646653649724001030020100006002020000700473511400211091010000300101000010100000100000010000100252027144698103000360610000300107005270398700517007870048
4002470047525000010107003269743596952540010300101000130010100006169523341470149669677005770050646653649604001030020100006002020000700353511400211091010000300101000010100000100000010000100252037133698103000066610000300107010570040700487004870036
4002470047524000010007002069729596952540010300131000130010100006169523342062049669677004970053646653649604001030020100006002020000700473511400211091010000300101000010100000100000010000102252037146698103000066610000300107010270051700537004870036
4002470047524000060107002069730597062540010300101000130010100006170683341470149669677004770035646653649604001030020100006002020000700353511400211091010000300101000010100001100000310000100252037133697983000366610000300107008870052700527003670048
4002470047525000000007003269731597092540014300101000130010100006170683341470149669677003570047646533649604001030020100006002020000700353511400211091010000300101000010100000100000010000000252047145698103000366610000300107010570049700517003670048

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0057

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e1f223a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)696a6b6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2branch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
4020570051525000001001700426978859717254010430106100053026210000616014334225449669770700417005764653036495540100302001000060200200007004135114020110099100100003010010000010010002211000100110000111110261017111698203000610101010000301007005870042700587005870058
40204700575241110020107003669782597102540104301031000030100100006160143342254496697407003570035646470365041401003020010000602002000070057351140201100991001000030100100000100100022110003641110000111110261017111698233000610101010000301007004270058700587005870058
4020470057525101002010700366978259695254010430103100013010010000616014334225449669710700517005164647036503840100302001000060200200007005735114020110099100100003010010000010010003311000101110000111120261017111698203000310101010000301007005870042700587004270058
402047005752511000200070036697885971625401083010310001301001000061607833425424966977070057700576465303650344010030200100006020020000700573511402011009910010000301001000001001000620100010011000011111026101711169820300061001010000301007005870058700587005870058
402047004152510000101170026697915970125401083010310002301001000061607833425424966961070060700576465603650204010030200100006020020000700573511402011009910010000301001000011001000121100010011000011110026101711169820300061010010000301007004270058700587005870058
4020470057525110002011700426978859716254010830106100023010010000616078334254249669770700417005764637036505140100302001000060200200007004135114020110099100100003010010000010010001211000200110000111110261017111698203000610101010000301007004270058700587005870058
4020470057525100002010700266978859716254010430106100013010010000616068334254249669610700577005764653036503340100302001000060200200007005735114020110099100100003010010000010010003111000302110000111100261017111698043000310101010000301007005870058700587005870058
40204700425251100020117002669788597162540108301061000230100100006160683342542496697707005770057646370365046401003020010000602002000070057351140201100991001000030100100000100100011010001021100001111002610171116982030003100010000301007004270058700587005870042
4020470041525110012010700456979159719254010830106100023010010000616078334176949669710700517005164631036505140100302001000060200200007004135114020110099100100003010010000010010002101000201110000011100261017111698203000610101010000301007004270058700587006170042
402047005752411100101070042697915971625401083010610002301001000061606833417694966961070041700416463703650434010030200100006020020000700353511402011009910010000301001000001001000211100020011000011110026101711169817300061001010000301007005870058700587005870058

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e1f2223243a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
40025700575250000000010100070036697785971025400143001310002300101000061699133422541496697107005170051646533649764001030020100006002020000700513511400211091010000300101000011010000011000000010000101000252057154697983000310101010000300107003670036700527005270052
4002470035524000000001010007003669775596952540010300131000130010100006169913342254149669550700357004164675364966400103002010000600202000070041351140021109101000030010100000101000001100000031000010100025204714269798300031010010000300107005270052700527005270052
4002470051525010011001000007003669775596952540010300101000130010100006169913342254049669710700517005164669364976400103002010000600202000070035351140021109101000030010100000101000001100001001000010100025202714569798300031001010000300107005170055700527005270036
4002470051525000010000000007003669775597162540018300131000030010100006170683341614149669710700517005164669364976400103002010000600202000070051351140021109101000030010100001101000011100000241000300102025204712469991300140101010000300107015070052702447005270058
400247003552500000000001000700366977559710254001830016100003001010000616991334225414966974070051700516466936497640010300201000060020200007003535114002110910100003001010000110100000110000003100001110002520471426981430003001010000300107004370058700587005270052
40024700545250000000010000070036697755971050400143001010000300101000061704533423021496695507005170051646693649604001030020100006002020000700513511400211091010000300101000001010000011000000010000101000252057144697983000310101010000300107005270052700527003670052
4002470051524000000000010007003669775596952540010300101000130010100006169913341470149669710700517005164669364976400103002010000600202000070051351140021109101000030010100000101000001100000001000010000025203714369798300031010010000300107005870042700587005270052
400247005152400000000001000700366978159695254001030013100013001010000617045334254204966961070051700356466936497640010300201000060020200007003535114002110910100003001010000010100000010000000100001010002520471456981430000001010000300107005670052700527005270052
40024700515250000000000000070036697755971025400103001310000300101000061706833422540496697107003570051646533649764001030020100006002020000700513511400211091010000300101000011010000001000000010000100001252057144698143000310101010000300107005270052700527005270052
4002470121524000000001000007002069775597162540010300131000030010100006170683342254149669710700517003564653364976400103002010000600202000070051351140021109101000030010100001101000211100010011000001100025202714469814300031001010000300107005270036700367005270052

Test 4: throughput

Count: 8

Code:

  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01)cycle (02)030e0f191e22233a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5b6bbl1d cache miss ld nonspec (bf)branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)fetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
802052672220010038610126707218181225801001008000010080014500116787504923627267222672216650616743801152008002420016004826707711180201100991008000010080000110080000398003503580000610391115118016026704602800001002672326723267082670826708
802042672220000041600126710201802580100100800001008001550011678750492362726722267071665061667480115200800242001600482673171118020110099100800001008000001008000008000000800356135391115118016026704062800001002672326723267082672326723
802042670720100033800026707201812258010010080000100800145001167875049236422672226722166356166808011420080024200160048267077111802011009910080000100800000100800000800350358000000001115118016026719660800001002672326723267232672326708
8020426722200000393101267072001125801001008000010080014500115981404923642267072670716650616685801142008002420016004826707711180201100991008000010080000010080000080035008003561001115118016026704662800001002672326708267232672326708
802042672220000039500126707000025801001008000010080014500116659604923642267072672216650616680801142008002420016004826722711180201100991008000010080000010080000398000003580000010431115118016026719062800001002672326723267082672326708
8020426707200010210012670721818122580100100800001008001450011678750492364226722267271665561674780114200800242001600482672271118020110099100800001008000001008000039800350080000613501115118016026719662800001002670826708267232672326708
80204267072000003620002670720181225801001008000010080015500116659614923642267222672216650616674801142008002420016004826707711180201100991008000010080000010080000398000003580000610391115118016326719662800001002672326723267232672326708
8020426707200000386101267072180122580100100800001008001550011665960492364226707267221665061681380114200800242001600482672271118020110099100800001008000001008000008000003580035000391115118016026719662800001002670826723267232672326723
8020426707200001310126707218012358010010080000100800145001166596049236422670726711166356167388011520080024200160048267225611802011009910080000100800000100800000800350080035603501115118016026719062800001002670826723267232672326728
80204267222000104500126707200122580100100800001008001450011678750492362726707267221665061670080115200800242001600482672271118020110099100800001008000001008000039800350080035600391115118016026719662800001002672326708267082672326723

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e223a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)a5ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)dbddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
800252674120111000000670326700307212580010108026010800005011682861492366026763267361705131671680010208000020160000267366411800211090108000010800000108002021430800190006480000615943191502041602226738130580000102675026717267382671526716
8002426737200111000002102267002771925800101080000108000050116696014923657267152673716681316716800102080000201600002671585118002110901080000108000001080019214308005800061800406158431905020216033267331313580000102675426846267162673726738
80024267362001111000067102673727702580010108000010800005011642250492365626736267361668231669580010208000020160000267148511800211091108000010800001108001919440800580006180000615843191502021602226712130580000102671526737267372673726741
80024267152001011100066022672630012580010108000010800005011720090492365726736267371666031671680010208000020160000267368511800211090108000010800000108001920430800190106180040615943191502021602226711013580000102674126719267382673826737
800242673820010010000670326700377202580010108000010800005011672190492365726736267391666031671780010208000020160000267366411800211090108000010800000108001921008057912060800000159431915020225022267331313580000102674926737267372673726715
80024267372001011000067022673437020258001010800001080000501166722149236602674027035166883166958001020800002016000026714641180021109010800001080000010800202000800591126480039011943191502021602226712130580000102673826737267382673726737
800242673620111110004670026699077192580010108000010800005011653041492366526737267141668131671680010208000020160000267378511800211090108000010800001108002020430800591016080041005801915020216022267151313080000102674826748267412673726738
80024267402001012000021032670420702580010108000010800005011672191492365726736267151668131673480544208000020160000267368511800211090108000010800000108002020430800591006180000605901925020216022267341313580000102675026738267162673826737
80024267142011000000079122672527712580010108000010800005011677360492365626715267371668131671680010208000020160000267368511800211090108000010800000108001919440800580016080041605843190502021602226712013580000102674126737267412682926738
800242673720011110100760326720377332580010108000010800005011653040492365726736267361668131671680010208000020160000267378511800211090108000010800001108002020008001900061800396158431915020216022267331313580000102674926715267372673726737