Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (register, lsl, 32-bit)

Test 1: uops

Code:

  ldrsh w0, [x6, x7, lsl #1]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
10053893110100376218181225100010001000148380395394212324710001000200037471111001100010000100039100003510356103973116113860621000391375390395390
1004389300410013742181812251000100010001406003893892123249100010002000389711110011000100001000391035035103561353973116113716621000375390375375390
10043892000001359201216251000100010001483813943892163252100010002000374711110011000100001000391039039103961043731161137110641000390395375395375
1004389300410023800120162510001000100014060139837421632521000100020003947111100110001000010003910394351039613943731161138610601000390395395375378
10043943004510037401812122510001000100014989139438919732521000100020003945611100110001000010000103903810390135073116113910641000375395395395398
100437420045000379218121625100010001000149891374394197323210001000200039471111001100010000100039103500103561354373116113914621000375390390390375
1004391300451003790121216251000100010001406013943942173252100010002000374711110011000100001000010350391039013907311611371101001000375395395395395
10043743004100237920002510001000100014774139439421732321000100020003747111100110001000010003910390391039613543731161138610051000395395395395395
10043943004100237400122251000100010001483813943762163252100010002000394711110011000100001000391000139103561043731161139110041000395375395375375
1004394200451023590120122510001000100014989137439419732321000100020003747111100110001000010003910000010006139437311611391101021000395395395375393

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6, x7, lsl #1]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0065

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 3a | 3f | 43 | 49 | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700655251001000014710700501069788597082540112301061000330100100006161403342926149669850700657004964726364968401003020010000602002000070153351140201100991001000030100100000100100012010003012100010211126101711169812300090101010000301007006170050700507006670061
402047004952510011100290070050116979659724254011230109100033010010000616140334215814966969070057700656475736495240100302001000060200200007006535114020110099100100003010010000010010002211000201210001111112610171116982830009010010000301007006670061700667005870050
402047006552510111000310700341169796597082540108301061000330100100006161403342926149669690700657004964758364952401003020010000602002000070049351140201100991001000030100100000100100012010002002100010211126101711169828300060101010000301007006670050700667006670066
40204700655241011000036107005011697965972425401083010910003301001000061614033421581496698507004970065647673649684010030200100006020020000700653511402011009910010000301001000001001000210100020151000112010261017111698123000610101010000301007006670066700667017870050
402047004952411110000361270050106978059724254010830109100033010010000616140334215814966969370065700656467736495240100302001000060200200007006535114020110099100100003010010000010010002221000201210001121102610171116981230006001010000301007006670066700507005070066
4020470049525111000002900700501069796597082540108301061000330100100006161403342158149669690700657006564755364968401003020010000602002000070049351140201100991001000030100100000100100011010003012100011201026101711169914300060101010000301007006670061700587006170058
40204700655251000000054107005010697805972425401083010910003301001000061614033421581496698507004970049647003649524010030200100006020020000700653511402011009910010000301001000001001000120100020021000102110261017111698283000610101010000301007006670050700667006670061
40204700655241010000039107005011697805972425401123010910003301001000061614033429261496696907006570049647303649524010030200100006020020000700653511402011009910010000301001000001001000110100020121000112011261017111698283000610101010000301007005070068700667006670050
4020470065524110000003000700501069796597242540108301061000330100100006161403342926149669850700497004964772364968401003020010000602002000070065351140201100991001000030100100000100100023210002022100011201126101711169828300090101010000301007006670066700667006670050
40204700495251001000039007005011697965970825401123010910002301001000061599633429260496696907006570065647273649634010030200100006020020000700603511402011009910010000301001000011001000121100020121000112110261017111698283000610101010000301007006670066700507005070066

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570057524111001007003969775597102540014300131000230010100006170453342542149669710700517005164669364976400103002010000600202000070051351140021109101000030010100000101000011000000100001100252004714469814300031001010000300107005270036700527005270052
40024700515250000111070036697755971025400103001310001300101000061699133422541496702907003570051646533649604001030020100006002020000700513511400211091010000300101000001010000110000001000011002520037144698143000310101010000300107003670036700527005270036
4002470094525000001007003669775596952540010300131000130010100006169913342254149669580700517005164669364976400103002010000600202000070051351140021109101000030010100001101000011000003100001110252002712669798300030101310000300107005570052700527005270052
4002470035524000001107003669775597102540014300101000130010100006169913342254149669710700527003564672364976400103002010000600202000070035351140021109101000030010100000101000011000000100001110252004713569814300030101010000300107005270052700527009070052
4002470051524000001107002069778597102540014300101000130010100006170683341470149669710700517003564669364976400103002010000600202000070035351140021109101000030010100000101000011000000100001010252002714269815300031001010000300107005270052700527003670036
40024700515250000001070036697755971025400143001310001300101000061699133422541496697107003570051646693649764001030020100006002020000700513511400211091010000300101000001010000010000001000011102520027124698143000310101010000300107003670052700527005270036
4002470051524000001107003669775596952540010300101000130010100006169913342254149669550700517005164653364960400103002010000600202000070051351140021109101000030010100000101000011000000100001100252004713469814300031001010000300107005270052700527005570052
40024700515240000061070020697755971025400143001310001300101000061699133422541496697107003570051646693649764001030020100006002020000700513511400211091010000300101000001010000010000001000011102520047142698043000610101010000300107005870170700587005270036
40024700355250000000070036697755969625400143001010000300101000061706833422541496700607005170035646533649764001030020100006002020000700353511400211091010000300101000001010000110000001000011102520047142698173000010101010000300107005270052700367005270052
40024700515250000001070036697755971025400143001310001300101000061706833414701496697107003570035646693649764001030020100006002020000700513511400211091010000300101000001010000010000001000000102520047142698163000010101010000300107005270055700527003670036

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6, x7, lsl #1]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700515430098175013201070036701095986625401043010310002301001000061603233422540496697107011670051646803649574010030200100006020020000700513511402011009910010000301001000001001000011001000100001100261017111698103000366610000301007004870048700487004870048
40204700475250000100070032697355970625401043010310001301001000061601533420620496696707004770047647133649504010030200100006020020000700353511402011009910010000301001000001001000011000000100001100261017111698103000366610000301007004870048700487003670048
40204700475250000700070032697355970625401043010310001301001000061601533420620496696707004770047647493649504010030200100006059420000700473511402011009910010000301001000001001000011000000100001100261017111698103000066610000301007004870048700487004870048
402047004752400001900170032697355970625401043010010004304131000061601533420620496696707004770035646563653184010030200101116020020000700473531402011009910010000301001000001001000011000000100001110261017111697983000366010000301007004870036700487004870048
402047004752500004600170032697355970625401043010310001301001000061601533420620496695537004770035646903649384010030200100006020020000700473511402011009910010000301001000001001000011000000100001100261017111698103000366610000301007004870048700487015970048
402047004752500001300170032697355969525401043010310000301001000061601533420620496696707004770047646753649384010030200100006020020000700473511402011009910010000301001000001001000011000003100001100261017111698103000306610000301007004870048700487004870048
402047003552400002800070032697645970625401043010310000301001000061601533420620496696707004770047647193649504010030200100006020020000700473511402011009910010000301001000001001000011000000100000100261017111698103000366610000301007004870048700487004870084
402047004752510001900170032697355970625401003010310001301001000061601533420621496696707004770035647163649384010030200100006020020000700353511402011009910010000301001000001001000011000000100001100261017111697983000366610000301007003670048700487004870048
402047004752400002800170032697355970625401043010010001301001000061601533420620496696707004770047647173649504010030200100006020020000700473511402011009910010000301001000001001000011000030100001100261017111698103000366610000301007004870048700487004870048
402047004752400002800070032697355970625401043010010001301001000061601533420620496695507004770047647433649504010030200100006020020000700473511402011009910010000301001000001001000011000000100001100261017111697983000366610000301007003670148700487004870048

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005352400000020070036697435970925400103001310002300101000061707233446060149669767005770057646753649604001030020100006002020000700503511400211091010000300101000011010000011000000010000111110252000027100011698043000390910000300107005470057700427005770094
4002470060525111000700700416977959705254006330016100013001010000617009334249401496696170053700416467436498140010300201000060020200007005635114002110910100003001010000010100012110001391410000011100252001037100011698043000660910000300107004270042700427005770108
40024700535241210002107003869780597122540018300131000230010100006170363342494004966973700537005364671364978400103002010000600202000070060351140021109101000030010100000101000121100014521010000110100252000019900011698193001299910000300107005170058704117005470110
4002470050525000000130070036697435970925400103001310000300101000061699133422060149669677005070050646583649604001030020100006002020000700373511400211091010000300101000001010000011000090010000100000252000037100011698103000366610000300107003670036700517004870089
40024700485240000000007003669728596952540014300101000130010100006169913341470014966970700357003564654364975400103002010000600202000070050351140021109101000030010100000101000000100004300100010000002520000171000116979830000219910000300107005170036700517004970103
400247004752400000001070035697605970925400143001310004303021000061707733422540149669557003570047646683652024001030020100006002020000700353521400211091010000300101000001010000001000010010000101000252000127111023698133000360010000300107004870048700487003670038
4002470035524000000600700326976059695254001430010100013001010000617068334147000496697070050700506466836497540010300201000060020200007005035114002110910100003001010000010100000110000380010000001000252000017110132698103000060910000300107004870036700517005170103
40024704195240000000007003269743597092540010300131000130010102046170773342062014966967700357005064653364975400103002010000600202000070035351140021109101000030010100000101000001100125306100001010002520000171000116981130003309010000300107003670036700487003670090
4002470050525000000121070032697605970625400143001310000300101000061706833414700149669727003570050646683649754001030020100006002020000700563511400211091010000300101000001010000001000043031000010100025200001710001169798300032127910000300107005170036700367005270094
40024700525250000001300700206974459709254001430010100003001010000617068334220600496697070047700506466536497540010300201000060020200007005035114002110910100003001010000110100000010000380912610000000000252000017100011698133000099610000300107004870048700487004870114

Test 4: throughput

Count: 8

Code:

  ldrsh w0, [x6, x7, lsl #1]
  ldrsh w0, [x6, x7, lsl #1]
  ldrsh w0, [x6, x7, lsl #1]
  ldrsh w0, [x6, x7, lsl #1]
  ldrsh w0, [x6, x7, lsl #1]
  ldrsh w0, [x6, x7, lsl #1]
  ldrsh w0, [x6, x7, lsl #1]
  ldrsh w0, [x6, x7, lsl #1]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526732200100000147122670721212162580100100800001008001450011673031492364726840267361670761668480116200800242001600482673381118020110099100800001008000001008001820080054001598003861574219211151180160026729992800001002673326734267342673426734
8020426732201100011630126712212016258010010080000100800145001167303049236472683426735167476166848011620080024200160048267338111802011009910080000100800000100800192042800571015980038615742001115118016002672410104800001002670826728267232672826728
80204267272000000006002267122121216258010010080000100800155001167303049236272672626731166556166638011620080024200160048267087111802011009910080000100800000100800000398003500039800006135390011151180160026704064800001002672326723267282672326723
802042672720000000077012671701818162580100100800001008001650011673480492365226835267381666661668480115200800242001600482673282118020110099100800001008000001008000003980039000398000061353919311151180160026729992800001002673326733267332673326734
80204267322001100009513267172181814258010010080000100800145001169563049236522684026732166616166798011520080024200160048267077111802011009910080000100800000100800000438003900039800396104300111511801600267241062800001002672826723267282672826728
80204267072000000007201267120001625801001008000010080016500116730304923647268362673216661616674801152008002420016004826727128118020110099100800001008000001008000003980039000398003961354319011151180160026729092800001002674026733267332673326734
802042673320110000083032671720181525801001008000010080016500116773304923652268272671816742616684801162008002420016004826732811180201100991008000010080000010080020194280019000598003761574219111151180160026729092800001002673326733267152671526733
8020426732200111000101122670721212162580100100800001008001550011598140492364726843267361666361667980114200800242001600482672772118020110099100800001008000001008000003980039040398003961394319011151180160026711990800001002673326733267152673326734
80204267332001111006802267122121216258010010080000100800155001167303049236472684026737166676166848011620080024200160048267328111802011009910080000100800000100800201942800191005980037615742001115118016002672410104800001002672826728267282672826728
802042672720000000084122671221212162580100100800001008001650011658560492364726835267411666761668580115200800242001600482673281118020110099100800001008000011008001919428005710159800000019420011151180160026724662800001002672826728267282672826728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252673320110001006503267002181815258001010800001080000501166960014923652267322673216659316712800102080000201600002679482118002110910800001080000010800191942800580005980038615701905020000116000112672999280000102673326716267342673426733
800242673220010101106513267182181816258001010800001080000501167599014923634267322671516677316712800102080000201600002673381118002110910800001080000010801522242800571015980038611901925020000116000112673799280000102673326733267332671526733
800242673220010110002412267212180172580010108000010800005011676600149236522671426732166783167138001020800002016000026816811180021109108000010800000108002020428005810159800390158421905020000116000112672999280000102671626733267332671626734
80024267142001000100210226699018180258001010800001080000501166960014923635267322673216677316694800102080000201600002677764118002110910800001080000010800201908005710160800006157421905020000116000112672990280000102671626734267332671526716
8002426732200101000065022671720180258001010800001080000501165304014923652267152673216677316712800102080000201600002675882118002110910800001080000010800202008005700059800000019421905020000116000112672990280000102673326716267332673326734
8002426733201100110065112670021818162580010108000010800005011672980149236522673226715166783166958001020800002016000026813641180021109108000010800000108001920080057000242800386157421915020000116000112672909280000102673426734267492671626733
8002426732200101010065032669921818152580010108000010800005011722010149236532671526732166773167128001020800002016000026714811180021109108000010800000108001919428005600059800000057421905020000117000112671290080000102673326715267332673326733
8002426733200101100064032671701818152580010108000010800005011675990149236532673226733166773167128001020800002016000026798811180021109108000010800000108002021428005710159800386157421915020000116000112673090280000102673426715267332673326733
800242673220010110006503267182181815258001010800001080000501166960014923634267152673316677316694800102080000201600002694464118002110910800001080000010800202042800191012180000605701915020000116000112672990280000102673326715267332671626716
800242673320010101002112266992181817258001010800001080000501170204014923652267322671516677316712800102080000201600002679464118002110910800001080000010800192008005810121800380057421905020000116000112672999280000102671626733267332671626715