Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRH (register, uxtw)

Test 1: uops

Code:

  ldrh w0, [x6, w7, uxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
10053893451013792121216251000100010001483803943942173252100010002000394711110011000100001000391039039103961354373216223866621000390390390390395
100439434500237921218162510001000100015037138939421732521000100020003947111100110001000010003910350391039613543732162239110641000392395395395395
1004394345102379212121225100010001000149890394394217325210001000200038971111001100010000100039103903910396135437321622391101041000395395395395395
1004394345001379212121225100010001000150181394394217325210001000200038971111001100010000100039103503910396135437321622391101041000395390395395395
100439434500237921212162510001000100015037039439421732521000100020003947111100110001000010003910390351039613543732162239110641000395395396395395
1004394345001374212181125100010001000149890389394217324710001000200039471111001100010000100039103903910396135437321622391101041000395395395395395
100439434600237921812162510001000100014989039439421732521000100020003897111100110001000010003910350391039613543732162238610641000395395395395395
1004394345002379212121625100010001000150370394394217325210001000200039471111001100010000100043103903910396135437321622391101021000395395390393390
100439424500137921212162510001000100015018039439421632521000100020003947211100110001000010003910350351039613543732162239110641000395395395395395
1004389345102379212121625100010001000149890394394217325210001000200039471111001100010000100039103503910396135437321622403101041000395395395395395

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrh w0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700535250000000101007031069735597062540100301031000130100100006160153342062149669550700477004764643364950401003020010000602002000070035351140201100991001000030100100001100100000110000001000011261017111698143000366610000301007004870036700487004870048
402047004752500011001010070020697355970625401043010310001301001000061601533420621496695507004770047646433649504010030200100006020020000700473511402011009910010000301001000001001000000100000010000112610171116983130003126010000301007004870048700367004870049
40204700475240000000101007003269735597072540100301031000130696100006161753342062049669670700357004764649364950401003020010000602002000070047351140201100991001000030100100000100100000110000001000011261017111698753000306610000301007004870051700367004870036
40204700355250001000000107003569735597062540100301031000130100100006161753342062049669670700477004764643364950401003020010000602002000070047351140201100991001000030100100000100100000110000001000001261017111698133000000610000301007004870048700487004870036
40204700475250000000100107002069735597062540100301031000130100100006160153342062049669670700357004764643364938401003020010000602002000070035351140201100991001000030100100000100100000010000001000010261017111698633000366610000301007004870048700487004870036
40204700475250000040100107003569735596952540100301001000030100100006160053342062049673400700477004764631364950401003020010000602002000070035351140201100991001000030100100000100100000010000001000011261017111698413000066010000301007040970049700567004870036
40204700475250000000100007003269764597062540104301031000130100100006160153342062049669670700477003564631364950401003020010000602002000070050351140201100991001000030100100000100100000110000001000011261017111698333000090610000301007004870048700487004870048
40204700475240000000100007003269735597062540104301031000130100100006160153342062149669550700477005064646364950401003020010000602002000070047351140201100991001000030100100000100100002110000001000011261017111698143000366010000301007004870048700487004870048
40204700355250000000100107002069735596952540104301031000130100100006161753342062149669670700507004764643364938401003020010000602002000070047351140201100991001000030100100000100100000110000101000011261017111698403000366610000301007004870051700487003670036
40204700355240001000100107002069735597062540100301031000130100100006160153342062149669670700357004764631364938401003020010000602002000070035351140201100991001000030100100000100100000110000031000010261017111698103000366610000301007004870048700487004870048

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 23 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40025700475250000011007003269743597062540010300131000130010100006170683342062496696770047700476466836497240010300201000060020200007005035114002110910100003001010000110100001100001010000110252047135697983000366610000300107005170049700487004870051
40024700475240000011007002069728597062540014300131000130010100006169523342062496696770047700356465336497240010300201000060020200007005035114002110910100003001010000010100001100000010000110252047145698103000366610000300107005170052700497005170049
40024700475240000010007002069743597062540010300131000130010100006169523342062496697070047700476465336497540010300201000060020200007005035114002110910100003001010000110100001100000310000111252047146698133000396610000300107005470036700527004870048
40024700505250001001007003569728597062540010300131000130010100006169523342206496696770047700476466536497240010300201000060020200007005035114002110910100003001010000110100120100000010000110252057137698103000360010000300107005370048700527003670036
40024700355240000010007002069743597062540014300131000030010100006169823342206496696770035700356466536497540010300201000060020200007003535214002110910100003001010000110100001100000010000100252057145698133000006610000300107005270058700487003670036
40024700475250000011007003569743597062540014300131000130010100006170683341470496695570047700356466536497240010300201000060020200007004735114002110910100003001010000110100000100000010000110252047155697983000399910000300107004970048700367004870051
40024700355240000010007003669728597062540014300131000130010100006169523342062496697070050700476466536497540010300201000060020200007003535114002110910100003001010000110100001100000010000110252047154699003000399910000300107008170082700527005070048
40024700475250000010007002069732600332540014300131000130010100006169823342206496696770047700356465336496040010300201000060020200007003535114002110910100003001010000110100001100000010000110252047165698103000000610000300107005070050700487005170048
40024700475250000011007003469743597062540014300131000130010100006170363342062496695570035700476466536496040010300201000060020200007003535114002110910100003001010000110100121100000010000100252047146698133000309010000300107005270048700367004870048
40024700355241011001007002069743597062540014300131000130010100006169823342206496696770048700506466536497540010300201000060020200007005035114002110910100003001010000110100001100000010000110252047135698133000399610000300107004970054700487004870051

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrh w0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057005052600201014162248105610070049697815970925401003010310002301001000061617533422061496695570050700356464336502540100302001000060200200007005035114020110099100100003010010000010010000001000010010000101000261017111698133000396910000301007005770049700487005170036
4020470050524110001006000070032697815969525401043010310001301001000061600533422060496695570047700506464636504540100303971000060200200007003535114020110099100100003010010000010010000001000000010000001000261017111698133000360610000301007003670048700517004870036
4020470047524000011000010070035697815970925401043010310004302641000061601533422061496697070053700356463136495640100302001000060200200007005035114020110099100100003010010000010010000001000000010000100001261017111698213000360010000301007003670051700517005170051
4020470050524000000001010070032697645969525401043010310000301001000061619333422060496697070035700506463136503740100302001000060200200007005035114020110099100100003010010000010010000011000000010000001000261017111698133000366010000301007005170036700517005170048
4020470050525000000000010070035697645970925401043010310001301001000061600533414700496392270035700506464636505540100302001000060200200007005035114020110099100100003010010000010010000011000000010000101000261017111698103000309010000301007005170048700517005170054
4020470050524000000001000070097697355970925401003010310001301001000061617533420620496695570050700506464636503440100302001000060200200007003535114020110099100100003010010000010010000011000000010000101000261017111698133000360910000301007005170048700367005170051
4020470035525000000000010070035697815970625401043010010001301001000061600533422060496696770050700476464636501640100302001000060200200007005035114020110099100100003010010000010010000001000000010000100000261017111698133000309010000301007004870051700487005170051
4020470035524010000000000070020697645970925401283010310001301001000061600533422060496695570050700356464636502340100302001000060200200007003535114020110099100100003010010000010010000011000000010000001000261017111698133000390610000301007004870048700517003670051
4020470050524000000001010070032697355970672401043010310001301001010261600533422060496697070581700506464336498540100302001000060200200007004735114020110099100100003010010000010010000011000000010000001000261017111698133000306910000301007003670036700367005170054
4020470050524000000001000070035697645969525401043010310001301001000061600533422060496695570035700506464336502340100302001000060200200007005035114020110099100100003010010000010010000011000000010000101000261017111697983000399610000301007003670051700367005170036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570057525000000010070020697755971025400143001010002300101000061699133414700149669717005170051646693649764001030020100006002020000700513511400211091010000300101000001010000011000000100001010025203710226979830000001010000300107003670036700527005270052
400247005152400000000007003669743597102540014300131000130010100006169913341470004966955700517005164669364976400103002010000600202000070035351140021109101000030010100000101000000100000010000100002520271022697983000310101010000300107005270052700367005270052
40024700515250000000100700206977559695254001430013100013001010000616991334225401496695570035700516466936496040010300201000060020200007005135114002110910100003001010000010100000110000001000010100252027102269798300030101010000300107005270052700527003670052
400247003552500011000007003669775596952540014300101000130010100006169913342254004966971700517005164669364976400103002010000600202000070051351140021109101000030010100000101000001100000010000101002520271022697983000310101010000300107005270052700527005270036
4002470051524000000001070020697435969525400263001610000300101000061699133422540049669717003570051646533649604001030020100006002020000700353511400211091010000300101000001010000011000003100001000025202710226981430000001010000300107005270036700527003670052
40024700515250000000100700366974359710254001430013100003001010000616991334225400496697170051700516465336497640010300201000060020200007003535114002110910100003001010000010100000010000001000000100252037102269798300001310010000300107005270036700527005270052
40024700515250000000100700366977559710254001030010100013001010000616991334225400496695570035700356466936497640010300201000060020200007005135114002110910100003001010000010100000110000001000010000252027102269798300031001010000300107005270052700367005370036
40024700355240001100100700366977559695254001430013100013001010000617068334225400496697570035700516466936497640010300201000060020200007003535114002110910100003001010000010100000010000461000010100252027102369814300031001010000300107005670052704157003970222
4002470038527100002033107003669775597102540014300131000130010100006169913342398014966974700357003564669364979400103002010000600202000070051351140021109101000030010100001101000001100000010000001002520271022698143000310101010000300107003670052700527003670036
40024700555250000000110700366977559710254001430013100003001010000617000334225411496697170051700356466936497640010300201000060020200007003535114002110910100003001010000010100000010000001000000000252027102269814300030101010000300107003670052700527003670038

Test 4: throughput

Count: 8

Code:

  ldrh w0, [x6, w7, uxtw]
  ldrh w0, [x6, w7, uxtw]
  ldrh w0, [x6, w7, uxtw]
  ldrh w0, [x6, w7, uxtw]
  ldrh w0, [x6, w7, uxtw]
  ldrh w0, [x6, w7, uxtw]
  ldrh w0, [x6, w7, uxtw]
  ldrh w0, [x6, w7, uxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267282000010110570101267120121216258010010080000100800165001167303049236372673126731166556166798011520080024200160048267275611802011009910080000100800000100800000438003900039800006039000111511816002687510100800001002672826879269972672826819
802042672720000001100010126712212016258010010080000100800145001166596049236522674126731166396166798011520080024200160048267071151180201100991008000010080000010080000043800390003980169600430011151181600267241000800001002671926708267082672826728
80204267272000000110450100266920120162580100100800001008001550011665960492365626707267071665561667980115200800242001600482672777118020110099100800001008000001008000004380039000398003961000011151181600267080104800001002672826728267082670826728
80204267272000000114750100267120120025801001008000010080014500116730304923650267312670716655616679801142008002420016004826707772180201100991008000010080000010080000043800000003980039610000111511816002672410100800001002672826728267122670826728
802042672720000000007000032672107902580100100800001008001650011673770492364426723267371666461668880116200800242001600482673785118020110099100800001008000001008002020080019020248000061594319211151411600267150135800001002674126737267372671526738
80204267142001101000121081002669820121825802301028000010080014511116659604923635267292672716655616659801142008002420016004826707771180201100991008000010080000010080000008000000039800000139430011151181610267241000800001002670826728267282673226728
802042670720000001100000126692212002580100100800001008001550011705890492371726751267401666361667180116200800242001600482671464118020110099100800001008000001008001919080058010638003861190192111511816002671113130800001002671526738267422673826715
802042673620010010006600132672230702580100100800001008001550011676280492365626718267371666861668880114200800242001600482673664118020110099100800001008000001008001920438005901164800000060431911115118160126711000800001002673826715267372671526715
80204267362001000000663520032672537731258010010080000100800165001167377049237292674026720166476166668011520080024200160048267148511802011009910080000100800000100800201943800191002180000610430011151181600267240104800001002672826728267282672826728
80204267272000000000000102669220121625801001008000010080015500117711604923627267272670716659616679801152008002420016004826707771180201100991008000010080000010080000043800390000800000139430011151181600267241000800001002670826728267282670826708

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526731200000411001267072180122580010108000010800005011688434923633267312673216677316702800102080000201600002673271118002110910800001080000010800000390800350358003561354305020181614112672566080000102672526732267322672326733
80024267132000004510022670721218125800101080000108000050116760549236422672226713166773167118001020800002016000026713711180021109108000010800000108000003908003903980018601843050201116131126710106080000102672326732267232671426714
800242672720100045000126719181830112580010108000010800005011669934923642267222671316871316711800102080000201600002673171118002110910800001080000010800000390800540188001860353905020111611122671966080000102672926723267322672326714
8002426722200000180001266982001625800101080000108000050116884349236422672726731166673166938001020800002016000026722621180021109108000010800000108000003908003501880018613543050201116111126710610480000102672326714267142671426732
8002426731201000250002267072180122580010108000010800005011674054923652267132671316667316693800102080000201600002672262118002110910800001080000010800000390800350358003561184305020111611132671066380000102672926714267292673226723
8002426732200000411001267071712012258001010800001080000501167605492364226713267271667631669380010208000020160000267137111800211091080000108000001080000058080039018800356154005020111613122671960380000102672326723267142672326723
8002426727201000411001267162183017258001010800001080000501167292492364826727267311667631671180010208000020160000267288111800211091080000108000001080000039080018035800180139005020111612122671006080000102672326723267232672326728
80025269092000004100022671218012122580010108000010800005011668964923662267352672616659316705800102080000201600002671981118002110910800001080000010800000390800350358001801353905020111612122672866080000102672326729267322672926714
8002426727200000600001267122181817258001010800001080000501167178492364726727267311667231671280010208000020160000267137111800211091080000108000001080000000800350398003561184305020121612122671966480000102671426714267322672326733
80024267222000004110012671318120162580010108000010800005011688434923642267222671316672316702800102080000201600002671371118002110910800001080000110800000390800390358003901394305020121611112672866380000102672326723267332673326723