Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, uxtw, 64-bit)

Test 1: uops

Code:

  ldr x0, [x6, w7, uxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005381211450033792121216251000100010001504439439421732521000100020003947711100110001000100043103974810396139437741644391101041000395395395395395
1004394200450033792121216251000100010001498939439421732521000100020003947711100110001000100043103903910396139437741644391101041000395395395395395
100439430045003379212920251000100010001498939739421732521000100020003947711100110001000100043103904510396139437741644391101041000395395395395395
1004394300450033812121216251000100010001501839439421632521000100020003947711100110001000100043103903910396139437741644391101041000395395396395395
10043942004500337920716251000100010001501839439421632521000100020003947711100110001000100043103903910396139437741644391101041000395395395395395
100439430045003379212716251000100010001501839439421632521000100020003947711100110001000100043103803810386139447741644395141071000399399399399399
100439830044003383212719251000100010001527439439822132521000100020003987711100110001000100043103803810386138447741644395141471000399399399399399
1004398300440033832171925100010001000153653943982213256100010002000398771110011000100010004310380010386138447741644396141471000399408399399395
10043983004400338321919251000100010001527439839822132561000100020003987711100110001000100043103903810386138447741644395141471000399395399399399
10044033004400338321719251000100010001520839839822132561000100020003987711100110001000100043103803810386139447741644395101471000399395395399399

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr x0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0056

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700565251001110002880007003869787597152540208301061000530267100006208563342350149684247147071191651911396587042858328371087465088217467150435151402011009910010000301001000011001004040100491035924100481111003003219133711453014196910000301007162771648714947150571611
402047161155511002101616225814961007184570741603195124039430308100653300410765658569339397314968881718687132565402215660284500234462110366839922658722113524140201100991001000030100100000100100674110064033817110000010131261027122711863011396610000301007151271499715147150971509
40204713335341111100116212013200007150970159602252540104301061000230100100006160323342494149669737004170041646371265114401003020010439637642164671413351414020110099100100003010010000010010002111004916225210000010110261021722698163000696010000301007005770042700547005770057
40204700565251011111001000070026697875971525401043010310002301001000061605933424940496697670056700566465236495940100302001000060200200007005335114020110099100100003010010000010010001101000220110000011100261027122698163000690610000301007005770042700577005470057
40204700415241000100002001170041697875971525401083010610002301001000061603233423501496697370041700546465336496140100302001000060200200007004135114020110099100100003010010000010010001101000201110000111110261027122698163000366910000301007004270057700547007970048
40204700535251011110002001070041697975971925401083010310002301001000061607833423501496697670041700536463736495940100302001000060200200007005635114020110099100100003010010000010010002111000100110000111100261027122698193000600010000301007005770057700547004270042
4020470041525100110100593000070041697875970125401043010310002301001000061631133451821496696170056700566463736495940100302001000060200200007005635214020110099100100003010010000110010002111000301110000111110261027122698193000696010000301007005770057700577005770057
40204700565251000010001001070041697875971525401083010610002301001000061603233457581496697670056700566465236495940100302001000060200200007004135114020110099100100003010010000010010002111000122110000111110261027122698193000609910000301007005470057700427004270057
4020470041525100110000586001070041697875971525401083010310002301001000061603233477261496697670056700536463736495940100302001000060200200007004135114020110099100100003010010000010010002111000102410000010100269127122698163000669010000301007004270042700427005770057
40204700565421001001002000070026697025971525401043010310002301001000061603233464731496696170053700416464936495940100302001000060200200007005635114020110099100100003010010000110010002211000100110000111100261027122698043000309910000301007005970054700427005470057

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40025700475251121107002069728597062540014300131000130010100006169523342062154966967070035700476466536497440010300201000060020200007004735114002110910100003001010000010100001100000010000110025200067145698103000306610000300107004870075700487004870048
40024700475250001107003269743597062540014300131000030010100006170683341470104966967070047700476466536500940010300201000060020200007004735114002110910100003001010000010100001100000010000010025200067136698103000366610000300107004870048700367004870036
40024700475250011007002069728597062540010300131000130010100006169523341470054966967070035700476465336504640010300201000060020200007003535114002110910100003001010000010100001100000010000110025205357135698103000366610000300107004870048700487004870048
40024700475250001107003269728597062540014300131000030010100006169523342062154966955070047700476466536499040010300201000060020200007003535114002110910100003001010000010100001100000010000110025205347174698103000360010000300107004870048700487003670048
40024700475240001107003269728597062540014300131000130010100006169523342062154966967070047700476466536500340010300201000060020200007004735114002110910100003001010000010100001100000010000100025205327133698103000366610000300107003670048700487003670036
40024700475250000007003269743597062540010300131000130010100006170683342062154966967070047700356465336501140010300201000060020200007004735114002110910100003001010000010100001100000310000110125200057144698103000066610000300107004870048700487004870048
40024700475250001107003269743597062540014300131000030010100006169523342062104966968070047700476466536500940010300201000060020200007004735114002110910100003001010000010100000100000010000010025200057153698103000060010000300107004870048700367004870048
40024700355250001107003269728597062540014300131000130010100006170683342062104966967370047700476466536498740010300201000060020200007004735114002110910100003001010000010100001100000010000110025205347143697983000366610000300107004870036700487004870036
40024700475240000107003269728597062540014300101000130010100006170683342062154966967070047700476465336503340010300201000060020200007004735114002110910100003001010000010100000100000010000110025205337144698103000306610000300107004870048700487004870036
400247004752500081107003269728597062540014300131000030010100006169523341470154966967070047700476466536498740010300201000060020200007004735114002110910100003001010000010100001100000010000110025205347154698103000366610000300107004870048700487004870048

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldr x0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570057525000011554000700426970459702254010430103100023010010000616811334238114966981070059700416467336496740125302001000060200200007006035114020110099100100003010010000010010001111000200110000101000263717111698233000313101310000301007006170061700617006170061
40204700605250000106000700396978559713254010430103100013010010000617076334407304966980070060700606466336496340100302001000060200200007006035114020110099100100003010010000010010002111000201110000101000261017111698173000310101310000301007005270055700557005570055
40204700545251011001410170045697715976275401083014710001302561000062982133452300496695507005770056646543649574010030200100006020020000700543511402011009910010000301001000001001000000100000001000011110026101711169823300061301310000301007006170062700617006170042
402047004452500000113000700396976759713254010030103100013010010000617176334489404966980070060700606463736496340100302001005660200200007006035114020110099100100003010010000010010000011000040310000111110261017111698203003110101010000301007006170061700617005870061
40204700575240100001000700366978559713254010430103100013010010000616068334782214966977070041700606465636496040100302001000060200200007006035114020110099100100003010010000010010001201000101110000001000261017111697983000313131310000301007005570055700367005570055
402047005452511110020017004569791597162540108301061000230125100006167083344702049669740700547009764647364964401253020010000602002000070035351140201100991001000030100100000100100000110000000100001010002610171116981430003010010000301007005270036700527003670036
40204700355251010001100700266978859724254010430106100023010010000617473334479804966955070035700516464736495740125302001000060200200007003535114020110099100100003010010000010010000011000000010000111110261017111698203000310101010000301007006170058700587005870058
40204700605250100001100700396978559710254010430100100003010010000616068334555014966955070035700546465736495740100302001000060200200007005436114020110099100100003010010000010010003001000000310000111110261017111698043000613101010000301007006170042700617006170042
402047006052501001000007003969764597642540100301031000130100100006160683345849149669770700417006064656364963401003020010000602002000070060351140201100991001000030100100000100100021110002011100001000002610171116982330003130010000301007006170042700617006170058
40204700605240100001100700396978259713254010430103100013010010000616068334622204966974070035700516463836495440125302001000060200200007005335114020110099100100003010010000010010000011000000010000111100261017111698203000610131010000301007006170042700617004270061

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0049

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40025700475250021007007369728597062540014300131000130010100006169973342062149669670700477004764665364972400103002010000600202000070047351140021109101000030010100000101000011000000100001100252027157698103000366610000300107004870048700487004870048
4002470052524100100700326972859706254001430013100013001010000616970334206204966967070047700476466536497240010300201000060020200007004735114002110910100003001010000010100001100000165100001100252047144698103000306610000300107004870048700487004870048
40024700505250011007002069728596952540014300131000130010100006169523342062149669670700477004764653364972400103002010000600202000070047351140021109101000030010100000101000011000006100001100252047136698103000366610000300107004870048700487004870048
400247004952500110070032697285969525400143001310001300101000061695233420620496696707004770047646653650194001030020100006002020000700473511400211091010000300101000001010000110000084100001000252027134698103000366610000300107004870048700367004870048
40024700515250011007003269728597062540014300131000130010100006169523342062049669670700477004764665364960400103002010000600202000070047351140021109101000030010100000101000011000009101100031100255647134698103000366610000300107003670053700487004870048
40024700485240011007003269728597062540014300131000130010100006169523341470149669670700477004764665364972400103002010000600202000070047351140021109101000030010100000101000011000003100001100252047134697983000366610000300107004870048700487004870048
400247004852400110070032697285970625400143001310001300101000061695233420621496696707004770047646653649724001030020100006002020000700473511400211091010000300101000001010000110000081100001100252047143698103000366610000300107004870048700487004870048
40024700495250011007003269728597062540014300101000130010100006169523342062149669670700477004764665364972400103002010000600202000070047351140021109101000030010100000101000011000003100001100252047142698103000366610000300107004870048700487004870048
40024700495250011007003269728597062540014300131000130010100006169523342062149669670700477004764665364972400103002010000600202000070035351140021109101000030010100000101000011000003100001100252047142698103000366610000300107004870048700367004870048
40024700525240011007003269728597062540014300131000230010100006169523342062149669670700477004764665364972400103002010000600202000070047351140021109101000030010100000101000011000309056100001100252047134698103000366610000300107004870048700487004870048

Test 4: throughput

Count: 8

Code:

  ldr x0, [x6, w7, uxtw]
  ldr x0, [x6, w7, uxtw]
  ldr x0, [x6, w7, uxtw]
  ldr x0, [x6, w7, uxtw]
  ldr x0, [x6, w7, uxtw]
  ldr x0, [x6, w7, uxtw]
  ldr x0, [x6, w7, uxtw]
  ldr x0, [x6, w7, uxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3353

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267362001100045100267122121216258010010080000100800155001167303149236472672726727166556166798011520080024200160048267277711802011009910080000100800000100800000080039042800396139431115118316322672410104800001002672826728267282672826728
8020426732200110004500126712212121625801001008000010080015500116659604923647267272672716655616723801152008002420016004826727771180201100991008000010080000010080000043800390380039013901115118316332672410100800001002679426728267282670826728
8020426736200110004510126712012121625801001008000010080015500117755004923647267272672716655151667980119200800242001600482672777118020110099100800001008000001008000004380039045800006139431115118316332672410104800001002672826728267282672826728
80204267272001100045101267122121202580100100800001008001650011677880492362726727267271665561667980114200800242001600482672777118020110099100800001008000001008000004380039011780039603943111511831632267241004800001002672826728267282672826728
8020426727201110004510126692212121625801001008000010080016500116748104923627267272672716635616679801152008002420016004826707771180201100991008000010080000010080000043800000132800396104311151183163326724004800001002670826728267082672826728
802042683620011011451012669220121625801001008000010080015500116429814923647267072670716655616659801152008002420016004826707771180201100991008000010080000010080000043800391120800336139431115118216332672410104800001002672826728267282672826728
802042685320011000451012671221212162580100100800001008001650011671230492362726727267071665561667980114200800242001600482672777118020110099100800001008000001008000004380039012680039603943111511831632267241000800001002672826728267282672826728
8020426845200110004510126712212121625801001008000010080015500116722204923647267272686816645916666801212008003020016006026727771180201100991008000010080000010080000043800390138800396139432225129423552672410104800001002672926728267282672826729
8020426728200110004510126712201216258010010080000100800195001171109049236282672726727166451016667801212008003020016006026727771180201100991008000010080000010080000043800000123800396139432225128523552672410104800001002672926728267282672826729
80204268362001100046101267122121202580100100800001008002050011648301492364726728267081662591666680121200800302001600602670877118020110099100800001008000001008000004380039012080039613943222512952355267241004800001002672826709267292672826728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267232000000000410102267122012525800101080000108000050116675049238012670826727166523166888001020800002016000026708711180021109108000010800000108000000080035139800396100050201161126705100280000102672926971267222686826728
8002426727200010010045000226712201816258001010800001080000501166886492365126723267281665231670780010208000020160000267087111800211091080000108000001080000000800393398000061043050201161126724100280000102672826741267332675026742
80024267272000000000450002267120181812925800101080000108000050116749149236282670826708166523167028001020800002016000026708771180021109108000010800001108000003908003900800006035390502011611267191010480000102672926983267352682826728
80024267082000000000000022669321818192580010108000010800005011676054923628267222670816652316708800102080000201600002672756118002110910800001080000010800000390800004080039603500502011611267251010480000102672826955267352687526729
8002426728200000000080010026712212122074800101080000108000050116675049236472670826722166523168228001020800002016000026722711180021109108000010800000108000000080000042800356039005020116112671906080000102670926967267372683626728
8002426880201000000045000226693218121625800101080000108000050116707749236482672826727166723166888001020800002016000026734711180021109108000010800000108000000080000139800006000050201161126728010480000102670926980267412679226709
80024267272000000000630001267162180425800101080000108000050116699349236462673126884166673167078001020800002016000026722561180021109108000010800000108000043908003900800006003905020116112672506480000102672826921267642673126721
8002426727200000000000002267160018832580010108000010800005011688434923642267222672216652316702800102080000201600002670872118002110910800001080000010800000008003503980035603543050201161126724010280000102670926867267702673626741
8002426708200000010000102267132181211625800101080260108000050116699349236422672226722166523166888001020800002016000026712561180021109108000010800000108000000080035008003960350250201161126705106480000102672926986267392683926728
80024267272000000100900022669321818152580010108000010800005011688434923648267222672216672316688800102080000201600002672856118002110910800001080000010800000390800350398003960043050201161126705610080000102672926987267192680326728