Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (unsigned offset, 32-bit)

Test 1: uops

Code:

  ldr w0, [x6, #8]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005394310010013832110251000100010001406013983981973256100010001000394771110011000100001000431000039103860394373116113710001000375399399395399
1004374300010013832101925100010001000152741394394221323210001000100039877111001100010001100001038101038613943731161139514071000375399375395399
100439820044100038320019251000100010001527413983742213232100010001000394561110011000100001000431038039100060390731161137101441000375399375402395
10043983004500013832101625100010001000150371398398221323210001000100039877111001100010001100001038039100001044731161137114001000395399399399399
1004398210000013832121192510001000100014060139439822132561000100010003747711100110001000010000103800103861380731161139114071000399399399395395
10043743104510003832121192510001000100015208139839819732521000100010003745611100110001000010004310380010000038447311611371141471000399399399395375
10043743000000135920119251000100010001406013983982173256100010001000394771110011000100011000010000381039003807311611395141401000375375395375395
1004398300000013592111925100010001000140601398374197325610001000100039877111001100010000100043100003810386100731162139501471000399375375395375
100437430044100137921212192510001000100014989139839821632561000100010003745611100110001000010004310380010386139437311611371141441000399375399395395
100437431145100037921116251000100010001498913983742213256100010001000374561110011000100001000010380010386038437311611391141471000399375399375375

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6, #8]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700595261000101971070020698055985025401083010610002301001000061604133423981496717970054700546465036496340100302001000060200100007006035114020110099100100003010010000110010001201000210110000101000026102711269820300061310010000301007005570055700527003670052
4020470051524000000100700396979159716254010430106100013010010000616095334268604966980700607006064637364963401003020010000602001000070057351140201100991001000030100100001100100000110000000100000010000261017111698173000313101310000301007006470058700617004270062
40204700415251110001007004269782596952540104301001000030100100006160413341470049669747005170051646313649574010030200100006052810000700353511402011009910010000301001000001001000211100020111000011111002610171116982330006130010000301007005770036700557005570055
40204700515250000001007003969702597192540108301061000230100100006160783342686049669807006070060646563649634010030200100006020010000700603511402011009910010000301001000001001000000100000001000010000002610171116981730003013010000301007004770061700617004270061
402047006052510000011007004569764597154940104301031000130100100006160143342398049669557005470035646503649384010030200100006020010000700543511402011009910010000301001000001001000221100020111000001011002610171116980430006130010000301007005270115700577011770055
40204700545240000006007002069805597192540108301061000230100100006160683341769049669807004170041646373649634010030200100006020010000700413511402011009910010000301001000011001000001100000001000010000002610171116981730003001310000301007006270067700617006170042
4020470041525111000200700456978759695254010430100100003010010000616041334239804966955700557003564631364938401003020010000602001000070038351140201100991001000030100100000100100022110001331100001101100261017111698233000313131010000301007005270055700557005270036
4020470054524010000010700396970259701254010830106100023010010000616095334176904966980700607006064656364963402943020010000602001000070041351140201100991001000030100100000100100000010000000100001010000261017111698143000313131310000301007006170058700587006170061
40204700605251000012007004269764597132540104301031000130100100006160143342254049669747005470051646473649384010030200100006020010000700353511402011009910010000301001000011001000111100010011000001010002610171116982330006130010000301007005270036700367003670055
402047005452500000010070020697025971925401083010310001301001000061609533417690496696170041700576465636501840100302001000060200100007005735114020110099100100003010010000010010000011000000010000101000026101711169798300000101310000301007006670061700617006170058

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0065

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005352500000011007003269728597062540010300131000230010100006169793342062149669557005370047646650364980400103002010000600201000070041351140021109101000030010100001101000001100000022510000101000252137133698103000360610000300107004870048700517004870054
400247010852411100020017003869777597122540018300161000230010100006169883342062149669677005370053646710364972400103002010000600201000070047351140021109101000030010100000101000001100000024310000100000252137133698103000366610000300107003670036700487003670036
400247014052500000001007002069728597062540014300131000430010102556170513341566149669677004770035646650364966400103002010000600201000070047351140021109101000030010100000101000211100020024310000101000252137134698103000366010000300107003670036700367004870048
400247012352500000001007003269728597062540014300131000230010100006170633342350149669617005370053646710364972400103002010000600201000070047351140021109101000030010100000101000001100000021610000000000252137133698103000306610000300107004870048700487004870087
400247004852500001101007003269728596952540014300131000230010100006171173342062149669677003570047646590364978400103002010000600201000070053351140021109101000030010100000101000001100000025210000101000252147133698443000360610000300107004870048700487004870048
400247013752500000011007002069728597062540014300131000230010100006202273342686149669677005370047646650364978400103002010000600201000070053351140021109101000030010100001101000111100000022810000101000252137133698163000606610000300107005470054700547005470054
400247014752610001111007003869777597122540018300161000230010100006170183342734149669737005370053646710364966400103002010000600201000070047351140021109101000030010100001101000001100000022810000101000252137133698103000366610000300107003670048700487004870048
4002470129525000000110070032697285970625400103001310001300101000062619333423981496696770053700536465503649784001030020100006002010000700473511400211091010000300101000001010000001000010010000101000252137134698163000666610000300107005570057700427005470054
400247013852510000020017003869777597012540018300161000230010100006170763342350149669737005370047646590364978400103002010000600201000070047351140021109101000030010100000101000000100000024610000001000252137133698103000366610000300107004870048700487004870036
400247012952500000020017002669777597122540018300161000230010100006170453342350149669677004170047646650364978400103002010000600201000070047351140021109101000030010100000101000001100000020110000101000252137133698103000366610000300107004870036700487004870036

Test 3: throughput

Count: 8

Code:

  ldr w0, [x6, #8]
  ldr w0, [x6, #8]
  ldr w0, [x6, #8]
  ldr w0, [x6, #8]
  ldr w0, [x6, #8]
  ldr w0, [x6, #8]
  ldr w0, [x6, #8]
  ldr w0, [x6, #8]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526738200000006600267213774925801001008000010080000500117098114923656267142673616849316672801002008000020080000269828511802011009910080000100800000100800192043800580110147800396019430015110116132673710104800001002672826708267082672826732
8020426731201000004510267120120025801001008000010080000500116652514923627267072672716630316685801002008000020080000267275611802011009910080000100800000100800000080039026038003961000015110116112670410104800001002672826728267282672826830
802042673620000000010267120120162580100100800001008000050011776701492362726727267271665031666580100200800002008000026727561180201100991008000010080000010080000043800390104280000603900005110116112672810100800001002672826708267082670826815
802042673420000000450126712212016258010010080000100800005001172729149236272672726707166503166858010020080193200800002672777118020110099100800001008000001008000000800390206380000611943190051101161126724004800001002672826728267282670826716
80204267272000000045012671220121625801001008000010080000500116578914923647267272672716650316665801002008000020080000267275611802011009910080000100800000100800000080000020618003960594319005110116112673313135800001002673826715267382671526739
802042674020011000450126692000225801001008000010080000500116652514923647267072672716630316685801002008000020080000267075611802011009910080000100800000100800000438000000042800390139440005110116112670410100800001002672826728267282672826708
802042672720000001450026712000025801001008000010080000500116652514923647267272670716650316685801002008000020080000267075611802011009910080000100800000100800000438003901042800396139000051101161126704004800001002670826708267282672826735
802042672720100000440126712000162580100100800001008000050011672311492364726707267071665031666580100200800002008000026707771180201100991008000010080000010080000043800000503800000058019005110116212673413135800001002673726715267152671526714
802042673420000000011267122001625801001008000010080000500116716214923627267072672716650316665801002008000020080000267075611802011009910080000100800000100800000438003903704580000613900005110116112670410107800001002670826728267282672926733
80204267272000000045102673020121625801001008000010080000500116723114923627267272672716630316685801002008000020080000267077711802011009910080000100800001100800000080039000908000000594319005110116112671113135800001002673726737267372673726724

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267362001101111006510326722377242580010108000010800005011717641492365626715267371668131671680010208000020800002671485118002110901080000108000001080019204308005810160800406019431915020025161224267121313580000102674226716267372673726737
8002426736200011010000690022672237721258001010800001080000501167106149236562673626736166823166958001020800002080000267368511800211090108000010800000108002119008005900161800406119431905020021162122267331313580000102674426746268542673826737
8002426736200011110000660032672130720258001010800001080000501167298149236352673626714166603167178001020800002080000267158511800211090108000010800001108002019430800591006180000001943191502002516262626711013580000102674626750267382671726715
8002426714200011010100660022670037760258001010800001080000501167219149236352673626715166603167178001020800002080000267368511800211090108000010800000108002120430800591002180040615943192502002616252626733013580000102674326737267152671526805
8002426715200010110000211022672220019258001010800001080000501166960149236562671526737166813167168001020800002080000267366411800211090108000010800000108001919430800190006180040005843191502001916251226734130580000102674326819267402674826715
8002426736200011100000211032669930024258001010800001080000501166960149236572671526736166833166988001020800002080000267366411800211090108000010800000108002019430800591016480039601943191502002516252526711013080000102673626715267382673726716
8002426715200010000000670032672130720258001010800001080000501173975149236562673726736166813167168001020800002080000267378511800211090108000010800000108002019430800591016080042611943192502002716152626735130080000102675126737267162673726716
800242671520001101000067103267223071925800101080000108000050117397514923656267372671416659316716800102080000208000026737851180021109010800001080000010800211945080019102218004060590191502002016241226711013580000102672426737267372673726715
8002426738200010010000670012672220002580010108000010800005011682861492365726736267371668131671680010208000020800002671585118002110901080000108000011080019194308005910061800420159431905020013162624267331313080000102674726741267282671626737
8002426736200011011100660002672137020258001010800001080000501166960149236572673626737166813167188001020800002080000267368511800211090108000010800000108001919430800590012180040611901905020016162613267111313580000102673326716267162673826737