Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, 32-bit)

Test 1: uops

Code:

  ldr w0, [x6, x7]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005402211451013832121192510001000100015274039839822132561000100020003947711100110001000100043103803810386139447311611391141071000395399395399399
100439531145001383211192510001000100015208039839822132561000100020003987711100110001000100043103803810386139447311611395141471000399399399399399
1004398311450013832112192510001000100015037139439822132561000100020003947711100110001000100043103803810386139447311611395141471000399399399399399
100439830045101383211192510001000100015274039839821732561000100020003987711100110001000100043103803810386138447311611395141471000395399399399399
100439830044001383211192510001000100015208039839822132561000100020003947711100110001000100043103803810386138447311611395141471000399399399399399
1004398300451013822121192510001000100015267139839822132561000100020003987711100110001000100043103803810386138447311611395141471000399399399395395
100439820044101382211192510001000100015274039839821732561000100020003947711100110001000100043103903910386138447311611395141471000399399399399399
100439820044001383211192510001000100015274139839822132561000100020003987711100110001000100044103803910396138447311611391141471000399399399399399
100439430047101383211192510001000100015274039839822132561000100020003987711100110001000100044103833810386138447311611391101041000399395395395399
100439830044001383211192510001000100015267139839821632591000100020003987711100110001000100044103803810386138447311611395141471000399399399399399

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057005352400100007002069764597092540104301031000130100100006161753342206049669700700507005064646364953401003020010000602002000070050351140201100991001000030100100000100100000100000001000011261017111698133000360910000301007005170036700367004870051
402047005052400110107003269764597092540100301031000130100100006161753341470049669700700477004764631364950401003020010000602002000070050351140201100991001000030100100000100100000100000001000001261017111698133000360910000301007005170036700517004870036
402047005052400000107002069764596952540104301031000030100100006160153341470049669670700507003564646364938401003020010000602002000070050811140201100991001000030100100001100100001100000001000011261017111698133000366910000301007005170048700517005170036
402047004752500000107003569781597092540104301031000130100100006160053342206149669700700507003564631364938401003084710000602002000070047351140201100991001000030100100000100100001100000001000301261017111697983000006910000301007005170051700517005170036
402047003552400000007002069781596952540104301031000130100100006160053341470049669700700507005064631364950401003020010000602002000070047351140201100991001000030100100000100100001100000001000011261017111698103000300010000301007005170051700517005170036
402047005052500001007002069735597062540104301031000130100100006160053342206049669710700357005164631364941401003020010000602002000070050351140201100991001000030100100000100100001100001001000011261011711697983000390910000301007003670051700517004870051
402047005052500000007002069781597062540104301031000030100100006160053341470049669550700477005064646364953401003020010000602002000070050351140201100991001000030100100000100100000100000001000010261017111697983000006010000301007003670051700517005170051
402047005052500001007006169781597092540100301001000130100100006160053342062049669700700507005064643364953401003020010000602002000070035351140201100991001000030100100000100100001100000031000011261017111697983000399910000301007004870048700487003670050
402047004752400006007002069764597092540104301001000030100100006160503342206049669700700507009864643364956401003036510054602002000070054351140201100991001000030100100000100100000100000001000311261017111697983000399910000301007005170036700517005170036
402047004752500001107003569764596952540104301031000130100100006160053342206049669700700507005064646365034401003020010000602002000070047351140201100991001000030100100001100100001100000001000010261017111697983000066010000301007004870036700517003670048

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570047524000011070032697285969525400143001010002300101000061695233414701049669677005070035646653649724001030020100006002020000701603511400211091010000300101000001010000011000000100001000252000017132698133000396610000300107004870048700517004870048
4002470047524000010070035697285969525400143001310001300101000061698233414701049669707005070050646653649604001030020100006002020000700473511400211091010000300101000001010000011000000100001010252000017121698133000360610000300107004870036700487005170036
400247004752400001091070032697485970925400143001310001300101000061696133420621049669677004770047646653649724001030020100006002020000700473511400211091010000300101000001010000001000000100000000252000027121697983000366610000300107004870036700367004870048
4002470047525000010070020697285970625400143001310001300101000061695233420621049669677003570035646653649724001030020100006002020000701513511400211091010000300101000001010000011000000100001010252000017132698103000360610000300107004870048700367004870048
4002470035525000010070032697605970925400143001310001300101000061698233420621049669557003570047646653649724001030020100006002020000700473511400211091010000300101000001010000011000000100001010252000017112698103000360010000300107005470049700487004870048
4002470035525000010070032697285970642400143001310000300101000061695233420621049669677004770047646543649724001030020100006002020000700473511400211091010000300101000001010000011000000100001000252000017121698103000366010000300107004870048700487004870048
4002470050524000060070032697295970625400143001310000300101000061698233414701049669677004770047646653649604001030020100006002020000701133511400211091010000300101000001010000011000000100000010252000017112698133000366610000300107005170048700487005170048
40024700355240000100700206972859706254001430010100013001010000616952334206210496697070047700476466836498540010300201000060020200007012035114002110910100003001010000010100000010000024100001000252000027112698103000306910000300107004870051700367004870036
4002470035524000010070032697285970625400143001010001300101000061706833422061049669557004770047646683649604001030020100006002020000701513511400211091010000300101000001010000001000000100001010252000027111698133000306610000300107004870036700517004870099
40024700355250000110700326972859706254001430010100003001010000617176334206210496697070090700356466836497240010300201000060020200007004735114002110910100003001010000010100000110000036100000000252000027123698133000366010000300107004870048700487003670048

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570051525001001210700206978259713254010430103100023010010000616041334225414966974700357003564650364950401003020010000602002000070054351140201100991001000030100100001100100032010002520010000101002610171116979830000010010000301007005270055700367003670036
402047005152400000001070036697645969525401043010010001301001000061604133423981496695570054700546463136504940100302001000060200200007003535114020110099100100003010010000010010000011000000010000101002610171116981430003010010000301007005270052700527003670036
4020470035525000000100700396978559713254010030103100013010010000616175334225404966974700547005464631365022401003020010000602002000070035351140201100991001000030100100000100100000110000000100001010026101711169880300030101310000301007005770055700527003670055
40204700355240000000007003969785596952540104301031000130100100006161753341470149669747005470054646313650404010030200100006020020000700353511402011009910010000301001000001001000001100000001000000100261017111697983000310131010000301007005270052700557005270055
402047005452400000011070039697855971025401003010010000301001000061604133414700496695570035700356465036500640894302001000060200200007005935114020110099100100003010010000010010000011000000010000000002610171116979830000001310000301007005570036700557005570055
4020470054524000000110700366978259713254010030103100013010010000616175334239814966955700547003564631365039401003020010000602002000070051351140201100991001000030100100000100100000110000000100001010026101711169814300031301310000301007003670055700557005270055
4020470054525000000110700436978559713254010430103100013010010000616041334239814966971700547005464631365050401003020010000602002000070035351140201100991001000030100100000100100000110000000100001010026101711169817300030101310000301007005570052700557003670055
402047003552500000041070036697825969525401043010310001301001000061617533423981496697470035700546463136503940100302001000060200200007003535114020110099100100003010010000010010000011000000010000101002610171116981430003001310000301007003670036700367005570055
40204700515250000001107009569785597132540104301031000030100100006160413342254149669747005470035646313650684010030200100006020020000700543511402011009910010000301001000001001000001100000001000000100261017111697983000313101310000301007005270055700527003670036
4020470051525000011100700396978559713254010030100100003010010000616086334263814966974700547005464647365038408923020010000602002000070035351140201100991001000030100100000100100000110000000100000010026101711169798300000131310000301007005570055700367005570036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570206524111010117600017003269728597062540014300131000130010100006169523342206049669687003570047646683649724001030020100006002020000700503511400211091010000300101000011010000110000001000011000252017121698133000396910000300107005170036700517004170048
400247004752500011013010007003469760597062540014300131000130010100006169523342062049669557004770035646533649604001030020102206002020436700473511400211091010000300101000001010000110000001000011100252017111698133000300910000300107005170051700517003670036
40024700505250001001010007003269728597412540014300131000130010100006171043342062049669677003570050646683649724001030020100006002020000700473511400211091010000300101000001010000110000001000011000252017111698103000369910000300107004870048700487003670048
40024700475250000001000007003569743597062540014300131000130010100006169823342062049669677005070035646533649724001030020100006002020000700503511400211091010000300101000001010000110000001000001000252017111698103000099910000300107003670051700367004870036
40024700475250000001010007003269728596952540014300131000130010100006169523345602049669557005070090646653649824001030020100006002020000700503511400211091010000300101000001010000110000031000000000252017111698103000396910000300107005170036700797004870036
40024700585250000001000007002069760596952540010300101000130010100006169823342206049669557004770047646683649724001030020100006002020000700473511400211091010000300101000001010000110000001000011000252017111698103000309610000300107003670051700517004870048
40024700475240001100000007002069760597092540014300131000130010100006169823342206049669677005070050646683649754001030020100006002020000700473511400211091010000300101000011010000110000101000010000252017111698133000396910000300107004870048700487005170051
40024700505240000001000007003269728597062540014300101000130010100006169523342062049669677004770047646533649724001030020100006002020000700473511400211091010000300101000001010000110003101000011000252027111698103000090010000300107003670051700487004870048
40024700475240000006010007003269760597062540014300131000130010100006169823341470049669557003570050646683649604001030020100006002020000700473511400211091010000300101000001010000110000001000011000252027121697983000366010000300107004870048700367004870048
40024700355240000001010007002069728597062540014300131000130010100006192453342062049669677004770035646653649724001030020100006002020000700503511400211091010000300101000001010000110000001000010000252017111698103000366610000300107005170036700517004870048

Test 4: throughput

Count: 8

Code:

  ldr w0, [x6, x7]
  ldr w0, [x6, x7]
  ldr w0, [x6, x7]
  ldr w0, [x6, x7]
  ldr w0, [x6, x7]
  ldr w0, [x6, x7]
  ldr w0, [x6, x7]
  ldr w0, [x6, x7]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526732200111007710326718218181258010010080000100800165001175101049236522730726725166596166888011620080024200160048267378711802011009910080000100800000100800192042800571005980038615742193111511801600267110992800001002673326733267332673326715
80204267322001110065003267172180152580100100800001008001450011674931492363926736267361666061668480116200800242001600482673281118020110099100800001008000001008002119080019101598003861570190111511801600267290992800001002673426734267152673326715
8020426732200110006500326718218180258010010080000100800155001167579049236572672426737166426166668011620080024200160048267148211802011009910080000100800000100800202042800571002180038015742190111511801600267310902800001002673526715267152671526715
8020426714200100006400126699018181825801001008000010080016500116659014923654267362673416659616684801152008002420016004826714811180201100991008000010080000010080019194280057102228003801570190111511801600267290600800001002673326715267332673326733
802042671420010100650022669901801525801001008000010080016500116776314923655267372675016660616666801152008002420016004826732821180201100991008000010080000010080019200800571015980038605742192111511801600267290992800001002673326733267152673326733
802042673220011100650022671821800258010010080000100800155001167250049236552673226732166666166848011520080024200160048267148111802011009910080000100800000100800202140800571015880000605742190111511801600267290992800001002671626715267342673426733
80204267322001111011000326699218181258010010080000100800155001167763049236352674826745166606166858011520080024200160048267326411802011009910080000100800000100800191942800560005980000615642190111511801600267470092800001002673326733267332673326733
8020426732200110006510226700201816258010010080000100800225001166730149236572673426749166501016654801222008003020016006026714641180201100991008000010080000010080020204280019000598003800570190222512912311267110902800001002673326733267332673326822
802042675420111000211022670001801625801001008000010080022500116727414923654267342672416650916654801212008003020016006026732821180201100991008000010080000010080020204280019000598003861570191222512812311267110900800001002671526733267332673326733
802042671520011001650032671721818152580100100800001008002150011692780492363926735267321663210166548012020080030200160060267328111802011009910080000100800000100800191942800191005980000601942191222512912311267290992800001002673326715267332673326733

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252673620110110790032672137920258001010800001080000501167791049236572691226802166653167178001020800002016000026736641180021109010800001080000010800212043800581016180040615943190050201616617267331313580000102673826738267162673826737
80024267362001100085013267213771925800101080000108000050117397504923656268792684016713316716800102080000201600002671564118002110901080000108000001080019204380059101618003960594319005020171681726733013580000102673726716267162673726737
800242673620110100930122672137720258001010800001080000501167219149236352671526798166683166948001020800002016000026736851180021109010800001080000010800191943800591006180039615843190050201416717267341313580000102671526737267372673726737
800242673620010100910132672227712580010108000010800005011682830492363426737267831666531671680010208000020160000267368511800211090108000010800000108002020458005900161800006159431900502081681726733130580000102673826737267162673826737
800242673620011000210112670027002580010108000010800005011672981492365626736268311668631669480010208000020160000267368611800211090108000010800000108002019438005810163800396058019005020816617267331313080000102673726738267372671626737
8002426714200100007901226721370192580010108000010800005011739750492365626736267151668131680080010208000020160000267368511800211090108000010800000108001920438005900061800396119019005020616178267331313580000102673726737267162673826737
8002426736200101006701226721300192580010108000010800005011739750492365626737267971668931671780010208000020160000267368511800211090108000010800000108001919438006010161800416158431900502014161717267341313580000102673726737267372673726716
80024267372011000034500026699377025800101080000108000050117397504923656267372677716692316695800102080000201600002673685118002110901080000108000001080020200800590016180000615843190050201716178267341313580000102673726739267372673726716
800242673620011100940132672137712580010108000010800005011682860492363526736268321668931671680010208000020160000267368611800211090108000010800000108001920080019001618000061580190050201716178267341313580000102671626737267392673726742
8002426737200111102910032672137019258001010800001080000501174119049236562673726736166823168508001020800002016000026736851180021109010800001080000010800202043800591006180039601943190050201716171726733130580000102671526737267372673726870