Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRB (register)

Test 1: uops

Code:

  ldrb w0, [x6, x7]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
100539431045000359212016251000100010001503739839421732321000100020003747711100110001000110000103904010396039437321622391101041000375375395401395
1004394300450003590121216251000100010001406039439819732561000100020003945611100110001000110004310390391038000437321622391101001000399375375395395
1004394300451013592000251000100010001503737439421632321000100020003945611100110001000010004310390391000600447321622371101041000375375399399399
10043943004500137921212162510001000100015052394394216325610001000200039877111001100010000100043103900103961043732162239110001000395397395395399
100439430045001359211219251000100010001535339439419732521000100020003747711100110001000010004310000391000603907321622391101071000395378395375375
1004394300010137921212162510001000100014060398394216323210001000200039877111001100010000100043103900100061390732162239101041000395375395395395
1004394200451013590111925100010001000142033743742173252100010002000374771110011000100011000010390391000613907321622371101041000395375399400395
10043943004510137921210251000100010001406037439419732561000100020003947711100110001000110004310000381039610437321622371101001000398395381375375
1004394300000137921212162510001000100015182374398197325210001000200039477111001100010000100001039038103901007321621395101001000385395376375375
100437430045001379212121625100010001000140603744202193232100010002000374561110011000100011000431039001039010437321622391101451000399375399395375

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrb w0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700575250000010210170042697025971625401083010610002301001000061606833425420496708570061700576465303649604010030200100006020020000700353511402011009910010000301001000011001000121100001054100001111100261017111698143000310101010000301007005270052700527005270052
402047005454300000001100700206978259710254010430103100013010010000616014334225404966971701257005164647036495440100302001000060200200007005135114020110099100100003010010000010010000011000000010000101000026101711169814300030101010000301007005270036700527005270052
4020470051525000000010007003669782597102540100301031000130100100006160143342254149669717005470054646470364954401003020010000602002000070051351140201100991001000030100100000100100000110000000100000010000261017111698143000310101010000301007005270052700527005270052
4020470051525000000011007003669782597102540104301031000130100100006160143342254149669717005570055646470364954401003020010000602002000070051351140201100991001000030100100000100100000110000000100001010000261017111698143000310101010000301007005270052700527005270052
402047005154300000001000700366978259710254010430103100013010010000616014334225404966971700537005464647036495440100302001000060200200007005135114020110099100100003010010000010010000001000000010000001040026101711169814300030101010000301007017270438701517005270052
4020470051524000010011007002069782597102540104301031000130100100006160143342254149670737005370051646310364954401003020010000602002000070051351140201100991001000030100100000100100000110000000100001010000261017111697983000310101010000301007005270052700527005270052
4020470051525000000011007003669782597102540104301031000130100100006160143342254049669717005170051646470364954401003020010000602002000070035351140201100991001000030100100000100100000110000000100001010000261017111697983000310101010000301007005270052700527005270036
402047005152400000001000700366978259710394010430100100013010010000616014334147014966971700517005264647036495440100302001000060200200007005135114020110099100100003010010000010010000011000000010000001000026101711169814300030101010000301007005270052700527009370052
4020470059525000000010007003669782597102540104301031000130100100006160143342254049669717005570056646470364954401003020010000602002000070051351140201100991001000030100100000100100000110000000100001010002261017111698143000310101010000301007005270036700367005270052
402047005152400000001000700366978259710254010430103100013010010000616014334147014966971700547005164647036495440100302001000060200200007005135114020110099100100003010010000010010000011000000010000101000026101711169814300031001010000301007005270052700367005270052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570053525110100201017003869777597122540018300161000230010100006170093342350149669737005370114646733649784001030683100006002020000700533511400211091010000300101000011010001211000302470810000111110252027134698043000606610000300107005470054700547004270054
4002470041525100100701017002669777597122540018300161000230010100006170093341769149669737005370053646713649784001030020100006002020000700533511400211091010000300101000011010002201000401110000011110252047133698163000666610000300107005470054700547005470054
4002470053525110100201017002669702597122540018300161000230010100006170093342350149669737005370041646713649664001030020100006002020000700533511400211091010000300101000001010001201000201110000111120252137132698163000666610000300107005470054700547005470054
4002470053525100000201017003869777597122540018300161000230010100006170093342350149669737005370053646713649784001030020100006002020000700413511400211091010000300101000001010002311000300110000111120252037133698043000366610000300107005470054700427005470054
4002470041525110000200017003869777597122540014300161000230010100006170093342350149669617005370053646713649784001030020100006002020000700533511400211091010000300101000011010003111000201110000111110252037133698163000606610000300107005470054700547005470054
4002470053525110000201017003869777597122540018300161000230010100006170093342350149669737005370041646713649664001030020102196002020000700533511400211091010000300101000011010004211000300239710000110111252037122698163000666610000300107004270054700547004270054
4002470053524110100100017003869777597122540018300161000230010100006170093342350149669737005370053646713649784001030020100006002020000700413511400211091010000300101000001010002111000100110000111100252227122698163000666610000300107005470054700547005470042
4002470041524110000201017002669777597122540018300161000230010100006170093342350149669617004170053646713649784001030020100006002020000700533511400211091010000300101000001010001111000101110000111110252027132698043000666610000300107004270054700427005470054
4002470041525110100100017003869777597012540018300161000230010100006169953342350149669737005370053646713649784001030020100006002020000700533511400211091010000300101000001010001211000200110000111120252127133698163000660610000300107005470054700427005470054
4002470053525110100200017003869777597012540018300131000130010100006170093342350149669737005370053646713649784001030020100006002020000700413511400211091010000300101000001010001101000101110000010110252027144698163000666610000300107005470054700547005470054

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrb w0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 22 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057005152500000010010700366978559713254010030103100023010010000616041334147014966974700547003564631364957401003020010000602002000070035351140201100991001000030100100000100100001100000010000110261027122698173000300010000301007005570052700557003670055
4020470054525000110101070039697855969525401043010310001301001000061601433438381496697470054700546465036495740100302001000060200200007003535114020110099100100003010010000110010000110000001000010026102712269817300031313010000301007003670055700527003670036
40204700355250000001000700396978559695254010430103100013010010000616014334492614966974700357005464650364955401003020010000602002000070054351140201100991001000030100100000100100000100000010000000261027132697983000013101310000301007005570036700367005270055
40204700545250000030010700366978559713254010430103100013010010000616770334249414966974700357003564647364957401003020010000602002000070054351140201100991001000030100100000100100001100000910000110261027122698173000313101310000301007003670036700367005570055
40204700355250000000000700396978859715254010030103100003010010000616014334383814966955700357005464650364938401003020010000602002000070417351140201100991001000030100100000100100001100001010000100261027122697983000313131010000301007005570055700377005670052
40204700355250000106010700206978259695254010430103100013010010000616014334446214966955700517003564700364957401003020010000602002000070054351140201100991001000030100100000100100001100001010000110261027122697983000000010000301007005570055700557005570055
4020470035525000000000070039697645971025401043010010000301001000061627533421901496695570035700546463136493840100302001000060200200007005135114020110099100100003010010000010010000010000001000001026102712269817300031313010000301007003670036700557005570036
40204700545250001101010700396976459713254010430100100013010010000616014334315014966974700517003564650364957401003020010000602002000070054351140201100991001000030100100000100100000100003010000100271627122698173000310101010000301007005570036700527005270052
4020470054525000110101070039697645971025401003010010001301001000061604133446061496697470035700356465036493840100302001000060200200007005135114020110099100100003010010000010010000110000001000011026102712269817300000131310000301007003670036700557005570036
4020470054525000100101070036697855971025401043010010001301001000061601433451341496697470054700546465036495740100302001000060200200007005135114020110099100100003010010000010010000110000001000011026102712269817300031301310000301007003670055700557005570036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005352500000010015000070032697285970625400143001310001300101000062238233477261049669707005070035646653649724001030020100006134620000700473511400211091010000300101000001010000011000000914210000100025205357175698103000066610000300107004870036700497005170048
40024700475250000000001010070020697605969525400143004510001300101000061700933414701549669677003570047646533651844001030020100006002020000700503511400211091010000300101000001010000011000030010000100225205457157698103000366910000300107004870048700367003670048
400247005052710000000013010070032697435970625400143001310001300101000061924933430581549669707005070035646533651964001030020100006002020000700503511400211091010000300101000001010000011000000010000100025205077177698103000366010000300107003770418700397005270048
40024700475241000000021010070032697435970625400143001310001300101000061700933423501549669677005070047646653649754001030020100006002020000700473511400211091010000300101000001010000011000000010000101025205477175698103000399910000300107004870048700487004870036
40024700475251000000001010070032697285970625400143001310000300101000061700933423501549669707004770047646683649724001030020100006002020000700353511400211091010000300101000001010000011000000010000001025205457175698103000396610000300107005170051700487004870048
40024700475241000000001000070020697605970625400143001310001300101000061773833425901549669707004770047646653649724001030020100006002020000700503511400211091010000300101000001010000011000000010000100025205477177698103000369010000300107003870058700487003670036
40024700475251000011000000070035697605970925400143001010001300101000061700933424941549669677004770050646533649724001030020100006002020000700353511400211091010000300101000011010000011000000010000101025205457175697983000060910000300107004870048700487005170048
40024700505241000001001010070035697295969525400143001310001300101000061748633426861549669677004770047646653649724001030020100006002020000700353511400211091010000300101000001010000001000000010000101025205457167697983000360010000300107003670051700487004870036
40024700475250200000005235210070032697435969525400103001310001305871000061700933423501549669707003570035646653649754001030020100006002020000700353511400211091010000300101000001010000011000000010000101025205477187698103000309910000300107004970418700497005170051
40024700505250000011001000070032697605970925400143001010001300101000061700933423501549669697023370049646653649724001030020100006002020000700513511400211091010000300101000001010000011000000010000101025205457175697983000399910000300107003670048700487004870036

Test 4: throughput

Count: 8

Code:

  ldrb w0, [x6, x7]
  ldrb w0, [x6, x7]
  ldrb w0, [x6, x7]
  ldrb w0, [x6, x7]
  ldrb w0, [x6, x7]
  ldrb w0, [x6, x7]
  ldrb w0, [x6, x7]
  ldrb w0, [x6, x7]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 0e | 0f | 19 | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267282000016601126712212121625801001008000010080015500116730314923647026727267271665561667980115200800242001600482672777118020110099100800001008000011008000043800390039800396139431115118116026724010104800001002672826728267282672826728
80204267292001104501126712212121625801001008000010080015500118165214923647026727267271665561667980115200800242001600482672777118020110099100800001008000001008000043800390039800406139431115118016026724010104800001002672826728267282672826728
80204267272010004500126712212121625801001008000010080015500116863614923647026727267071665561667980115200800242001600482672777118020110099100800001008000001008000043800390039800396139431115118016026724010104800001002672826728267282672826728
8020426727200000450112671221212162580100100800001008001450011731031492364702672726707166556166798011520080024200160048267277711802011009910080000100800000100800004380039003980040610431115118016026724010104800001002672826728267282670826728
80204267272000004501126692212121625801001008000010080015500117131614923647026727267271663561667980116200800242001600482672777118020110099100800001008000001008000043800390039800396139431115118016026724010104800001002672926746267282672826728
8020426727200110450012671220121625801001008000010080014500117199314923647026707267071683361667980115200800242001600482672777118020110099100800001008000001008000043800390039800396139431115118016026724010104800001002674726728267282672826708
80204267072000004501126712212121625801001008000010080014500117258114923647026727267271665561667980115200800242001600482672777118020110099100800001008000001008000043800390039800396140431115118016126724010104800001002672826728267282672826728
8020426727200000450012671221212162580100100800001008001450011719751492364702672726727166556166798011520080024200160048267277711802011009910080000100800000100800000800390039800396039431115118016026724010104800001002672826728267282672826728
80204267272000004500126712212121625801001008000010080015500118171714923653026730267271665561665980115200800242001600482672777118020110099100800001008000001008000043800390039800396139431115118016026724010104800001002672826728267282672826728
80204267272000004500126782212121625801001008000010080015500117241814923647026727267271665561667980115200800242001600482672777118020110099100800001008000001008000043800390039800396139431115118016026724010104800001002672826728267282672826708

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3353

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526727200004510126713012116258001010800001080000501168843049236472673126728166723167118001020800002016000026727771180021109108000010800001108000043800390117800386139430502051635267281010480000102672826728267292673226732
80024268292000001012671221119258001010800001080000501174273049236482673126708166723167118001020800002016000026731771180021109108000010800001108000043800000128800386139440502021642267281414480000102672826732267092672926732
800242681620000440012671221121925800101080000108000050116720104923651267272672716676316711800102080000201600002672777118002110910800001080000010800004380000038800386139440502021643267051414080000102670926732267322672926732
80024268232000001012671621116258001010800001080000501157633149236472673126728166763167118001020800002016000026731771180021109108000010800000108000043800000125800386139440502021642267271010480000102672826728267282672826709
800242682520000441012671201119258001010800001080000501172344049236472673126728166723166888001020800002016000026731771180021109108000010800001108000043800380135800386139440502041642267051010780000102672826728267282673226732
800242685320000451012671221201925800101080000108000050116646804923647267282672816672316711800102080000201600002672856118002110910800001080000010800004480000011980038613900502021643267281014480000102673226709267332673226728
800242684420000440012671320116258001010800001080000501168520049236512670826708166723167088001020800002016000026731771180021109108000010800000108000043800390131800386039430502021624267051414780000102670926732267322673226732
8002426841200004400126693201216258001010800001080000501167172049236512670826727166723167078001020800002016000026731771180021109108000010800000108000043800000398003861390250204164226728100780000102672926732267282673226732
80024268312000001012671621119258001010800001080000501169370049236472672826727166763167118001020800002016000026731561180021109108000010800000108000043800380140800386138440502041644267241014780000102670926732267322672926728
8002426834201004400126713211192580010108000010800005011621160492365126731267311667231670880010208000020160000267317711800211091080000108000001080000438003901298003861390050204162426728010780000102673226729267322670926728