Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, lsl, 64-bit)

Test 1: uops

Code:

  ldr x0, [x6, x7, lsl #3]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e22243a3f4346494f51schedule uop (52)schedule ldst uop (55)dispatch ldst uop (58)simd uops in schedulers (5a)606d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)f5f6f7f8fd
100539921100065100385218211625100010001000153310399400222325810001000200040082111001100010000102019010601016510413160421917321611397101031000401400401401400
10043993100006510338521818025100010001000154101400399204325810001000200039964111001100010000101919421059101211000311942190731161139701031000402400401401382
10044003111016510138521801725100010001000154230400400222325810001000200040082111001100010000102120010191012110433119421917311611378101031000401382383401401
100440131101165102384018181625100010001000144560399400222323910001000200040064111001100010000101921421060110211041016042192731161139710031000400401401400401
100438131100065002384218181625100010001000153821399400222325810001000200040082111001100010000102020421060101621000316042191731161139601031000401401401401401
100440021110167000366018181725100010001000153660400400222325710001000200040064111001100010000102020421060101621041311942192731161138410021000400401401400382
100438221110065102385018017251000100010001542303813992223258100010002000399821110011000100001020204210621016210410160421827311611397101001000401401401400401
100440031100165003384018181625100010001000144560399400222325810001000200040082111001100010000102019421019100621000316001917311611396101001000401383382382400
10043993111016500238421800251000100010001556314013812223258100010002000399821110011000100001019224210611016210003160421917311611378101031000401404383401383
1004381311110650033852018172510001000100015363140039922232581000100020004006411100110001000110191901060100681043311942191731161137801031000383401401400382

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr x0, [x6, x7, lsl #3]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e2223243a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
402057005452400001021000700396978559713254010830106100023010010000616159334211004966974070054700486465936495740100302001000060200200007043135114020110099100100003010010000110010003211000011111100001111000261037122698043000670710000301007005870055700557004270055
40204700545241001001100170039697855971325401043010310001301001000061606833423980496697407005470054646503650194010030200100006020020000700823511402011009910010000301001000001001000211100020114100001111100261027122698743000600710000301007005570055700557004270055
4020470041525100000122100170033697505970725401083010310002301001000061607833417690496696107005470054646503649574010030200100006020020000700843511402011009910010000301001000001001000310100010001100001111000261027122698043000677710000301007005570055700557005570055
40204700415251011001000170039697855971325401083010610002301001000061607833423980496697407004170054646503649574010030200100006020020000700963511402011009910010000301001000001001000111100020001100001111100261027122698173000677010000301007005570055700557005570055
40204700545241010002000170039697855971325401083010610002301001000061604133423981496697407005470054646503649444010030200100006020020000700553511402011009910010000301001000001001000211100030011100001111210261027122698173000677710000301007005570055700557005570055
402047004152510000020101700396970259713254010830106100023010010000616041334239809866961070115700606465036495740100302001000060200200007005735114020110099100100003010010000010010001311000200211000011111002610271226979830003071010000301007005570055700557005570042
40204700415241011002010170039697025971325401083010610002301001000061606833423981496697407005470054646373649574010030200100006020020000700853511402011009910010000301001000001001000221100020001100001111200261027122698043000607710000301007004270055700557005570096
40204701225251010001010070026697855971325401083010610002301001000061607833423980496696807004170054647753649514010030200100006020020000700543511402011009910010000301001000001001000131100020011100001101000261027122698173000677010000301007005570055700427005570042
40204700545251010001010070039697855971325401083010610002301001000061604133423980496697407005470054646503649574010030200100006060020000700863511402011009910010000301001000001001000130100030721100001010000261027122698173000377710000301007004270042700557005570055
40204700545251011002010170039697855971325401083010610002301001000061604133423980496697407004170054646503649574010030200100006020020000700593511402011009910010000301001000001001000121100010001100001101200261027112698103000377710000301007004970049700497004970055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0052

retire uop (01)cycle (02)030e0f18191e22233f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5l1d cache miss ld nonspec (bf)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
400257005252410001007003769759597152540014300101000130010100006170683342302149669557003570035646700364960400103002010000600202000070055351140021109101000030010100000101000001100000001000011252017111698153000311111410000300107006070053700667005370053
400247005252500001107004069780596952540014300131000130010100006170543342446149669557005570052646530364980400103002010000600202000070035351140021109101000030010100000101000001100000001000011252017111698183000314111410000300107003870107700557005870053
400247005252400001107003769780597142540010300101000030010100006170273342302149669557005270055646530364980400103002010000600202000070055351140021109101000030010100001101000001100000031000000252017111698153000311141410000300107007670054700567005670056
400247005552400001107003769787597142540014300131000130010100006170003342446149669757005270052646700364960400103002010000600202000070055351140021109101000030010100000101000001100000001000011252017111698153000311111110000300107006870040700577003970053
400247003552500001107003769782597142540014300131000130010100006170003342446149669757005570035646700364980400103002010000600202000070052351140021109101000030010100000101000001100000001000011252017111698183000311141410000300107011670043703987005970053
400247003552400001007002069776597112540014300101000130010100006170003342302149669727005570052646730364980400103002010000600202000070052351140021109101000030010100000101000000100000031000011252027111698153000311141110000300107009270055700607005370036
4002470055524000010070037697835971425400223001310001300101000061706833424461496697270035700526467003649804001030020100006002020000700603511400211091010000300101000001010000011000010010000112520171116981830000014010000300107010170053700567005370053
4002470035525000010070020697815971125400143001310001300101000061706833424461496697570052700526465303649774001030020100006002020000700553511400211091010000300101000001010000001000000010000112520271116981830003014010000300107008770056700587005670056
400247005252500001107003769786597142540014300131000130010100006170003342446149669727005270055646730364960400103002010000600202000070055351140021109101000030010100000101000001100000001000011252017111698153000314111110000300107005870042700567003670053
400247003552400001107004069827597112540014300131000130010100006170273341518149669777003570055646730364980400103002010000600202000070055351140021109101000030010100001101000001100000001000011252017111698153000314111410000300107041670057700917005570053

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldr x0, [x6, x7, lsl #3]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0052

retire uop (01)cycle (02)030e0f18191e22243f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5l1d cache miss ld nonspec (bf)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
402057005252400001300700376978359711254010430103100133010010000616023334230204966975700527005264648364955401003020010000602002000070127351140201100991001000030100100000100100001100000310000110261027111698153000311111110000301007005370053700537005370053
402047005252400301107003969783597131224010430103100013010010000616023334230204966972700527003564631364955401003020010000602002000070113351140201100991001000030100100000100100001100000010000110261017111698153000311111110000301007005370053700537005370053
40204700525240000110700376978359695254010430103100013010010000616023334230214966972700527005264648364955401003020010000602002000070093351140201100991001000030100100000100100001100000010000110261017111698153000311111110000301007005370053700537005370053
40204700525250000110700376978359711254010430103100013010010204616041334230204966972700527003564648364955401003020010000602002000070057351140201100991001000030100100000100100001100000010000110261011711698153000311111110000301007005370036700537005370053
40204700525240000110700376978359711254010430103100013010010000616023334230204966972700527005264648364955401003020010000602002000070093351140201100991001000030100100000100100001100000010000110261017111698173000311111110000301007005370053700537005370053
40204700525250000110700376978359711254010430103100003010010000616023334230204966972700527005264648364955401003020010000602002000070107351140201100991001000030100100000100100001100000010000110261017111698153000311111110000301007005370053700537005370053
40204700525250000110700376978359711254010430103100013010010000616023334230204966975704147005264650364955401003020010000602002000070093351140201100991001000030100100000100100001100000010000110261017111698153000311111110000301007005370053700537005370053
40204700525250000110700376978359711254010430103100013010010000616175334230204966972700527005264648364955401003020010000602002000070098351140201100991001000030100100000100100001100000010000110261017111698153000311111110000301007005370053700537005370053
40204700525240000110700206978159711254010430103100013010010000616023334230214966972700527005264648364961401003020010000602002000070090351140201100991001000030100100000100100001100000010003110261017111698153000011111110000301007005370053700537005370053
40204700525240000110700376978359711254010430103100013010010000616023334230214966972700527005264648364955401003020010000602002000070106351140201100991001000030100100000100100001100000310006110261017111698153000311111110000301007005870053700537003670053

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0058

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e1f22233a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
400257004152511000000100017004669782597012540018300161000230010100006170133342734049669817004170058646593649864001030020100006002020000700613511400211091010000300101000001010003111000300022100001111125201711169821300061111010000300107004270062700427006070062
400247004152510100000200007002669785597632540014300161000130010100006169953342734049669787006170061646793649874001030020100006002020000700413511400211091010000300101000001010002101000100011000011110252017111698823000314141410000300107004270042700627006270059
40024700415241111000026880007004369789597202540018300161000130010100006170543341769049669787006170078646793649834001030020100006002020000700613511400211091010000300101000001010002111000100011000011010252017112698213000614141410000300107006270062700447005970064
4002470041525111100011010070043697025972025400183001610001300101000061705433417690496698170041700416465936498640010300201000060020200007006135114002110910100003001010000010100012110002000410000110112520171116982430006140010000300107006270042700597004270138
400247005852411110000200007004369785597222540014300271000230010100006170813341769049669617004170041646593649864001030020100006002020000700583511400211091010000300101000001010001111000300241000011111254417111698213001711141410000300107006770063701547006670044
4002470063525100100105201007004369785597012540018300161000230010100006169953341769049669787006170061646763649864001030183100006002020000700583511400211091010000300101000001010001101000201204100001111225201711169821300031101410000300107006170062700627006270059
400247004952411000000201007004469782597222540018300161000230010100006170813341769049669787005870058646793649834001030020100006002020000700413511400211091010000300101000001010003201000103111000011110252017111698263000314141410000300107005970042700427006270059
4002470061524100011002000070026697855972025400183001610002300101000061709033455540496698170154700416467936498640010300201000060020200007005835114002110910100003001010000010100022010001001110000110102544171116980430006014010000300107005970062700627006270062
40024700415241010000020100700466978259701254001830016100023001010000616995334176904966978700617006664664364987400103002010000603442000070061351140021109101000030010100000101000111100020012318100001111025201711169810300061411010000300107005970042700627006270042
4002470058525100100002000070046697855971725400183001310002300101000061706333425900496698170061700586467936498640010300201000060020200007004135114002110910100003001010000110100021110002002110000011102520171116982130006140010000300107004270062700597006270059

Test 4: throughput

Count: 8

Code:

  ldr x0, [x6, x7, lsl #3]
  ldr x0, [x6, x7, lsl #3]
  ldr x0, [x6, x7, lsl #3]
  ldr x0, [x6, x7, lsl #3]
  ldr x0, [x6, x7, lsl #3]
  ldr x0, [x6, x7, lsl #3]
  ldr x0, [x6, x7, lsl #3]
  ldr x0, [x6, x7, lsl #3]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3339

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)191e3a3f494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)a5ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafbbl1d tlb miss nonspec (c1)c2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? ldst retires (ed)? int retires (ef)f5f6f7f8fd
802052671520011110212266990025801001008000010080015500116659014923634267152671416642616666801162008002420016004826716641180201100991008000010080000100800202008001910121800001919011151181160026711800001002671526715267152671526716
802042671420011010211266990025801001008000010080015500116659004923634267142671416642616666801162008002420016004826722641180201100991008000010080000100800201908001910221800001919011151180160026711800001002671526715267152671526715
802042671420011010212266990125801001008000010080015500116721504923634267142671416642616666801152008002420016004826915641180201100991008000010080000100800192108001920021800001919211151180160026718800001002671526717267152671526715
802042671420010100211266990125801001008000010080015500116683604923634267142671416642616666801152008002420016004826886651180201100991008000010080000100800202108001900021800001919011151180160026711800001002671526715267152671526715
802042671420011110211266990025801001008000010080016500116659004923634267142671516642616666801152008002420016004826719641180201100991008000010080000100801512008001920021800001919111151180160026711800001002671526715267152671526715
802042671420011000211266990125801001008000010080015500116683604923634268052671416642616666801152008002420016004826723641180201100991008000010080000100800202008001900221800001919111151180160026711800001002671526716267152671526715
802042671420011110211266990125801001008000010080015500116683604923634267142671416642616666801152008002420016004826723641180201100991008000010080000100800212108001910021800001919011151180160026711800001002671526715267152671526715
802042671420011110212266990125801001008000010080015500116720314923634267142671416642616666801152008002420016004827167641180201100991008000010080000100800211908001900021800001919011151180160026711800001002671526715267152671526716
802042671520010110211266990025801001008000010080021500116616414923634267142671416635916655801192008003020016006026842641180201100991008000010080000100805422008001910121800001919122251281231126711800001002671626715267152671526715
802042671420011110212266990025801001008000010080020500116646704923634267142671416632916655801212008003020016006026748641180201100991008000010080000100800192008001901021800001919022251291231126714800001002671526715267152671526716

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3338

retire uop (01)cycle (02)03l1i tlb fill (04)mmu table walk data (08)1e223a3f494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acafbbl1d cache miss ld nonspec (bf)branch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? ldst retires (ed)? int retires (ef)f5f6f7f8fd
80025267082000000226693002580010108000010800005011667504923628267082670816652316815800102080000201600002670856118002110910800001080000108000008000000080000000502411168172670880000102670926709267092670926709
800242670820000012266930025800101080000108000050116675049236282670826708166523167508001020800002016000026708561180021109108000010800001080000080000000800000015024151616142670580000102670926709267092670926709
800242670820000002266930025800101080000108000050116675049236282670826708166523167478001020800002016000026708561180021109108000010800001080000080000000800000005024161616162670580000102670926709267092670926709
800242671020000002266930025800101080000108000050116675049236282670826708166523167588001020800002016000026708561180021109108000010800001080000080000000800000005024131613172670580000102670926709267092670926709
8002426708200000022669300258001010800001080000501166750492362826708267081665231668880010208000020160000267085611800211091080000108000010800000800000008000004305024171616142670580000102670926709267092670926709
800242670820000002266930025800101080000108000050116675049236282679526708166523167758001020800002016000026708561180021109108000010800001080000080000000800000005024131616142670580000102671226709267092670926712
800242670820001002266960025800101080000108000050116675049236282670826708166563166908001020800002016000026718561180021109108000010800001080000080000020800000005096162415212685480000102674426709267092670926709
8002426708200001212266930025800101080000108000050116675049236282670826708166523166888001020800002016000026726561180021109108000010800001080000080000000800000005024111616142670580000102670926709267092670926709
800242670820000002266930025800101080000108000050116675049236282670826708166523166918001020800002016000026724561180021109108000010800001080000080000000800000005024161614162670580000102670926709267092670926709
800242670820000002266930025800101080000108000050117004449236312671226711166523166888001020800002016000026720561180021109108000010800001080000080000000800000005024141618192670580000102670926709267092670926709