Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSW (register, uxtw)

Test 1: uops

Code:

  ldrsw x0, [x6, w7, uxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005399300450023792121216251000100010001503713943942173252100010002000395711110011000100010003910390351040613543733162239110641000395395395395395
10043943004500137921212162510001000100015037139439421732521000100020003997111100110001000100039103503910406135437321622391101041000397395395395395
1004389300410023792121216251000100010001498913943942173252100010002000400711110011000100010003910390391039613543732162239110641000395395398395395
10043942004500237921212122510001000100015018138939421732521000100020003947111100110001000100039103903510396135397321622391101041000395395395395395
1004391200450023792121216251000100010001503713943942173252100010002000394711110011000100010003910390391035613543732162238810641000395395390395395
10043893004500237921212162510001000100015037139439421632521000100020003947111100110001000100039103903910356135437321622391101041000395395392390395
10043943004100237921212162510001000100014989139439421732521000100020003947111100110001000100039103503910396135397331622391101041000395395394395395
10043942004500237921212162510001000100014989139739421732521000100020003947111100110001000100039103903510356135437321622386101021000395395395395395
1004394300450013792121216251000100010001503713943892173252100010002000394711110011000100010003910390391039613543732162239110621000390392395395395
1004394300450023792121216251000100010001503713943942163247100010002000394711110011000100010003910390391039613543732162238610641000390395395395395

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570051524010110060100700366978259713254010430103100013010010000616014334225414966971700517005164647364954401003020010000602002000070051351140201100991001000030100100001100100000110000000100001100261017111698373000610101010000301007005270052700527005270052
4020470051525000000010100700366978259710254010430103100003010010000616014334225414966971700517005164647364954401003020010000602002000070051351140201100991001000030100100000100100000110000000100001100261017111698143000310101010000301007005270052700527005270052
4020470051524000000010100700206978259710254010430103100013010010000616014334147004966971700517005164647364954401003020010000602002000070051351140201100991001000030100100000100100000110000000100001100261017111698143000310101010000301007005270052700367005270052
4020470051525000000010100700366978259695254010430103100013010010000616014334225414965821700597005164647364954401003020010000602002000070051351140201100991001000030100100000100100000110000000100001000261017111699173000310101010000301007005270052700527007970052
4020470051524000000010000700206978259710254010430103100013010010000616014334225414966971700517005164647364954401003020010000602002000070051351140201100991001000030100100000100100000110000000100001100261017111698143000010101010000301007005270052700527005270036
4020470051524000000010100700366978259710254010430103100013010010000616014334225404966971700517005164647364954401003020010000602002000070051351140201100991001000030100100000100100000110000000100001100261017111698143000310101010000301007005270052700527005270055
4020470051525000000010100700366980059715254010430103100013010010000616014334225404966955700517003564647364954401003020010000602002000070051351140201100991001000030100100000100100000110000000100001100261017111698143000310101010000301007005270052700527005270052
4020470035525000000010000700366978259710254010430103100013010010000616014334225404966971700517005164647364954401003020010000602002000070051351140201100991001000030100100000100100000010000000100001000261017111698143000310101010000301007005270052700527005270052
4020470051525000000010100700366978259710254010430103100013010010000616014334225414966971700517003564647364954401003020010000602002000070035351140201100991001000030100100000100100000110000000100001100261017111698143000310101010000301007005270052700527005270052
4020470051524000000010100700366978259710254010430103100013010010000616014334225414966971700517005164647364954401003020010000602002000070051351140201100991001000030100100001100100000010000000100001100261017111698143000310101010000301007003670052700527005270052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005552411101000101001700386977759701254001830016100023001010000616952334206214966967700477004764665021649784001030020100006100420000700493511400211091010000300101000001010002211000102110000101000025200171011697983000366610000300107004870051700487004870048
40024700355250000000060100170038697775971225400143001310001300101000061695233420620496696770035700356466503649724001030020100006002020000700473511400211091010000300101000001010000011000010010000101000025200171011698103000060610000300107004870048700487004870048
40024700475240000110060000170026697775971525400183001610001300101000061700933423501496697370053700536467103649664001030020100006002020000700473511400211091010000300101000001010000011000000010000101000025203171011698043000660610000300107005470054700547005470054
40024700415251111000020100070032697285970625400143001310000300101000061695233414701496696770047700356466503649724001030020100006002020000701153511400211091010000300101000001010001211000203110006110110025203171011697983000366610000300107004870048700487005070048
40024700475250011000020100070032697285970625400143001310001300101000061695233414700496696770047700476466503649724001030020100006002020000700473511400211091010000300101000001010000011000000010000101001025200271011698163000666610000300107005470054700547005470054
40024700535251100000010100070032697285970625400103001310001300101000061695233420620496695570047700356466503649724001030020100006002020000700533511400211091010000300101000001010000011000010010003101000025200171011698103000366010000300107003670048700487004870048
40024700355250000000010100070026697775970125400143001610002300101000061700933423500496696170053700536467103649784001030020100006002020000700473511400211091010000300101000001010000011000010010000101000025200271111698103001206010000300107031370140700487004970036
40024700355240000000010000070032697285970625400103001310001300101000061695233414700496696770047700356466503649604001030020100006002020000700533511400211091010000300101000001010002101000100110000111110025200171011697983000366610000300107003670048700367004870036
40024700475240000000060000170026697775971225400183001610002300101000061700933417690496697370041700536467103649784001030020100006002020000700473511400211091010000300101000001010000011000000010000101000025203171011698103000366610000300107004870036700487003670048
40024700475240000000010100070020697285970625400143001310001300101000061706833420620496696770047700476465303649724001030020100006002020000700533511400211091010000300101000001010002211000100410000111110025200271011697983000300610000300107004870036700487004870048

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0057

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700515251100000200017004269788597162540108301061000230100100006160683341769149669777005770057646530364960401003020010000602002000070057351140201100991001000030100100000100100023110002000110000111110261017111698203000613101310000301007005870058700587005870058
40204700575241111100201017004269788597162540108301061000230100100006160683342542049669777005770057646537364964401003020010000602002000070057351140201100991001000030100100000100100012110001003110000111100261017111698203000610101010000301007004270058700587004270058
402047005752410100002000170042697885971625401083010610002301001005161752233425421496697770112700576469101264966401003052610109602002000070108351140201100991001000030100100000100100021110001001110000111110263517211700103000610101010000301007005870058700587005870058
40204700575241000000140101700456978859716254010430103100023010010000616068334254214966961700577005764653036494440100302001000060200200007005735114020110099100100003010010000010010002211000100041000011111026101711169820300061001010000301007009370058700587005870058
402047006352510200112841921027004269788597012540108301061000230100100006160683341769049669617005770059646530364947401003020010000602002000070059351140201100991001000030100100000100100023110002012110000111110261017111698203000610101010000301007005870058700587005870061
402047005752511100001401007004269788597162540108301061000230100100006183433342542149669817005770137646530364946401003020010000602002000070057351140201100991001000030100100000100100032110002002110000111110261017111698203000610101010000301007005870042700587004270042
4020470057525101000020101700426978859716254010830106100023010010000616068334176914966977700577005764653036496040100302001000060200200007004135114020110099100100003010010000010010002111000100211000011111026101711170082300061010010000301007005870058700587010070058
40204700575251110000201017004269788597162540104301061000230100100006160683342542149669617005770057646530364947401003020010000602002000070057351140201100991001000030100100000100100022110002000410000111100261017111699173000610101010000301007005870058700587005870058
402047005752511000002000170026697885971625401083010610002301001000061606833425421496697770057700576465303649604010030200100006020020000700573511402011009910010000301001000001001000221100020011100001111202610171116982030003100010000301007004270101700587005870058
402047005752411100002010170042697025971625401083010610002301001000061606833425421496697770057700576465303649604010030200100006020020000700413511402011009910010000301001000011001000121100010011100001111002610171116993930006010010000301007004470058700587005870058

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | 0e | 0f | 18 | 19 | 1e | 22 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570057525000000117003669743597102540014300131000130010100006170683342254049669710700517005164669365055400103002010000600202000070051351140021109101000030010100000101000011000000010000110252017111698143000310101010000300107005270053700547005270052
400247005152400000011700366977559710254001430013100043015510000616991334147014966971070051700516466936501140010300201000060020200007005135114002110910100003001010000010100001100000001000011025201711169814300001001010000300107005270052700527003670052
4002470035525000000017003669775596952540014300101000130010100006169913341470149669710700357005164669365049400103002010000600202000070051351140021109101000030010100001101000011000000010000110252017111698143000310101010000300107005270052700527005270052
4002470051525000100617003669775597102540014300131000130010100006169913342254149669713700357005164669365046400103002010000600202000070035351140021109101000030010100001101000011000000010000110252017111698143000310101010000300107005270037700947005270052
4002470051525000000117003669775597102540010300131000130010100006169913341470049669550700517005164669365008400103002010000600202000070051351140021109101000030010100000101000011000000010000110252017111698143000010101010000300107005270052700527005270052
4002470051525000000617003669775597102540014300131000130010100006169913341470049669710700517005164669365045400103002010000600202000070051351140021109101000030010100000101000011000000010000010252017121698073000310101010000300107005270052700527005270052
400247005152400000011117003669775596952540014300131000130010100006169913341470049669710700357005164669364985400103002010000600202000070051351140021109101000030010100001101000011000000010000100252017111698143000310101010000300107005270052700367003670036
400247005152400000011700366977559710254001430013100013001010000616991334147014966971070051700356466936501340010300201000060020200007005135114002110910100003001010000010100001100000001000011025201711269814300030101010000300107005270052700527005270052
4002470051525000000117003669775597102540010300131000130010100006169913342254049669550700357005164669365082400103002010000600202000070051351140021109101000030010100000101000011000000010000110255517111698143000310101010000300107005270052700527005270052
4002470051525000000017003669775597102540010300131000130010100006169913341470049669710700517005164669364976400103002010000600202000070051351140021109101000030010100001101000011000000010000100252017111697983000310101010000300107005270052700527005270052

Test 4: throughput

Count: 8

Code:

  ldrsw x0, [x6, w7, uxtw]
  ldrsw x0, [x6, w7, uxtw]
  ldrsw x0, [x6, w7, uxtw]
  ldrsw x0, [x6, w7, uxtw]
  ldrsw x0, [x6, w7, uxtw]
  ldrsw x0, [x6, w7, uxtw]
  ldrsw x0, [x6, w7, uxtw]
  ldrsw x0, [x6, w7, uxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267372011001000067010326726377192580100100800001008001550011699491492365626736267361666461668880113200800242001600482673785118020110099010080000100800000100800202143800601006380039615843190111511811602673513135800001002673726737267372673726737
80204267362001111000079010326721277192580100100800001008001350011685101492365626736267361666361668980116200800242001600482673686118020110099010080000100800000100800191943800591026080041615943191111511801602673413135800001002673726737267372673726738
80204267372001111000066010226725077202580100100800001008001650011673001492425926749267391666461668880115200800242001600482673685118020110099010080000100800000100800201943800590316180040615843191111511801602674013135800001002673726737267372673726738
80204267372001001000021010326721077182580100100800001008001250011673771492365626736267361666361668880113200800242001600482673685118020110099010080000100800000100800201943800580016180040615943191111511801602673313135800001002673726737267382673826909
80204267362001111000067010326726377192580100100800001008001550011687541492365626737267361666461668880113200800242001600482673785118020110099010080000100800000100800212043800580016380040615843191111511801602673313135800001002673726737267152673726737
802042673620011010000670102267252792812380100100800001008001550011699491492365626737267361666461668880113200800242001600482673785118020110099010080000100800000100800202043800591006180040615943190111511801602673413135800001002673726737267372673826737
80204267372001111000066010326739277192580100100800001008001350011724931492365626736267361666361668980116200800242001600482673685118020110099010080000100800000100800191943801912006180039615843190111511801602673413135800001002673726737267372671526738
8020426737200101100006601032672627719258010010080000100800155001169949149236562673626736166646166888011620080024200160048267368611802011009901008000010080000010080019204380059000618004061594319011151180160267341305800001002674126741267412674526737
80204267362001010000066010126729277192580100100800001008001550011661711492365726736267361666461668880115200800242001600482673685118020110099010080000100800000100800202043800591016080042615943190111511801602673313135800001002673726744267372673726738
80204267372001100000067000226730077222580100100800001008001450011676551492365626739267371666461668880115200802242001600482673685118020110099010080000100800001100800191943800590006180040615943191111511801602673413135800001002673726737267412673726737

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526736200101101100670102267213702025800101080000108000050116758400492363526737267361665931671680010208000020160000267156411800211091080000108000011080019204308005900059800406159431905020516033267331313080000102673826738267382671626737
800242673620110101000067000226700077202580010108000010800005011679460149236572673626715166823167178001020800002016000026736851180021109108000010800000108001921430800190026180040605843191502021602326720013580000102671626738267162673726737
800242673620110111000067010326722270192580010108000010800005011679850049236562673726736166813167168001020800002016000026737851180021109108000010800000108001920430800581026180040605801915020416044267341313080000102673826716267162671626740
800242678520210010000066000226700277192580010108000010800005011799560149236352673626715166823167178001020800002016000026737641180021109108000010800000108001919430800590006380040016043191502031604426733013580000102673726715267162671626737
8002426736201101100000660002267213001925800101080000108000050116774401492365726714267361668131671680010208000020160000267158511800211091080000108000001080019204308005900061800006118431915020316044267121313580000102673826737267152673726738
8002426805200100010000660001267213772025800101080000108000050116696000492363426736267151666031671680010208000020160000267368511800211091080000108000001080019190080149117061800406158431925020316055267331313580000102673726738267152671526716
8002426737201101000000670102267232770258001010800001080000501165550014923656267372673616681316695800102080000201600002673764118002110910800001080000010800191943080059102618000001580192502041604426738013580000102674026738267182673826737
80024267372011000001007601032672227719258001010800001080000501172549014923635267362673616682316717800102080000201600002673685118002110910800001080000010800202043080019001648004000580192502041604426734013080000102673826719267382673926737
80024268162011010100006700012672137720258001010800001080000501166960014923656267372673716681316716800102080000201600002673764118002110910800001080000010800191943080058000638004061190191502041604526734013080000102673726738267382673826715
8002426736201101110000210003267002071258001010800001080000501168158014923656267372673716681316716800102080000201600002671786118002110910800001080000010800212143080019000608003961594319050202160442673300580000102673726737267372671626716