Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (signed offset, 64-bit)

Test 1: uops

Code:

  ldp x0, x1, [x6, #8]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20053943000045101379312121625100010001000150651394394923127100020001000394351110011000100001000431039039103961394373316223911010410001000395395395395395
20043942000045001379212121625100010001000150651394394923128100020001000394351110011000100001000431039039103961394373216223911010410001000395395395395395
20043943000045101379212121625100010001000150140394394923127100020001000394351110011000100001000431039039103961394373216223911010410001000395395395395395
20043943000045101379212121625100010001000152261394394923127100020001000394351110011000100001000431039139103961394373216223911010410001000395395395397395
20043943000045101379212121625100010001000150141394394923127100020001000394351110011000100001000431039039103961394373216223911010410001000395395395395395
20043943000045101379212121625100010001000150141394394923127100020001000394351110011000100001000431039039103961394373216223911010410001000395395396395395
20043942000045101379212121625100010001000150140394394923127100020001000394351110011000100001000431039039103961394373216223911010410001000395395395395395
20043942000045101379212121625100010001000150650394394923127100020001000394351110011000100001000431039039103961394373216223911010410001000395395395395395
20043943010045101379212121625100010001000150530394394923127100020001000394351110011000100001000431039039103961394373216223911010410001000395395395395395
20043943000045101379212121625100010001000150650394394923127100020001000394351110011000100001000431039039103961394373216223911010410001000395395395395395

Test 2: Latency 1->3 (from the first loaded register, x0, to the base address register, x6; with chain penalty)

Chain cycles: 3

Code:

  ldp x0, x1, [x6, #8]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502057004752501000400100700206971159694254010430103100013010010000613756334130404966957700367023463405036369540100302002000060200100007004737114020110099100100003010010000110010000011000000010000100261026411697983000306010000401007003670048700487004870048
50204700525250110410000700326969859694254010030100100013026610000613756334190604966955700477003563400036369540100302002000060200100007004737114020110099100100003010010000010010000011000000010000100261016411698103000366610000401007004870048700487004870048
50204700475240010000000700326971159684254010030103100013010010000613756334190604966967700477004763400036370740100302002000060200100007004737114020110099100100003010010000010010000011000000010000110261016411698103000060610000401007005970048700407004870048
50204700475250000000000700206969859684254010430103100003010010000613915334130404966955700477004763388036369540100302002000060200100007004737114020110099100100003010010000010010000011000000010000100261016411698103000366610000401007004870036700367003670048
50204700475250010000100700206969859684254010430103100003010010000613756334130404966967700477004763388036370740885302002000060200100007004737114020110099100100003010010000010010008001000000010000100261016411698023000000010000401007003670036700367004870036
50204700475250000010100700206971159684254010430103100013010010000613715335142504966967700357004763403036369540100302002000061500100007003537114020110099100100003010010000010010000001000000010000110261016411697983000066010000401007004870036700367004870048
50204700355240000010010700326971659823254010030100100013010010000613756334190604966967700357003563400036370740100302002000060200100007004737114020110099100100003010010000010010000011000001010000100261016411697983000360010000401007003670048700487003670036
50204703235240000010010700326971159684254010030103100013010010000613652334190604966969700477003563388036370740100302002000060200100007004237114020110099100100003010010000010010000001000013010000100261016411698103000360010000401007003670048700487004870048
50204701055251000010000700206971159694254010430103100013010010000613652334190604966967700357004763388036370740100302002000060200100007003537114020110099100100003010010000010010000001000000010000110261016411698103000306610000401007003670048700367004870051
50204700505250110010000700326971159694254010430103100003010010000613756334144804966967700357003563400036370740100302002000060200100007003537114020110099100100003010010000010010000001000000010000000261016411697983000066610000401007004870036700487003670065

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0057

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002570060525101101000200270042697225969525400183001610002300101000061550833423980496697770057700576342536373940010300202000060020100007005737114002110910100003001010000101000211100010211000011112252176413969911300060101010000400107006270058700587004270042
5002470041524101101000210270042697225971025400183001610001300101000061550833423980496697770041700576340936373940010300202000060020100007005737114002110910100003001010000101000111100010011000011110252186411969820300061010010000400107005870042700587005870058
50024700575251001001002101700266969759710254001830013100013001010000615508334239804966977700577005763425363739400103002020138600201000070041371140021109101000030010100001010002201000200110000111122521106488698243000610101010000400107005870058700587005870058
5002470057525100101000210070042696975971025400183001610002300101000061550833423980496697770057700416342536372240010300202000060020100007005737114002110910100003001010000101000121100020111000011110252196497698203000610101010000400107005870058700427005870058
5002470057524101000000210070042697225971025400183001610002300101000061550833423980496696170057700576340936372240010300202000060020100007005737114002110910100003001010000101000121100020111000011111252196411869844300061010010000400107005870042700587005870042
500247005752511110000021017004269722597102540018300161000230010100006155083341614049669777005770041634253637394001030020200006002010000700573711400211091010000300101000010100022110002021100001111025219648969832300061001010000400107005870058700587005870042
500247005752410110000071017004269722597102540018300161000230010100006155083342398049669777005770057634253637394001030020200006002010000700573711400211091010000300101000010100033110002021100000111225219657769823300061010010000400107005870058700427005870058
500247005752410110000021017004269697597102540031300161000230010100006155083342398049669777004270063634253637254001030020200006002010000700573711400211091010000300101000010100022110001011100001111225217658769820300061010010000400107005870058700587005870058
50024700575251001010001101700426972259710254001830016100023001010000615508334239804966977700577005763425363739400103002020000600201000070057371140021109101000030010100001010002111000201110000111122521865117698243000610101010000400107005870058700587005870058
500257014652510010100020017004269697597102540018300161000130010100006151633342398049669777005770057634253637394001030020200006002010000700573711400211091010000300101000010100032110001011100001111125217641210698103000610101010000400107004270058700587005870058

Test 3: Latency 2->3 (from the second loaded register, x1, to the base address register, x6; with chain penalty)

Chain cycles: 3

Code:

  ldp x0, x1, [x6, #8]
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205700475240000000100070020697115972325401043010310001301111000761330633414734966967700477004763444076374240118302332002360266100127003537114020110099100100003010010000110010000110000001000011111262001600698573000006610000401007004870036700487004870048
50204700355240000000100070032697115973325401003010310001301111000761330633420754966967700357003563444036370740100302002000060200100007004737114020110099100100003010010000110010000110000001000011000261016411697983000066610000401007004870048700487003670048
5020470047525000000011321070032697115969425401043010010001301001000061375633419064966967700477004763388036374840100302002000060200100007004737114020110099100100003010010000010010000110000001000311000261016411698103000366610000401007004870048700487004870048
502047004752500000001000070032697115968425401043010310001301001000061375633419064966967700477004763400036370740100302002000060200100007004737114020110099100100003010010000110010000110000001000010000261016411698103000366610000401007004870036700487004870048
50204700475250000000101070032697115969425401043010310001301001000061459133419064966967700477004763400036370740100302002000060200100007004737114020110099100100003010010000010010000110000001000011000261016411698103000306610000401007004870048700487004870048
50204700475250000000001070032697145969425401043010310003301001000061365233419064966967700477003563400036370740100302002000060200100007004737114020110099100100003010010000010010000110000001000001000261016411698103000366610000401007004870048700487004870048
50204700475250000000100070032697115968425401043010310001301001000061375633413044966967700477004763400036369540100302002000060200100007004737114020110099100100003010010000010010000110000101000011000261016411698103000306610000401007004870048700487004870048
50204700475250000100000070032697115969425401043010310001301001000061375633419064966967700477003563400036370740100302002000060200100007003537114020110099100100003010010000010010000010000001000011000261011711698103000366610000401007004870048700487004870048
50204700475240000000100170032697115969425401043010310001301001000061375633419064964010700477005163400036370740100302002000060200100007003537114020110099100100003010010000010010000110000001000011000261016411698103000366610000401007004870048700487003670048
50204700475250000000100070020697115969425401043010310001301001000061375633419064966967700477004763400036370740100302002000060200100007004737114020110099100100003010010000110010000110000001000010000261016411697983000366610000401007004870048700487004870048

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0057

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500257005752411100021017004269722597102540018300161000230010100006155083342542149669770700607006063425036378640010300202000060020100007005737114002110910100003001010000110100022110002021100001111202520013641216698203000613101010000400107005870061700587006170058
50024700575241000002100700426972259710254001830016100023001010000615508334161414966961070041700576342803637434001030020200006002010000700573711400211091010000300101000011010002211000202110000111100252006641412698203000310101010000400107005870061700587006170058
500257012652510000121007002669722597132540014300161000230010100006151633342542149669770700577006063428036379040010300202000060020100007005737114002110910100003001010000010100021110001004100000111002520012641411698233000610101010000400107005870058700587004270058
5002470057525101100110170042697255971325400183001610002300101000061553533425420496697707006070057634280363785400103002020000600201000070041371140021109101000030010100000101000221100011111000011112025200116481469823300060131010000400107005870061700617006170042
50024700635251100001101700426972259710254001830016100023001010000615163334239804966977070057700606342503637414001030020200006002010000700413711400211091010000300101000001010003111000201110000111110252001165138698233000610101010000400107004270061700587007270058
50024700415251111002101700426969759710254001430016100023001010000615163334239814966961070060700416342803637604001030020200006002010065700573711400211091010000300101000001010001111000101110000111120252001364714698203000613131310000400107005870042700587005870058
50024700585251001002101700266972559710254001830016100023001010000615508334239804966977070060700576342503637514001030020200006002010000700603711400211091010000300101000001010002111000201110000111100252001464914698203000313101010000400107005870061700587006170058
50025700575251111001101700426972559712254001830016100023001010000615163334254204966961070057700576342803637884001030020200006002010000700603711400211091010000300101000001010002211000100110000111100252001364121369805300030131010000400107004270058700427005870042
50024700415251111002100700426972259710254001830016100023001010000615163334254204966977070057700576340903637774001030020200006002010000700413711400211091010000300101000001010001101000200110000111100252001364136698203000610101310000400107005870061700587004270058
50024700575251110005561007004269722597102540018300161000230010100006151633342398149669770700577005763409036374240010300202000060020100007005737114002110910100003001010000010100011110002011100001101002520076414769823300061313010000400107004270042700617006170058

Test 4: throughput

Count: 8

Code:

  ldp x0, x1, [x6, #8]
  ldp x0, x1, [x6, #8]
  ldp x0, x1, [x6, #8]
  ldp x0, x1, [x6, #8]
  ldp x0, x1, [x6, #8]
  ldp x0, x1, [x6, #8]
  ldp x0, x1, [x6, #8]
  ldp x0, x1, [x6, #8]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602052673220010108910326718201816258010010080000100800005001164410049236522681426738668503669080100200160000200800002671435118020110099100800001008000001008002022428005810159800386157421915110116112672909280000801002673326733267332671526733
1602042671420011101221032671721801625801001008000010080000500117309304923654268052673766570366728010020016000020080000267323511802011009910080000100800000100800192008005710159800386157421925110116212673599280000801002673326715267332673326733
160204267322001010651012671721818152580100100800001008000050011675301492365226846267436725036690801002001600002008000026732351180201100991008000010080000010080021210800561022180037015601915110116112672990280000801002673326715267332673326733
1602042673220010109210226717201816258010010080000100800005001170529149236522683526738666703667280100200160000200800002673235118020110099100800001008000001008001919428005700259800386157421905110116112673399280000801002673326733267332673326733
16020426732200111078103267173181815258010010080000100800005001167530149236522682826741666703669080100200160000200800002673235118020110099100800001008000001008001920428005700059800386157421905110116112671190280000801002673326733267332673326733
16020426732200111095003267173181816258010010080000100800005001171570149236522673226732679703669680100200160000200800002673235118020110099100800001008000001008001920428005700059800380158421925110116112672999080000801002673326715267152673326715
16020426732200110098003267172181816258010010080000100800005001180419149236522673826734678903681380100200160000200800002673235118020110099100800001008000001008002119428005610259800376157421915110116112672999280000801002671526733269002674426733
160204267322011110770032671721818172580100100800001008000050011706821492365226831267376663036690801002001600002008000026732351180201100991008000010080000010080021190800191005980038615701925110116112672999280000801002673326733267332673326733
1602042673220010108010226717218180258010010080000100800005001171570149236522684126737666203669080100200160000200800002673235118020110099100800001008000001008002120428005710159800006157421915110116112672999280000801002673326715267332671526733
16020426732200111071103267172181816258010010080000100800005001167999149236522683326738666303669080100200160000200800002671435118020110099100800001008000001008001919428005710421800006157421915110116112671199280000801002673326733267332673326733

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160025267262000081102267120181802580010108000010800005011746280492364726727267076668367028001020160000208000026707351180021109108000010800001108000039800390080000603505020029160028272695146280000800102672826728267082670826728
1600242672720011361012671221218025800101080000108000050117010704923647267272672266533671580010201600002080000267273511800211091080000108000011080000080000035800396135050200161600261626724106480000800102672826708267282672826728
160024267222000151002267122121216258001010800001080000501174628149236422672226727666836695800102016000020800002672235118002110910800001080000010800000800390398003961393950200271600252726724100080000800102672826728267282672826728
1600242670720000241022670701818025800101080000108000050116875414923647267222670766683671580010201600002080000267073511800211091080000108000001080000080000035800350135435020027160026262672406080000800102672826708267082670826723
16002426707200007810226712212016258001010800001080000501174628149236472672726727665336702800102016000020800002672235118002110910800001080000010800003980000039800006104350200281600173126704100280000800102670826723267232672826723
16002426722200001981022689021212162580010108000010800005011731831492362726727267276817367118001020160000208000026707351180021109108000010800000108000008017000800006139435020028160027182672446080000800102672826723267282670826728
160024267072000060102267072181211258001010800001080000501170107149236272672726729684836707800102016000020800002670735118002110910800001080000010800003980035008003961354350200261600252626704100480000800102670826708267282672326728
1600242670720000641022670721818162580010108000010800005011688801492364726727267226668366958001020160000208000026727351180021109108000010800000108000039800390398003901005020018160025152670406280000800102672326708267082670826728
16002426727200008400226692201216258001010800001080000501173183149236472672726727666836871800102016000020800002672735118002110910800001080000010800003980039039800516104350200271600172626724410480000800102672826708267082670826723
16002426710200000001267122181216258001010800001080000501168754149236472672726727666836710800102016000020800002670735118002110910800001080000010800003980039042800396104350200171600271726724106480000800102672826708267282672826708