Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (64-bit)

Test 1: uops

Code:

  ldp x0, x1, [x6]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | 92 | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2005402311010067102387277192510001000100015559402403101313610002000100040335111001010001000110192043106000060104161584319173616663991313510001000404403404405404
2004402311000066103387397192510001000100015575403402100313610002000100040235111001010001000010201943105911264104061594319073616663991313510001000404403404403404
200440331100006910238837719251000100010001553540340310031361000200010004023511100101000100011021200105901160104061604319173516553991313510001000403404404403403
2004402311101067103388277182510001000100015550402403100313510002000100040335111001010001000010202143105910061103961604319173616663991313510001000404403404404404
2004402311100066103387377202510001000100015538403403101313610002000100040335111001010001000010192043105911064104061594319173516664001313510001000403404403404404
2004403311100067103388277182510001000100015559402402101313510002000100040335111001010001000010192043105910163103961594319173616664001313510001000403405403404403
2004402311100067103389377202510001000100015535403403100313610002000100040235111001010001000010202043105910164104061584419073616664001313510001000404404403404404
2004403211000067102387277192510001000100015554403403100313510002000100040336111001010001000010202043105901064104161594319173616654001313510001000404404403404404
2004403310000076102387377192510001000100015521402402100313510002000100040335111001010001000010192143105811164103961594319273616664001313510001000404404403404403
2004402310000066103388377202510001000100015559402402101313610002000100040235111001010001000010192043106001061104061594319173616664001313510001000403404404404404

Test 2: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp x0, x1, [x6]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020570047525101110070057696985970025401003010310001301001000061391533413044966955700477004763403363707401003020020000602001000070047371140201100991001000030100100000100100001100001310000100261026411697983000396010000401007005170036700367005170051
5020470035525000601170035696985968425401043010010001301001000061391533420564966970700477004763400363695401003020020000602001000070050371140201100991001000030100100000100100001100000010000110261016411698133000300910000401007005170051700517003670048
502047005052500010107003269714596842540104301001000130100100006139153342056496695570047700506340336370740100302002000060200100007005037114020110099100100003010010000010010001110000019510000010261016411698133000366910000401007003670051700517003670051
5020470050525000000170038696985968425401043010310001301001000061391533419064966970700357004763400363710401003020020000602001000070047371140201100991001000030100100000100100000100000010000100261016411698133000390910000401007005170051700517005170051
50204700505250004800070039697145970025401003010310001301001000061391533420564966970700477005063434363695401003020020000602001000070035371140201100991001000030100100000100100000100001010000000261016412708053000096010000401007005670056700517003670051
5020470050525000101170035697145970025401043010010001301001000061391533413044966970700507003563403363707401003020020000602001000070059371140201100991001000030100100000100100001100000010000110261016411698133000096610000401007005170048700517005170051
5020470050524001101170035697145970025401003010310001301001000061365233420564966970700477003563403363695401003020020000602001000070035371140201100991001000030100100000100100001100001010000110261016411697983000000910000401007005170036700367003670051
5020470035525000000170032696985970025401003010310001301001000061391533413044966970700507005063388363710401003020020000602001000070050371140201100991001000030100100000100100001100000010000100261016411698133000090910000401007003670036700517005170036
5020470043525000100170032697145970025401043010310001301001000061375633420564963994700357005063404363695401003020020000602001000070050371140201100991001000030100100000100100001100000010000110261016411697983000390610000401007003670051700517004870038
5020470052525000101170098697145970025401043010010000301001000061365233413044966955700357015163403363710401003020020000602001000070035371140201100991001000030100100000100100001100000010000110261016411698133000306910000401007004870036700517003670051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500257004752401100101007005969714596922540014300131000230010100006151263342046149669670700507005063446363715400103002020000600201000070050371140021109101000030010100000101000001100000001000010000252017811698013000306910000400107005170051700517005170051
500247004752400000100007003569719596922540014300101000130010100006151263342046049669700700357005063415363683400103002020000600201000070035372140021109101000030010100001101000001100000001000011000252017811698173000006910000400107003670036700517003670051
500247005052500000000007002069714596772540014300101000030010100006149943342046049669550700357003563403363718400103002020000600201000070050371140021109101000030010100001101000001100000001000011000252017813698173000396010000400107005170051700517003670051
500247003552500000000007002069714596922540014300131000130010100006151263341536049669700700507005063418363718400103002020000600201000070035371140021109101000030010100000101000001100000001000000000252037811698143000396910000400107005170036700367005170036
500247005052500000001007003569714596922540014300101000130010100006149943342046049669700700507005063415363718400103002020000600201000070050371140021109101000030010100000101000000100000101000001000252017811698173000090910000400107003670051700517005170051
500247005052500000100007006969718596772540014300101000130010100006151263341295049669550700507005063418363718400103002020000600201000070050371140021109101000030010100001101000001100000001000011000252017813698173000309010000400107005170051700517005170051
50024700505250000064201007003269718596892540014300131000130010100006149943342046049669700700507005063418363718400103002020000600201000070035371140021109101000030010100000101000000100000001000011000252017813698013000360910000400107005170051700517005170036
500247005052500000000007003269714596922540014300131000130010100006151263341295049669670700507004763403363718400103002020000600201000070050371140021109101000030010100000101000000100000001000011000252039911698173000396910000400107003670051700517003670036
500247003552500000000007003269714596772540010300131000130010100006149943341295049669550700507004763418363718400103002020000600201000070050371140021109101000030010100000101000000100000101000011000252037811698173000306910000400107003670051700517004870051
500247003552500110001007002069719596922540014300131000130010100006151263342046049669550700507003563418363682400103002020000600201000070050371140021109101000030010100000101000001100000001000001000252019913698173000099010000400107005170092701137005170053

Test 3: Latency 2->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp x0, x1, [x6]
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0057

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502057005752511100210070042698275970725401083010610002301001000061391533416070496697707005770057634123636874010030200200006020010000700603711402011009910010000301001000011001000111100010111000011010261027811698513000613101310000401007005870058700617006170042
502047006052411111210070045697255970925401083010610001301001000061394233425380496698037004170060634113637204010030200200006020010000700603711402011009910010000301001000001001000330100010211000001111261016411698603000610131310000401007004270061700617006170042
50204700605241010020007002669725596882540104301061000230100100006139423342538049669800700607006063413363720401003020020000602001000070057371140201100991001000030100100000100100012110001011100001111026101641169845300030131310000401007005870061700427006170058
5020470060525111002000700456972559688254010830103100013010010000613942334253814966961070060700606341336372040100302002000060200100007004137114020110099100100003010010000010010001211000211410000110102610164116986630006130010000401007006170042700617006170042
50204700575251000070017002669722596882540108301061000230100100006136463342394049669610700417006063413363720401003020020000602001000070060371140201100991001000030100100000100100022110005011100001101026101641169854300060131310000401007006170042700617006170061
50204700575251110021007004569711597062540108301031000230100100006139423342394049669770700607006063410363720401003020020000605941000070060371140201100991001000030100100000100100013110002001100001111126101641169867300061010010000401007005870042700587005870058
502047006052410100200170045697225970625401043010610002301001000061364633457630496698007004170060634133637204010030200200006020010000700603711402011009910010000301001000001001000220100020111000011111261016411698823000310101010000401007005870042700587005870058
50204700575241010020007004569725597092540108301061000130100100006139153342394049669610700607004163413363720401003020020000602001000070060371140201100991001000030100100000100100011110003001100001111026101641169867300060131010000401007005870042700617004270061
502047005752511000200070045697255968825401043010610002301001000061394233425381496696107006070060634133637204010030200200006020010000700603711402011009910010000301001000001001000111100020111000001111261017811698913000613131310000401007006170061700617004270042
50204700605241001110007004569711597092540108301061000230100100006139423342538149669770700417004163410363717401003020020000605721000070057371140201100991001000030100100000100100013110002001100001111026101641269880300060101010000401007006170042700617004270061

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002570047524010001010700206971459689254001430013100013001010000615102334129514966955070047700476341536371540010300202000060020100007005037114002110910100003001010000101000001000010100001100252057834698013000366610000400107004870036700487004870048
5002470050525000001000700326971459692254001430013100003001010000614994334189914966970070050700356341836371540010300202000060020100007005037114002110910100003001010000101000001000010100001100252047836698173000399010000400107005170036700487005170051
5002470035525000009000700206971459692254001430010100013001010000615126334204614966970070035700476341536371540010300202000060020100007005037114002110910100003001010000101000011000000100001000252047844698173000306910000400107003670051700517005170036
5002570050524000001010700326971959708254001030013100013001010000615126334209414966955070050700356341836368240010300202000060020100007003537114002110910100003001010000101000001000000100001000252059943698173000096610000400107004870048700367004870051
5002470035525000000000700206971859677254001430010100013001010000615102334129514966970070047700356341536371840010300202000060020100007003537114002110910100003001010000101000011000000100001100252057844698143000309010000400107003670051700487004870036
5002470047525010001010700356971459677254001430013100003001010000615102334204614966970070035700476340336371840010300202000060020100007005037114002110910100003001010000101000011000013100000000252039934698173000309910000400107005170048700487005170036
5002470035525000001000700206971459692254001030013100003001010000615102334189914966970070050700356341836371540010300202000060020100007005037114002110910100003001010000101000001000000100000000252059955698143000396010000400107005170036700517004870048
5002470047524000001010700356971459677254001430010100013001010000615126334204614966970070035700476348036368240010300202000060020100007005037114002110910100003001010000101000001000000100001000252029924698173000396610000400107005170051700367004870051
5002470050525010001010700356971959692254001430013100013001010000615126334204614966970070050700506341836368240010300202000060020100007005037114002110910100003001010000101000001000000100000100252067844698173000399910000400107005470052700517005170054
5002470050525010101000700356971859692254001030013100013001010000615021334189914966970070050700506341836377540010300202000060020100007005037114002110910100003001010000101000011000000100001000252047835698143001306010000400107034270126700367005370051

Test 4: throughput

Count: 8

Code:

  ldp x0, x1, [x6]
  ldp x0, x1, [x6]
  ldp x0, x1, [x6]
  ldp x0, x1, [x6]
  ldp x0, x1, [x6]
  ldp x0, x1, [x6]
  ldp x0, x1, [x6]
  ldp x0, x1, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602052672620010000000000100267072181802580100100800001008000050011688801492364226722267226645036680801002001600002008000026722351180201100991008000010080000010080000039080035000358000061353951102162226726060280000801002670826723267232670826708
160204267222000000011004101012670701801225801001008000010080000500117583714923642267072672266450366658010020016000020080000267223511802011009910080000100800000100800000390800350000800350003951102162226720066280000801002672326723267082672326723
160204267272000000000000010126692218121225801001008000010080000500116888014923642267222672266450366658010020016000020080000267273511802011009910080000100800000100800000390800350000800350104351102162226728006280000801002670826708267082672326723
16020426707200000000000001012670720016258010010080000100800005001170107049236422672226722664503668080100200160000200800002672235118020110099100800001008000001008000000080035000358003560363951102162326735006280000801002672326723267082672326723
1602042672219900000000041010126707200122580231100801301028000050011753591492362726722267226645036680801002001600002008000026722351180201100991008000010080000010080000239318003501035800006103951102162226726066280000801002672326723267232671126723
160204267072000000000005300002733401218122580100100800001008000050011701071492364426707267226630036670801002001600002008000026722351180201100991008000010080000010080000039080000020388039001353951102162226708006080000801002672326708267232672326723
16020426707200000000000000012671021201225801001008000010080000500117010704923647267222670766450366658010020016000020080173267273511802011009910080000100800000100800000390800350003800356135051102162226707060280000801002670826726267282670826723
160204267222000000000004101002669220181225801001008000010080000500116888004923642267272672266450366808010020016000020080000267223511802011009910080000100800000100800000390800360000800356135051102162226728006280000801002670826708267082672326728
16020426722200000000000410001266922180112580100100800001008000050011701071492364226727267076645036665801002001600002008000026722351180201100991008000010080000010080000039080035000080035613539511021622267240100280000801002672326723267232672326723
160204267072000000010004100002671201818325801001008000010080000500116752004923642267222672266450366808010020016000020080000267223511802011009910080000100800000100800000390800000003980039003539511021622267190010280000801002672326723267232672326708

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002526733201111010006510026696212181625800101080000108000050116888049236272670826732665336707800102016000020800002679235118002110910800001080000010800000008003901039800396103900502001161126719106480000800102672826728267082672826728
1600242672220000000000451012669220181225800101080000108000050117010749236472673026727667236702800102016000020800002675835118002110910800001080000010800000390800390003980039600000502001161126704100480000800102670826728267082672826728
16002426727200000000004500026712012121625801421080000108000050116888049236272672726707665336707800102016000020800002676835118002110910800001080000010800000008003500008003961353900502001161126724010080000800102672826723267282672826728
160024267072000000000001002671201212162580010108000010800005011687544923627267272672766723668780010201600002080000267193511800211091080000108000001080000000800000003980040013943005020011611267241010080000800102670826723267232672326728
1600242670720000000000451002671200121625800101080000108000050117010749236472670726722665336687800102016000020800002686335118002110910800001080000010800000390800390003580039610430050200116112670446080000800102672826728267082670826728
1600242672720000000000411022669200121625800101080000108000050117462849236472672726727667336687800102016000020800002673235118002110910800001080000010800000008003900039800396139430050200116112672404480000800102682726728267282672826728
1600242672720000000000000226712212016258001010800001080000501174628492364726727267276653367028001020160000208000026737351180021109108000010800000108000003908000000008003961393900502001161126723010080000800102670826728267282672626708
16002426722200000000004500026714212016258001010800001080000501173183492364726727267276672367078001020160354208000026732351180021109108000010800000108000003908003501008003501400005020011611267041010480000800102672826708267282670826728
16002426707200000000004510226692212016258001010800001080000501173183492362726727267276653366878001020160000208000026719351180021109108000010800000108000000080000000080039610000502001161126719010080000800102672326728267282672326708
16002426727200000010004500026692018181625800101080000108000050116875449236472670726727667236702800102016000020800002674535118002110910800001080000010800000390800000005480039603943005020011611267241010080000800102672326708267282672326728