Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDNP (64-bit)

Test 1: uops

Code:

  ldnp x0, x1, [x6]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ld nt uop (e6) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200538930010000010137401800251000100010001481203893898731321000200010003993511100110001000010212042105710059103861574219173116113861000106010001000375375395390395
200438930000000010338420181525100010001000153580381381963115100020001000381351110011000100011000001039000391039613543007311611378101999210001000399399399400382
200438131000000210033842181802510001000100015358039839996313210002000100039935111001100010000102119010570005910376057421907311611371103906410001000390375395375395
2004389200000004500035900121625100010001000148460394394923127100020001000389351110011000100001001001035000391035600430073116113711039100410001000395395395395395
2004394300000004500336601818162510001000100015358039839897311510002000100039935111001100010000100003910390000100060000073116113861039100210001000395375395395375
200439430000000451023590012162510001000100015014039439492312710002000100039435111001100010000101920010190002110006119421917311611386100066210001000375375395395395
20043743000010000023662001625100010001000153580398398973131100020001000399351110011000100001000039103900039103561350007311611391103906010001000395395395375395
2004394300001004500237921201125100010001000148120374374923107100020001000374351110011000100001000001039010391000013543007311611391100006410001000395395395395377
200438930000000450023792120025100010001000150530394394923107100020001000374351110011000100001020204210571115910380157019173116113871000106410001000395375395395375
2004389300001104500235901812025100010001000140750394374723107100020001000394351110011000100001000039103900039100001400007311611396105790210001000399400400399382

Test 2: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldnp x0, x1, [x6]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ld nt uop (e6) | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205700565251110000201017003869711597022540108301061000230100100006138463342202049669730700537005363406363713401003020020000602001000070053371140201100991001000030100100000100100021110001001710000111100026102641169819100013000606610000401007014970259704337004270154
50204700535241000001200017003869718597022540108301061000130100100006138733342202049669730700537005363407363713401003020020000602001000070055371140201100991001000030100100000100100022110001020410000111110026101782169876100013000666010000401007005470055700597005470054
50204700535251111000201017003869718597022540108301061000230100100006138463342202049669610700677042263522363713401003020020000602001000070054375140201100991001000030100100000100100011110003001410000111100026101641169817100013000666610000401007005470054700547005470054
50204700535251000000200007003869718597022540108301061000230100100006138463342202049669730700537005363394363713401003020020000602001000070053371140201100991001000030100100000100100102110001000110000111110026101782169820100013000366610000401007005470054700427005470042
50204700535251110000200017003869718597022540108301061000130100100006138463342202049669730700417004163394363713401003020020000602001000070041371140201100991001000030100100000100100031110001071110000110120026101641169818100013000360610000401007005470054700547005470057
50204700535241110000700017002669711597042540104301061000230100100006136463342202049669730700417005363394363687401003020020000602001000070053371140201100991001000030100100000100100022010001001110000111110026101641169819100013000660610000401007005470042701777005570058
50204700435241011100201017003869718597022540104301061000230100100006136463341607049669730700537004663406363713401003020020000602001000070041371140201100991001000030100100000100100032010002000110000111110026101641169820100013000306610000401007005470042700427005470054
50204700535240001100101007002069711596942540104301031000130100100006137563341906049669580700477004763400363707401003020020000602001000070047371140201100991001000030100100000100100000110000000010000101000026101640169810100003000066610000401007004870048700487003670048
50204700355240000000101007003269711596942540100301031000130100100006137563341906149669550700357004763388363707401003020020000602001000070047371140201100991001000030100100000100100000110000000010000101000026101642169812100003000360610000401007004870048700487004970036
50204700475240000000101007003269711596942540104301031000130100100006137563341906049669670700477004763400363707401003020020000602001000070047371140201100991001000030100100000100100000110000000010000101000026100641169834100003000366010000401007004870048700487004870048

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0057

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ld nt uop (e6) | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50025700575251011112101700426972559710254001430016100023001010000615508334268649669617006070060634293637394001030020200006002010000700573711400211091010000300101000001010001211000200011000011011025205640866982810001300061010010000400107006170042700617006170061
50024700605251101002000700456972559717254001830016100013001010000615508334254249669807006070060634093637424001030020200006002010000700573711400211091010000300101000001010003111000200241000001110025206640766982010001300061301310000400107006170061700427006170061
50024700605251001002100700456972559713254001830016100013001010000615508334161449669617006070060634283637424001030020200006002010000700413711400211091010000300101000001010002111000300341000011111025204640106698091000130006010010000400107006170061700427006170058
500247006052511000020007004569725597102540014300131000130010100006155353341614496696170173700616341336372240010300202000060020100007006037114002110910100003001010000010100022010002001110000111100252046507469807100013000313101310000400107006170061700427006170058
50024700605251001002101700266972259710254001430013100023001010000615535334254249669807004170060634283637224001030020200006002010000700573711400211091010000300101000001010001211000100241000011010025204640766980710001300061310010000400107005870061700617005870061
5002470060524110100200070026697255971025400183001610001300101000061553533425424966980700607006063409363722400103002020000600201000070060371140021109101000030010100000101000111100020004100001111102520564074698221000130006130010000400107005870042700427006170058
50024700605241111111000700456969759710254001830016100023001010000615508334254249669777006070041634283637424001030020200006002010000700413711400211091010000300101000001010001201000201141000001012025203650846982410001300061313010000400107006170058700587004270061
500247004152411110020007004269725597132540018300131000230010100006155083341614496696170041700416340936372240010300202000060020100667006037114002110910100003001010000010100021110002001110000111110252056408469822100013000613131310000400107005870042700587006170042
50024700415241111001000700456969759713254001830016100023001010000615508334254249669807006070060634283637424001030020200006002010000700603711400211091010000300101000001010002211000200111000011011025204640736982110001300061301310000400107005870061700427004270061
5002470041525111101200070042696975971025400143001610002300101000061553533416144966980700577004163409363722400103002020000600201000070041371140021109101000030010100000101000221100020011100001111102520464010669858100013000613101310000400107006170061700617005870042

Test 3: Latency 2->3 (with chain penalty)

Chain cycles: 3

Code:

  ldnp x0, x1, [x6]
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ld nt uop (e6) | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502057004752400001100432010070020697115972325401043010310002301111000761346533414731496695570035700476344086374640118302332002360266100127005037114020110099100100003010010000010010000011000000001000010100011126210160069845100003000306010000401007003670051700517005170051
5020470050525000010006000070020697115973325401043010010000301111000761320233420751496697070035700356350986373040118302332002360266100127005037114020110099100100003010010000010010000011000000001000000000011126210160069861100003000396910000401007005170051700367003670051
50204700505240000000070000070020697145972425401003010010001301111000761320233414731496697070051700506349176373140118302332002360266100127005037114020110099100100003010010000110010000011000000001000010100011126200160070192100003000306010000401007048470332703697027670048
5020470050525001001301479010070035697145973325401003010310006302501000061391533420561496697070035700476346236371040100302002000060200100007005037114020110099100100003010010000010010000011000000001000010100000026101641169813100003000300910000401007003670053700517003670042
50204700355250000000070000700356971459684254010430103100013010010000613915334205614966970700507005063447363707401003020020000602001000070050371140201100991001000030100100000100100000110000000451000010100000026101641169798100003000000910000401007003670036700517005170039
50204700505240000000031000070020697215970025401043010010001301001000061375633420561496399370051701176340436371940100302002000060200100007005037114020110099100100003010010000010010000001000000001000010000000026101641169813100003000306910000401007003670036700367005170051
5020470035524000000001000070020697115969425401043010310001301001000061391533420561496697070047700506349136370740100302002000060200100007003537114020110099100100003010010000010010000001000000001000010100000026101641169813100003000390610000401007005270036700517005170051
5020470050524000000000010070035697145970025401043010010001301001000061365233420561496695570047700476344736369540100302002000060200100007005037114020110099100100003010010000110010000011000000001000010100000226101641169813100003000006010000401007003670051700517005170048
5020470047525000000001010070035696985970025401043010010000301001000061391533413041496697670035700476342236371040100302002000060200100007003537114020110099100100003010010000010010000011000000001000011000000026101641169813100013000099010000401007004870042700577005770036
5020470050524000000001000070020697145970025401043010010000301001000061391533420561496695570047700476343136371340100302002000060200100007005037114020110099100100003010010000010010000011000000001000000000000026101641169813100003000300910000401007005170051700367005170051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | d2 | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ld nt uop (e6) | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50025700475250010111001000070035697145968925400143001010001300101000061510233418990049669717004770056634033637184001030020200006002010000700543711400211091010000300101000011010000011000050010000110025200678000131269802100003000096610000400107005170048700527005770051
5002470050525000000000100007020969714596772540014300101000030584100006168993341899114966994703217005663408363715400103066020000600201000070035371140021109101000030010100000101000000100004201000011202520011781011213698141000030003924610000400107004870051700487004870051
500247004752500000100000100700326971459689254001430013100013001010000615126334129500496703470035700356341536368240010303442011060020100007005037114002110910100003001010000010100000010000000100001000252001178000121169817100003000096010000400107003670051700907004870036
500247005652500000000010100700326971959694254001430010100003001010000615102334129501496700670050700556341836368240010300202000060020100007005037114002110910100003001010000010100000110000000100001000252001478000121369801100003000306010000400107003670036700367003670051
500247032752500000000010000700326971459677254001430013100013029610000615207334129501496701870038700496341536371840010300202000060020100007003537114002110910100003001010000010100000010000000100001100252001278000121269801100003000396610000400107003670039700487003670036
500247003552400000000000100700356971959692254001430013100013001010000615126334129500496699670048700476340336371540010300202000060020100007004737114002110910100003001010000010100000110000000100001000252001378000131369817100003000360610000400107003670036700517003670036
500247005052400000000000100700326971859689254001430013100013001010000614994334204600496732770086700386341836371840010300202000060020100007004737114002110910100003001010000010100000010000000100001100252001378000131369801100003000306610000400107003670048700487005170051
500247004752500000000010100700206971459689254001430013100013001010000615129334673700496701770053700486341836368240010300202000060020100007005037114002110910100003001010000010100000110000003100000100252001499000131369814100003000090610000400107003670048700487005170048
5002470047524000000000100007003269714596922540014300131000030010100006149943342046014966967700517003563415363682400103002020000600201000070050371140021109101000030010100000101000001100003001000011002520013781001356981410000300006181810000400107004870036700367003670048
500247005052500000100010000700206971459692254001030013100003001010000615021334129500496705070052700476341836371840010300202000060020100007005637114002110910100003001010000110100000110000003100000000252001299000131369817100003000306910000400107005170051700367004870051

Test 4: throughput

Count: 8

Code:

  ldnp x0, x1, [x6]
  ldnp x0, x1, [x6]
  ldnp x0, x1, [x6]
  ldnp x0, x1, [x6]
  ldnp x0, x1, [x6]
  ldnp x0, x1, [x6]
  ldnp x0, x1, [x6]
  ldnp x0, x1, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ld nt uop (e6) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602052672820000141102671201102580100100800001008000050011688804923629267272671766670366858010020016000020080000267313511802011009910080000100800001100800004308003904280038613943511011601426724800001010780000801002673226732267322673226732
16020426731201004400267160121025801001008000010080000500116877849237502673726727665703668980100200160000200800002673135118020110099100800001008000001008000043080038039800006139051103160242672880000140780000801002673226708267282673226732
160204267312000044012669221019258010010080000100800005001168627492373726707272096630036691801002001600002008000026739351180201100991008000010080000010080000430800380080038600051103160432672480038010080000801002672826711267322673226728
160204267072000000126716211219258010010080000100800005001168880492374926876267146654036685801002001600002008000026731351180201100991008000010080000010080000430800000080038603944511041603226728800381410080000801002673226708267282673226708
16020426707199000112671620119258010010080000100800005001174887492375326737267176664036689801002001600002008000026731351180201100991008000010080000010080000008003803980038013944511031603226728800381010780000801002673226732267082673226708
160204267312000045112671201019258010010080000100800005001168880492378126737267366663036689801002001600002008000026707351180201100991008000010080000010080000008003803980038013943511031602226728800381410780000801002673226708267082670826708
160204267312000044012671621119258010010080000100800005001168880492376626737267356674036689801002001600002008000026731351180201100991008000010080000010080000008003803980000600445110316031267048003800780000801002673226728267322673526728
16020426731200004401267162112162580100100800001008000050011688804923663267072674566620366678010020016000020080000267333511802011009910080000100800000100800004308000003880039613944511011601326728800391410080000801002673226732267282670826732
160204267312000010501267203010258010010080000100800005001174887492364726733267496654036691801002001600002008000026731351180201100991008000010080000010080000430800380388000060043511031605226728800381010080000801002673226708267322673226728
160204267072000044002671220119258010010080000100800005001168880492376026737267366668036689801002001600002008000026731351180201100991008000010080000010080000430800000388003861394451101160232672880038010780000801002673226708267322673226732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3348

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ld nt uop (e6) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002526729200100010126692000025800101080000108000050116805849236512670726731665303668780010201600002080000267073511800211091080000108000001080000080000138800396039440050200116112672880038140480000800102672826732267082673226732
160024267312000000000267122000258001010800001080000501172556492365126727267316672036687800102016000020800002670735118002110910800001080000010800004380038038800386104400502001161126728800381414780000800102673226708267322670826708
160024267312000000001267160111925800101080000108000050117765449236512670726707667203668780010201600002080000267073511800211091080000108000001080000438003900800396139440050200116112672880038014080000800102673226732267082673226708
1600242670720000045101267162101925800101080000108000050116888049236512673126731667603670780010201600002080000267313511800211091080000108000001080000438003910800366039440050200116112672880039140780000800102673226732267322673226728
1600242670720000045000266922101925800101080000108000050117563749236272673126731667703668780010201600002080000267273511800211091080000108000001080000438000000800326139440050200116112670480000147780000800102673226732267082673226732
1600242672720100001012669221002580010108000010800005011746284923651267312673166760367118001020160000208000026731351180021109108000010800000108000043800000080038003900050200116112672980000140780000800102673226732267082673226735
1600242673120000044101267162100258001010800001080000501179233492365126731267316676036711800102016000020800002670735118002110910800001080000010800004380038038800396039440050200116112672880038010780000800102673226728267082673226728
16002426707200010440012671600102580010108000010800005011772364923627267312673166770367078001020160000208000026707351180021109108000010800000108000043800380388003861390005020011611267288003800780000800102673226732267322673226708
1600242670720000044001266920111625800101080000108000050116850049236512672726727667603671180010201600002080000267333511800211091080000108000011080000080168038800396100005020011611267288016900780000800102673226708267322672826708
160024267072000004400026692000025800101080000108000050117394749236512673126731667703671180010201600002080000267313511800211091080000108000001080000438003804180000603901050200116112682580039014780000800102673226732267082673226732