Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDPSW (signed offset)

Test 1: uops

Code:

  ldpsw x0, x1, [x6, #8]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200539431004500379011162510001000100015246139439496310710002000100039835111001100010000100043100004110386139447311611391140710001000399399399399375
20043983000451038320016251000100010001528813783949231311000200010003943511100110001000010004310390381039013907311611371140710001000399399399399395
2004398200001138320119251000100010001407503763959231311000200010003983511100110001000010004310380411038613907311611395140410001000399375375399399
200439430004410383210182510001000100014075039837673313110002000100039835111001100010000100043100003810386139447311611395010410001000399399399399399
200437430000013832010251000100010001524614034127831311000200010003943511100110001000010000100003810006038073116113951414010001000375399399399375
20043982000450138321201925100010001000140751394398723127100020001000374351110011000100001000431039038103860394373116113711010010001000403380375377375
200437430004410383211219251000100010001407513943949631071000200010003983511100110001000010000100003810390138447311611371140710001000375399375399375
2004398300068003790101925100010001000152841394398963131100020001000398351110011000100001000431038001038013944731161137100710001000399399375399399
200437430000013592121219251000100010001528403983747231311000200010003943511100110001000010004410000381000010437311611371014710001000395399375375395
2004374301101135921102510001000100015053139837493312710002000100039835111001100010000100001038041103961007311611391010010001000399399400399375

Test 2: Latency 1->3 (operand 1, x0, fed back into operand 3, the base register x6; with chain penalty)

Chain cycles: 3

Code:

  ldpsw x0, x1, [x6, #8]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205700535251110021017003869718597022540108301061000230100100006138463342202496697370053700536340603637134010030200200006020010000700533711402011009910010000301001000001001000221100011021310000111110261026411698163000666610000401007005470054700547005470091
502047005352511110210170038697185970225401083010610002301001000061384633422024966973700537005363406036371340100302002000060200100007005337114020110099100100003010010000010010001211000140110000111110261016411698163000666610000401007005470054700547005470068
5020470058543110002101700386971859702254010830150100023010010000613846334220249669737052970053634060363713401003020020000602001000070053371140201100991001000030100100000100100012010001430110000111120261016411698163000666610000401007005470054700547005470106
502047005352511100210170038697185970225401083010610002301001000061384633422024966973700537005363394036371340100302002000060200100007005337114020110099100100003010010000010010001211000110010000101000261016411698103000366610000401007004870048700487004870079
502047004752500000110070020696985969425401043010310001301001000061375633413044966967700477004763400036370740100302002000060200100007003537114020110099100100003010010000010010000011000040010000101000261016411698103000366610000401007004870048700487003670053
5020470047525000000100700326969859694254010430103100013010010000613756334190649669677004770047634000363707401003020020000602001000070047371140201100991001000030100100000100100000110000230010000101000261016411698103000366610000401007004870048700487004870065
502047004752400000100070032697115968425401043010310001301001000061375633419064966967700477004763400036370740100302002000060200100007004737114020110099100100003010010000010010000011000060010000101000261016411698103000666010000401007005470054700547004270097
50204700535251110121017003869718597022540108301061000230100100006138463342202496697370053700536340601363713401003020020000602001000070053371140201100991001000030100100000100100021110001411110000111110261016411698163000606610000401007005470054700547005470055
5020470053525111002101700386971859702254010830106100023010010000613846334220249669737005370041634060363713401003020020000602001000070053371140201100991001000030100100000100100022110001310110000111110261016411698163000666610000401007005470054700607005470060
502047005352511100210170038697185968825401083010610002301001000061364633422024966973700537005363406036371340100302002000060200100007005337114020110099100100003010010000010010001111000140110000111110261016411698163000666610000401007005470054700547005470093

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002570047525101000101700356971459692254001430013100023001010000615002334267014966970700507003563403363682400103002020000600201000070050371140021109101000030010100000101000000100000090100001100252049945698143000396610000400107003670051700517003670051
5002470088524001400101700456971959692254001430013100013001010000615156334204604966970700507005063418363682400103002020000600201000070050371140021109101000030010100000101000001100000015100000100252047844698173000066010000400107005170051700517004870051
5002470055524000000101700356971459677254001030013100013001010000614994334204604966970700507003563418363682400103002020000600201000070050371140021109101000030010100000101000301100000090100001000259547845700673000396910000400107005170036703957044670051
500247010053000003270070035697145969225400103001310001300101000061521633438701496695570050700356342636371840010300202000060020100007005037114002110910100003001010000010100000010000003100001000252047844698013000366910000400107005170051700517005170036
5002470097525000000100700356971459690254001030010100013001010000615165334129504966955700507005063418363718400103002020000600201000070047371140021109101000030010100000101000001100000087100000000252047854698173000306910000400107003670036700517005170051
5002470065525000000001700356971459692254001430013100013001010000615144334204604966970700357005063403363718400103002020000603981000070047371140021109101000030010100000101000000100003096100001000252049944698173000360010000400107005170036700517003670036
5002470056524000000100700326971459689254001030013100003001010000615153334204604966970700507003563418363718400103002020000600201000070050371140021109101000030010100000101000000100000012100001100252049944698143000099910000400107005170051700487005170048
50024700505240011001007003569714596892540014300131000130010100006151443342046049669677003570035634183637184022930020200006002010000700473711400211091010000300101000001010000011000090117100001100252047844698173000366610000400107004870036700517003670048
50024700925250000001007002069714596892540010300131000130010100006151383342046049669707004770050634153636824001030020200006002010000700353711400211091010000300101000001010000011000000141100001000252047864698173000366910000400107005170090701117005170036
50024700775250000000017003569718596892540014300131000130010100006151653341899049669707005070035634183636824001030020200006002010000700503711400211091010000300101000011010000011000000174100001100252047844698173000399610000400107003670048700997003670051

Test 3: Latency 2->3 (operand 2, x1, fed back into operand 3, the base register x6; with chain penalty)

Chain cycles: 3

Code:

  ldpsw x0, x1, [x6, #8]
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502057005152400000700070036697155968625401003010310001301001000061365233421040496697107005170051634043637174010030200200006020010000700573711402011009910010000301001000011001000211100010001100001111026102641169820300031001010000401007005370052700527008570059
502047005152500010110070036697155973025401043010310000301001000061393033421040496695507005170051633883637114010030200200006020010000700513721402011009910010000301001000001001000001100000000100000010026101641169814300001010010000401007005270055700527013670036
5020470051524000000100700366971559733254010430103100013010010000613652334210414966955070035700356340436369540100302002000060200100007009337114020110099100100003010010000010010002211000200011000011011261016411698203000610101010000401007004270058700587013670058
50204700575251110035100700436981159723254010830106100023010010000613915334239404966961070057700416339436368740100302002010860200100007005737114020110099100100003010010000010010001211000100011000011112261016411698203000610101010000401007005870061700587015770058
5020470057525110002101700426972259765254010830106100013010010000613915334160714966977070057700416339436368740100302002000060200100007005737114020110099100100003010010000010010002211000101041000011011261017811698203000310101010000401007004270058700587013370058
502047005752511000210170042697225970725401043010610002301001000061391533423941496698007004170041634103637174010030200200006020010000700573711402011009910010000301001000001001000320100020004100001111026101641169808300060101010000401007005870058701427011070042
502047005752510100200170026697225977025401083010310002301001000061364633423940496697707005770057634103637174010030200200006020010000700413711402011009910010000301001000001001000220100020021100000111026101641169823300061001310000401007006170061700587011870042
5020470057525100001000700426972259779254010830103100023010010000614844334278814966961070057700576341036371740100302002000060200100007004137114020110099100100003010010000010010001211000200111000011110261016411698083000610101010000401007004270058700587016970058
502047004152411100210070042697115971125401083013110002301001000061391533416070496696107005770041634103637174010030200200006020010055700573711402011009910010000301001000001001000221100010011100001111126101641169820300060101010000401007009770058701597010870058
502047005752510100200070042698085972725401043010610001301001000061391533423941496696107005770057634103637174010030200200006020010000700573711402011009910010000301001000001001000211100010001100001111026101641169820300030131010000401007005870058700837004270058

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | 0e | 0f | 19 | 1e | 22 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002570050526000110700326971859689254001430013100013001010000615192334204300496696770047700496341503637154001030020200006002010000700353711400211091010000300101000001010000110000570010000112520137801312698143000366610000400107004870048700487004870099
5002470035525000100700326971859689254001430013100013001010000615102334129500496696770035700476341503637154001030020200006002010000700473711400211091010000300101000001010000010000470010000112520137801313698173000366010000400107004870036700367004870097
50024700475240001007002069719596892540010300131000130010100006149943341899004966967700477004763415036371540010300202000060020100007004737114002110910100003001010000010100001100006031000010254467801112698013000360610000400107004870048700487003670058
5002470049525004100700246972559689514001430013100013001010051617385334194700496697070048700496341504263720400103002020000607921000070050371140021109101000030010100000101000011000050010000102520117801211698143000366610000400107003870048700487004870082
500247004752501010070032697185968925400143001310001300101000061510233412950049669687003570051634150363715400103002020000600201000070052371140021109101000030010100000101000011000055001000011252014780126698143000366010000400107004870036700367004870105
500247004752500010070032697185968925400103001310001300101000061499433418990049669677003570047634030363715400103002020000600201000070047371140021109101000030010100000101000011000050010000002520137801312698143000366610000400107004870036700487004870071
5002470047525010110700206971959689254001030013100013001010000615102334129500496696770047701076340703636824001030020200006002010000700503711400211091010000300101000001010000110000260610000102520157801313698733000306610000400107004870088700487011270051
5002470049524000110700326971859689254001430013100013001010000615102334189900496695570035700476341503637154001030020200006041610000700473711400211091010000300101000001010000110000721210000112520117801213698143000366610000400107004870050700487004870057
50024700475240001007003269718596892540014300131000130010100006151023341899004967060700477004763415736371540010300202000060020100007003537114002110910100003001010000010100001100065001210003112520137801312698173000366610000400107004970048700487004870241
500247005052400011070032697195969125400143001310001300101000061510233412950049669687004770035634150363715400103002020000600201000070047371140021109101000030010100000101000011000057001000011252012780913698143000066010000400107004870048700487004870108

Test 4: throughput

Count: 8

Code:

  ldpsw x0, x1, [x6, #8]
  ldpsw x0, x1, [x6, #8]
  ldpsw x0, x1, [x6, #8]
  ldpsw x0, x1, [x6, #8]
  ldpsw x0, x1, [x6, #8]
  ldpsw x0, x1, [x6, #8]
  ldpsw x0, x1, [x6, #8]
  ldpsw x0, x1, [x6, #8]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020526738200007810026692200192580100100800001008000050011748870492362726737267126650036689801002001600002008000026731351180201100991008000010080000010080000438000005480000613905110116112672800080000801002670826732267322673226732
160204267312000001002671621119258010010080000100800005001168880149206692675526803665103668980100200160000200800002673135118020110099100800001008000001008000043800000388003961404351101161026728140080000801002670826732267282670826732
1602042673120000450012669220119258010010080000100800005001174628149236512672726727665403668880100200160000200800002670735118020110099100800001008000001008000043800390388003901390511011611267281410780000801002670826728267322673226708
16020426731200006610026692212121625801001008000010080000500117462804923647268252673467740366898010020016000020080000267073511802011009910080000100800000100800004380039008003800394351100161126728100080000801002672826728267322672826732
160204267352000001002671621019258010010080000100800005001174887149237572673426767665603666580100200160000200800002673135118020110099100800001008000001008000043800000080000603944511011611267281410080000801002673426708267282673226728
160204267272000045001266922112025801001008000010080000500117488714923651268822677067310366658010020016000020080000267273511802011009910080000100800000100800000800380080039613943511011611267241414080000801002673226836268032670926728
1602042672720000450002669221002580100100800001008000050011688801492365526733267396659036665801002001600002008000026731351180201100991008000010080000010080000438003803880000603943511011611267281010480000801002673226728267282670826732
1602042673120000440012671201002580100100800001008000050011748871492365227360267316654036693801002001600002008000026727351180201100991008000010080000010080000438003803880000613843511011611267281414780000801002672826728267322670826732
16020426707200004500126692011192580100100800001008000050011688801492365626736267276630036665801002001600002008000026707351180201100991008000010080000010080000438000003880038003843511011611267041410480000801002673226732267282672826708
16020426707200110101266922111625801001008000010080000500117488714923656267362673166300366898010020016000020080000267073511802011009910080000100800000100800004380039008003800043511011611267281414780000801002673226732267322673226732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002526728200100106267112112192580010108000010800005011746280492365126731267276677367118001020160000208000026731351180021109108000010800000108000043800390080039613944502031623267281410080000800102673226728267282672826732
1600242673120000441002672401211925800101080000108000050116888004923651267072670766763670780010201600002080000267313511800211091080000108000011080000438000003980039603944502021623267041410480000800102670826732267322673226732
16002426731201004400126722012121925800101080000108000050117488704923627267312670766763668780010201600002080000267273511800211091080000108000001080000438000009800000100502031636267281010480000800102672826708267322670826732
1600242673120011000126716200192580010108000010800005011688801492362726731267276673367118001020160000208000026707351180021109108000010800000108000043800380388003861394350203165326711140480000800102674026732267282672826728
160024267302000047001267042121025800101080000108000050116875404923627267312673166723670780010201600002080000267313511800211091080000108000001080000080038008000001380502061663267281010080000800102673226732267082673226732
16002426731200004400126698010192580010108000010800005011690851492364726731267316676366878001020160000208000026729351180021109108000010800001108000043800380080039613943502051656267281414080000800102673226732267322673226708
16002426727200104500126698212019258001010800001080000501169085049236272672726707667636711800102016000020800002672735118002110910800001080000110800000800380388003961390502031635267281414080000800102673326732267082673226732
16002426727200004410126699211172580010108000010800005011686270492362726707267076676366878001020160000208000026731351180021109108000010800000108000008000003880000613944502061656267281414080000800102673226732267282673226708
16002426707201004400126722001162580010108000010800005011690850492365126707267076676367118001020160000208000026707351180021109108000010800000108000043800000388003861394450202163626728014080000800102673226732267082670826708
16002426897200000101267162111925800101080000108000050117467604923627267312670766773671180010201600002080000267313511800211091080000108000001080000080000008000060394350202162326728010080000800102673226732267282672826732