Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (signed offset, 32-bit)

Test 1: uops

Code:

  ldp w0, w1, [x6, #8]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2005394300451013862120192510001000100014075139437492312710002000100039835111001100010000100001038010386040073316113951410710001000399399375395399
2004398210440013842011925100010001000150651374398763127100020001000398351110011000100011000431039010396104473116113951010010001000399399395375399
2004398300010035921211925100010001000152881398394723127100020001000398351110011000100001000431039010390139447311611395014710001000375395375399399
2004374200450003592120025100010001000152841398398923131100020001000374351110011000100001000431000010380138073116113951414710001000375399395375399
2004397311440003790112192510001000100015288137437496310710002000100039435111001100010000100043100001038013807311611395140710001000395395395375377
20043942004510137920016251000100010001407513743987231311000200010003983511100110001000010004310393810386138073116113951010410001000375375399375375
20043983004410038320102510001000100015294139837472310710002000100039835111001100010000100043103838103861384373216223951410410001000399399399375399
20043743004400135921102510001000100014075139839472313110002000100037435111001100010001100043103841103861007311611391140710001000399375399400375
200437421144101359211025100010001000152841398374923131100020001000374351110011000100001000431000421038010447311611391140710001000399375399399375
20043982014500038421201925100010001000152881398374923131100020001000398351110011000100001000010393810396104473116113711410010001000375395399395399

Test 2: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp w0, w1, [x6, #8]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205700475250011011070032697145970025401043010310001301001000061365233420560496697370035700356340536369540100302002000060200100007005037114020110099100100003010010000010010000110000001000010100261026411698133000366010000401007005170051700487004870051
50204700505240000010070035697145968425401043010310001301001000061391533413040496706170048700556338836371040100302002000060200100007003537114020110099100100003010010000010010000110000101000010000261016411698133000366910000401007005170051700517005170051
502047003552500000281070032697115968425401043010310001301001000061391533420560496697370051700536338836370740100302002000060200100007005037114020110099100100003010010000010010000110000131000010100261016411698133000366010000401007005170051700367005170048
502047005052500000100700356969859694254010030100100003010010000613915334215204966967700527005363403363710401003020020000602001000070050371140201100991001000030100100000100100001100001201000010100261016411698133000390910000401007004870048700537005170036
50204700505250000010070035697145970025401003010310000301001000061391533420560496696770037700516338836371040100302002000060200100007003537114020110099100100003010010000010010000110000001000000100261016411698183000006010000401007005170051700517005170051
50204700355240010011070020696985970025401043010010000301001000061365233420560496704370054700506338836371040100302002000060200100007004737114020110099100100003010010000010010000110000001000010100261016411698103000396010000401007005170051700537005170051
50204700505250000011070035697145968425401003010310001301001000061365233413040496704970055700476338836369540100302002000060200100007003537214020110099100100003010010000010010000010000001000000100261016411698103000390010000401007005170051700367003670036
502047005052400004101070020696985970025401003010310001301001000061391533420560496696770062700386340036369540100302002000060200100007005037114020110099100100003010010000010010000110000001000010100261016411698133000006010000401007005170036700517005170048
50204700475250001001070032697115968425401003010310001301001000061391533420560496696770036700926340336371040100302002000060200100007003537114020110099100100003010010000010010000010000001000010100261016411697983000366910000401007005170120700517003670036
50204700505240000011070035697145970025401043010310001301001000061365233420560496696770047700516340336370740100302002000060200100007005037114020110099100100003010010000010010000110000001000010000261016411698133000399010000401007005170036700517005170036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002570057525110000070036697155969325400143001310001300101000061514533420950049669717005170051634033637194001030020200006002010000700353711400211091010000300101000001010000110000003100001100252127822698183000310101010000400107005270052700527005270052
500247005152401001107003669715596932540010300131000130010100006149943342095004966971700517005163419363719400103002020000600201000070035371140021109101000030010100001101000011000000010000110025212782269818300031001010000400107005270036700527005270052
5002470035524010011070036697155969325400103001010000300101000061514533412950049669717005170035634193637194001030020200006002010000700513711400211091010000300101000001010000110000000100001100252137832698183000310101010000400107005270053700537005270052
5002470051525000011070036697195967725400143001310001300101000061514533420950049669557003570051634193637194001030020200006002010000700513711400211091010000300101000011010000010000000100001100252127823698183000010101010000400107003770052700527005270052
500247005152500001107003669719596772540014300101000130010100006151453342095004966971700517005163419363682400103002020000600201000070051371140021109101000030010100000101000011000001010000100025212783269818300031001010000400107008170053700407005270052
5002470051525000000070020697155969325400103001310001300101000061514533420951149639417005170051634033637194001030020200006002010000700513711400211091010000300101000001010000110000003100001100252127832698193000310101010000400107005270056700367005470058
500247005152510001107002069715596932540014300131000030010100006151453342095004966971700517005163419363719400103002020000600201000070051371140021109101000030010100000101000001000000010000110025212992269818300031001010000400107003670052700367005270052
500247005152500001007003669715596932540014300101000130010100006151453342095004966971700517005163419363719400103002020000600201000070051371140021109101000030010100000101000001000000310000010025212783269801300031010010000400107005270036700527005270052
500247005152510001107002069716596932540014300131000030010100006151453342095104966955700517005163419363719400103002020000600201000070051371140021109101000030010100000101000011000000010000110025212992269818300031010010000400107005270052700527005270052
5002470051525000011070036697155969325400143001310001300101000061514533420951049669717005170051634193637194022030020200006002010000700513711400211091010000300101000011010000110000003100001000252129932698183000310101010000400107005370052700527005270052

Test 3: Latency 2->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp w0, w1, [x6, #8]
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205700575250011001007002069715597032540100301031000130100100006139303341304149669747005470051634040363714401003020020000602001000070035371140201100991001000030100100001100100000110000000100001002610264116981430003010010000401007005570055700557004070036
5020470054525000000010700396971559700254010030103100003010010000613888334225014966974700547005463407036369540100302002000060200100007005137114020110099100100003010010000010010000001000000010000110261016411697983000013131010000401007005570055700367005270055
502047003552500000011070020697195970325401003010010001301001000061388833422501496697470051700546340403637144010030200200006020010000700543711402011009910010000301001000001001000001100000001000011026101641169798300030131310000401007005570060700367005270036
502047003552500110000070036697155970325401003010010001301001000061393033421041496697170054700356340403637144010030200200006020010000700543711402011009910010000301001000001001000001100000001000011026101641169817300031301310000401007005570055700527005570036
5020470035525000000000700206971559703254010430103100013010010000613888334225014966974700547005463388036369540100302002000060200100007011637114020110099100100003010010000010010000011000000010000110261016411698173000310101010000401007005570052700527003670036
502047003552400100010070039697155970325401043010310001301001000061367933413041496697470051700516338803636954010030200200006020010000700353711402011009910010000301001000001001000001100000001000310026101641169798300001001310000401007003670055700367005570036
50204700355250000000017002069698596842540104301031000130100100006139303342250149669557005170035634070363695401003020020000602001000070051371140201100991001000030100100000100100000010000000100000102610164116979830003013010000401007005270055700557005270055
5020470035524000000100700606973059684254010430100100013010010000613888334225014966955700747010763408036369540100302002000060200100007005137114020110099100100003010010000010010000011000000010000100261016411698173000313131310000401007005570055700367003670055
502047005152400000010070020697195970325401003010010001301001000061388833422501496697470054700516340403637144010030200200006020010000700513711402011009910010000301001000001001000000100000001000010026101641169798300241001310000401007003670052700527003670036
502047003552500000010170036696985968425401043010310001301001000061393033413041496697470054700356338803637144010030200200006020010000700353711402011009910010000301001000001001000001100000001000011026101641169817300001313010000401007003670106700657003670036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002570057525101000020100700396971559693254001030013100023001010000615481334129500496697307005670035634193637194001030020200006002010000700513711400211091010000300101000011010000011000000001000010000025220107800055698183000310101310000400107003670055700367003670052
500247005152500000001000070036697195969325400143001310001300101000061514533420950049669710700547003563422363719400103002020000600201000070054371140021109101000030010100000101000001100000000100001010002520067800055698183000313101010000400107005270055700527005270055
50024700545240000000200017004269724597102540018300131000130010100006155083342398014966977070058700596342936373940010300202000060020100007006037114002110910100003001010000010100011010001000110000111110252006640005469820300060101310000400107005870042700587005870061
500247006052510100002010170042697225971325400183001610002300101000061550833416140149669770700417005763428363742400103002020000600201000070060371140021109101000030010100000101000331100010014100001111002520056400076698203000610101010000400107005870061700617006170061
50024700575251010000201017004269725597132540018300161000230010100006151633341614004966961070041700576340936374240010300202000060020100007005737114002110910100003001010000010100044110003011110000111100252005640005569820300061301310000400107006170058700587005870058
500247005752510000003010070042696975969525400143001610002300101000061550833423980049669800700607005763425363739400103002020000600201000070057371140021109101000030010100000101000231100011121100001111102520076400055699203000610101010000400107006170058700617004270061
500247005752510100002000170042697225971025400183001610002300101000061550833423980049669770700577005763425363742400103002020000600201000070041371140021109101000030010100000101000111100020001100000111002520056500055698053000610101010000400107006170058700587005870042
500247005752511100002000170042697225969525400143001310002300101000061553533423980049669613700607005763428363722400103002020000600201000070057371140021109101000030010100000101000111100030001100001111002520066400055698203000610101010000400107005870058700587005870061
5002470060525111000020100700206971959693254001430013100013001010000615145334209500496697107005170056634033637364001030020200006002010000700353711400211091010000300101000001010000011000000001000010000025200599000556980130003001310000400107005270052700557005270036
50024700545250000100000007003969719596932540014300101000130010100006151453342095004966955070051700356341936368240010300202000060020100007005137114002110910100003001010000110100000110000000010000001000252015170006569818300032523610000400107003670036700557003670052

Test 4: throughput

Count: 8

Code:

  ldp w0, w1, [x6, #8]
  ldp w0, w1, [x6, #8]
  ldp w0, w1, [x6, #8]
  ldp w0, w1, [x6, #8]
  ldp w0, w1, [x6, #8]
  ldp w0, w1, [x6, #8]
  ldp w0, w1, [x6, #8]
  ldp w0, w1, [x6, #8]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020526736200100000017601032671701818162580100100800001008000050011675300492365226735267366654064681180100200160000200800002671435118020110099100800001008000001008002020420800571015980038615742192051102162226740090280000801002673326733267332673326733
16020426732200111000083000226717218181525801001008000010080000500116994104923652268302673767550366908010020016000020080000267323511802011009910080000100800000100800192000800541015980000005742192051102162226729009280000801002671526733267332673326733
16020426732200101000010101032670020181525801001008000010080000500116753014923634268352673866700366908010020016000020080000267323511802011009910080000100800000100800192000800570005980038605742190051102162226711090280000801002673326733267332673326733
160204267322001110000390003266990181815258010010080000100800005001169154049236522683126737666803669080100200160000200800002673235118020110099100800001008000001008002020420800571022180038605742191051102162226729099280000801002673326733267332673326733
16020426732200111010080010226717218181525801001008000010080000500116753014923641272122672466590366988010020016000020080000267323511802011009910080000100800000100800201942080057102598003861570190051102162226711099280000801002673326733267332673326733
16020426732200101000098010326717218181625801001008000010080000500116737104923652268092673866571173683080100200160000200800002673235118020110099100800001008000001008002019420800571015980038615642191051102162226729099280000801002673326733267332671526736
16020426714200101000083000226699318181525801001008000010080192500116875704923652268382674166920366908010020016000020080000267323511802011009910080000100800000100800202100800561016280000015742191051102162226729090280000801002673326733267332673326733
16020426732200111000065010326717018181525801001008000010080000500116753014923652267712674166570366908010020016000020080000267323611802011009910080000100800000100800201900800191012180039615742190051102162226729090280000801002673326733267332673326715
160204267142001110000920103267172181816258010010080000100800005001175054049236522682026738666103667280100200160000200800002673235118020110099100800001008000001008001920420800571005980038601942190051102162226711099280000801002671526733267152673326733
16020426732200111000065001226717218181725801001008000010080000500117175314923652268382671966610366908010020016000020080000267323511802011009910080000100800000100800201942080019100688003861570191051102162226729099280000801002673326733267152673326733

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d2 | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160025267232010000451032670121818162580010108000010800005011701071492365226714267146677367128001020160000208000026732351180021109108000010800000108000003980039000080039613543000502001160112671900080000800102670826708267282672326728
160024267272000000450002671521201625800101080000108000050116753014923647267072672766773668780010201600002080000267273511800211091080000108000001080000039800000003580000010430005020011601126719100480000800102673026740267362670826728
160024267222000000411012672520181625802701080000108000050117010714923627267272672266685268288001020160000208000026727351180021109108000010800000108000000800000003980000613539000502001160112671990080000800102670826728267282670826728
16002426707200001001012671720121625800101080000108000050117498914923652267272672766723668780010201600002080000267273511800211091080000108000001080000039800390003980000603500005020011601126724106280000800102672826728267282670826708
160024267272000000000226712212181625800101080000108000050116875414923647267072672766723670780010201600002080000267273511800211091080000108000001080000039800350103980035000390005020011601126724106080000800102672826728267082672826728
160024267072000000000226707201216258001010800001080000501168754049236472673226722665336707800102016000020800002672735118002110910800001080000010800000398003900059800386019421910502001160112672499280000800102673326733267332672326728
16002426732200000045001267122000258001010800001080000501173183149236472672726707666836707800102016000020800002670735118002110910800001080000010800192008005710121800396035430005020011601126719106280000800102670826708267282672326708
16002426722200000045001268412121816258001010800001080000501168754149236272672726707666836687800102016000020800002672235118002110910800001080000010800000080000000398003961350000502001160112672449080000800102672826728267282672826733
16002426727200000045002267122120112580010108000010800005011746281492362726727267276678367078001020160000208000026727351180021109108000010800000108000003980039010398003960000005020011601126719106480000800102672826723267232672326728
1600242672720000104510226823200162580010108000010800005011688801492364726727267226668368548001020160000208000026727351180021109108000010800001108000003980039000080039610430005020011601126724106080000800102670826728267082672326728