Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD4 (multiple, post-index, 4H)

Test 1: uops

Code:

  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 7.008

Integer unit issues: 1.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 4.008

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
66005293602202601210117104668288230001695570081000400820001000400020005000100004752452297129065292393107000200040003000800029242291261161001100010001200062002002004400412890917768243063106920064310838181060572838210001564613374143972000400010002931229359294012924429389
6600429357219220020000400454728850020169477008100040082000100040002000500010000476045229562908429235310700020004000300080002926429190116100110001000120004200000200040061279393166842327595820262326338101667612839510001583013138143102000400010002924829504293962940829305
6600429269219220021000510457428959001168977008100040082000100040002000500010000476083230212926129383310700020004000300080002921629142116100110001000020004200000200040041292192636837322876220130312638131464562843810001635113222142942000400010002927029461293152936429324
66004292772202000210006004674288700001687970081000401220001000400020005000100004756422299029099292623107000200040003000800029213292551161001100010000200062000002004400612857943970283233106620172304438151754582849310001593113266144182000400010002927329381291712933129383
66004294142202000230006104557288900001682770161000401620001000400020005000100004752622296929155292523107000200040003000800029221291701161001100010000200062000022002400612897970270293066165420079313638131266582842310001628312963144022000400010002939729237293332936029283
6600429159220240021000810469129023020168387008100040122000100040002000500010000475645229672909529348310700020004000300080002922729130116100110001000020006200200200040001293892086856325577020172305238151760602841010001634513249142862000400010002933829426292332930529244
6600429266219190023000600457828819020168457012100040002000100040002000500010000476252230222906629306310700020004000300080002919529167116100110001000020000200010200040261293191656874306586720087310038111455642843110001631313204144822000400010002917029323293272943029269
66004294042202400210002104880289470001702370121000401220001000400020005000100004757852293529095293253107000200040003000800029126291881161001100010000200002000122004404412862926868263096116519975306738201659622844110001633713263144072000400010002929029369293272942729268
66004292612212600180000004567288090021700070121000400820001000400020005000100004759052294429167293903107000200040003000800029152291871161001100010000200002000022000404613092916870283035116120100303738181658602842510001643913150140312000400010002928629237291942927529406
66004292262202400250018004566289120221686270001000400820001000400020005000100004740872295829124293383107000200040003000800029112291111161001100010001200042000002000400412988915070703053107520087302938141757622848210001630412998143652000400010002930529392293562933429312

Test 2: throughput

Count: 8

Code:

  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0009

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
48020580069600101100056000280032277502556013480106320064160008801083200321600164805379611965760268108005008006980069307405601482001605503200322002400246400648006980070118020110099100100800008000001001600121143160013000101600026150431111115116016018006680006013160000320000801008004980048800708007080275
480204800696001010100550102800542605812556021080106320076160008801083200241600164805379612321088092000800290800698006900954056014820016001632003220024002464006480069800691180201100991001008000080000010016001311431600520111031600416050011111151171160080676800061313160000320000801008007080070800488007080070
480204800486001000000560002800332065025560174801063200761600088010832002416001648053896119257602680080050080069800480074056014820016001632003220024002464218480069800751180201100991001008000080000010016001312431600531001116004161504311111151160160080045800061313160000320000801008007080049800488007080070
4802048006959910010005601028005420650255601908010632007616000880108320024160016480537961192576026810800290800478006900617560148202160016320032200240024640064800538004811802011009910010080000800000100160012114316005203150160041005043111111511611620800668000600160000320000801008007080070800498004880048
48020480069599100000011000280054077002556013880106320096160008801083200241600164805379611965440284108005008006980069307405601562001600163200322002400246400648005080060118020110099100100800008000001001600151243160053121101600020150011111151160160080045800061313160000320000801008007180070800708004980049
48020480069600112100023010180033066502556021080106320876160008801083200241600164805379611961088092010800280800488006930719560148200160532320032200240219640064800478004811802011009910010080000800000100160013104316001404310160002614943112111511611600800458000600160000320000801008078780070800708007080070
4802048006960010100001101038003327700255602108010632009616052880108320024160016480537961196108810561080050080069800483064056014820016001632003220024002464006480079800691180201100991001008000080000010016001411431600521001116000200104310011151170160080066800061313160000320000801008004980070800708007080049
4802048004859911011005501028003226000255601788010632002016000880108320024160016480538961182108809240080050080069800693029195601482001600163200322002400246400648007280069118020110099100100800008000001001600141143160052001111600026050431101115117065008006680006013160000320000801008007080070800488007080049
48020480069600100000011000380054000502556019080106320076160008801083200321600164805379601135760268008005008006980069007195601482001600163200322002400246400648006980069118020110099100100800008000001001600131101600131025016056161504311011151171160080066800061313160000320000801008007080070800708007080070
480204800696001001000560003800332665025560138801063200201600088010832002416001648053796011357602680080050080069800693071856015620016001632003220024002464006480054800691180201100991001008000080000010016001513431600120005016004100504310011151170160080066800061313160000320000801008007080070800498004980070

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0009

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4800258006960011100001101008005406002556003480010320248160000800103200001600004800489611075760016008005080069800690033056001020160000320000202400006400008006980069118002110910108000080000010160012124301600490012021600396050431100005019000617538006680000000160000320000800108007080049800708007080048
4800248006959911000005600028005400652556007480010320096160000800103200001600004800499611125440032108005080069800690035256001020160000320000202400006400008004780069118002110910108000080000010160011110016005010050160039015043111000501900051653800668000013130160000320000800108007080070800708007080070
480024800696191010000560100800542705255600308001032002416000080010320000160000480048961101108807521080050800698006930329560010201600003200002024000064000080047800691180021109101080000800000101600111143016004900050160039605043111000501900031635800668000013130160000320000800108007080049800498007080049
480024800476001010000560102800542666255600348001032009616000080010320000160000480049961112576001610800298006980069003305600102016000032029220240000640000800698004811800211091010800008000001016001212001600100034916003901494310100050190005163580066800000130160000320000800108007080049800708007080070
480024800696001011100560002800542700255601068001032009616000080010320000160000480048961112108807521080050800698006930329560010201600003200002024000064000080069800691180021109101080000800000101600101100160010001101600396149011100050190007165580044800001300160000320000800108004880070800498004980070
480024800696001000100110002800542665255600868001032002416000080010320000160000480049961108576001610800508004780048003525600102016000032000020240000640000800698004711800211091010800008000001016001111001600501115016003960504311000050190005165380044800001300160000320000800108007080070800498007080070
4800248006960010000005500018003300652556008680010320076160000800103200001600004800499600295760016108005180069800473035256001020160000320000202400006400008006980047118002110910108000080000010160012114301600490021016003901500101000501900051653800668000013130160000320000800108004980049800488007080049
4800248006959910100001000038003300652556007080010320024160000800103200001600004800489600295760016108005080048800690035256001020160000320000202400006400008006980048118002110910108000080000010160011110016005101011160039615043112000501900041645800668000013130160000320000800108007080049800708004980070
4800248006960011100001101028005420652556008680010320064160000800103200001600004800489611385760016108005080048800690035256001020160000320000202400006400008006980048118002110910108000080000010160011114301600111004916003901504311100050190005165580066800000130160000320000800108007080070800498007080049
48002480048599111014410000380054066025560074800103200761600008007332051616000048004896110810880752108019080069800693046533056001020160136320000202400006400008004880069118002110910108000080000010160011110016004900049160039615043111000501900051653800458000013130160000320000800108007080070800708007080070