Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPAL (64-bit)

Test 1: uops

Code:

  swpal x0, x1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6a | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bb | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | cf | d0 | d1 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
7200534095256251500010640100546533966290372000200020001000110049309920336773403331220002000300033794539011710011000100002000022100210000420035221498810658179483548136324213353644481348563302818991169231833720003413134137340523410134101
7200434152256151700010120100530433985290162000200020001000050649309570336833414931020002002300033804531911710011000100002000002100310000520043021519910629079953530105224169361244441352503299218966167971829020003403634038340723408134096
72004340572551518000105788000539233930290062000200020001000000049309830337073401231020002000300033785538011710011000100002000002100310000320042221498911044179683535124324167355244451250493297419142169031864320003413734013340643417534051
7200434096256201900010060000531433997289622000200020001000000049310160335683408331020002000300033852536011710011000100002000022100310000320042201523110847179653523125324165356644471456543303919035168591835420003407334104341213403534129
720043407225522140001004000053373397529081200020002000100000004931047033716341173102000200030003384453812171001100010000200000210021000032002522149321058917986354694724232355044441648513299918848167621813620003408534104340433408534068
720043413125511150001005010053223402728984200020002000100001004931072033656341083122000200030003380053781171001100010000200000210041000042004322151101068107992360495424183362144411451523307618863165891839120003410934067341093409234127
720043406225519140001006000054303398628962200020002000100000004930983033642342103102000200030003380854061171001100010000200002210021000042003302150661077917930352294924207354344531750523302318854168461855520003401234128340483404034107
720043410125518110001004010053143399928892200020002000100001004930969033670341103102000200030003378854011171001100010001200002010041000042003220149971068717954355985324195350944421352503303218897169851853020003403634137341553403434098
7200434005255162001010060000531133938289452000200020001000110049311260337933408231020002000300033872538811710011000100002000002100410000620023221514210755080173533104324174358444481748493306218814167841807520003406434107341153415834104
7200434030254181911010020000529633982289692000200020021000030649309940336133405531020002000300033858538711710011000100012000022100410000420013221497010661179663558104524227358044521250463303318969168371829120003412134113341353414434080

Test 2: throughput

Code:

  swpal x0, x1, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0069

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30209700825241106967188291704106104700557873777342277301001010020000101002000059212122069496698770058700671366278301001020020000102003000070068533112020110099100100001010010000100208894433669107921018834418820389052105211002136730403313101161170028100001010420000101007006870070700697007170071
3020470067525000689914845168873124700557963111644224930100101002000010100200005921212191549669877006170066136627530100102002000010200300007006853311202011009910010000101001000010020842041271910798102352775581234866210341932032527100113101161170031100001010420000101007007270069700687007270069
30204700705241016958127881688999670054770412610242304301001010020000101002000059212121974496698970061700681366275301001020020000102003000070068533112020110099100100001010010000100209204436716108171022924563826729302106811361135929140613101161170031100001010420000101007006970069700687006970068
30204700655250006985137911728811007004980816714442269301001010020000101002000059212122074496698970055700651366271301001020020000102003000070068533112020110099100100001010010000100208760404702107881021925259848468492098811332040130303613101161170034100001010420000101007007070071700727007070068
3020470070525000687618784152076144700477945827442287301001010020000101002000059212122335496699270063700691366273301001020020000102003000070072533112020110099100100001010010000100208520384680107491021826459858528402099911331934832703513101161170034100001010420000101007007070067700637007270068
302047006952510070921880316649414470051759189914225630100101002000010100200005921212204249671277005670069136627330100102002000010200300007006753311202011009910010000101001000010020871038664310772102132592081036823210051951532628800413101161170034100001010420000101007006770067700707006870067
3020470069525100689115807171293140700547762665842245301001010020000101002000059212122206496699270061700681366306301001020020000102003000070070533112020110099100100001010010000100208790405695107711021824961844507992098211152836928600513101161170055100001010420000101007006970069700727006970070
3020470072525000721820819168061196700518202815542312301001010020000101002000059212121970496698870058700701366277301001020020000102003000070068533112020110099100100001010010000100208680411731108001024423054888508132102311031731928800413101161170036100001010420000101007006970071700687006470069
3020470069525000693719814167266132700547901998142305301001010020000101002000059212122147496698970058700661366271301001020020000102003000070068533112020110099100100001010010000100208614401733108361022326456790388832100411002340932400513101161170035100001010420000101007007270072700717006670069
302047006952510070011779015207734870049778498114422873010010100200001010020000592121221294966988700607006913662763010010200200001020030000700685331120201100991001000010100100001002088753956891078310245248188644287621028111412410286401213101161170028100001010420000101007006970070700697006970065

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0064

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c3 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
300297007852510110006873207512320652327004980810818242409300101001020000100102000058829122009496698470054700641366294300101002020000100203000070065533112002110910100001001010000110208713405791107961021223633149674896210061134183623353914012702151170031100001010420000100107006570066700667006670067
3002470062525111000067161375234246620470048752137810242377300101001020000100102000058829122200496698470067700721366292300101002020000100203000070067533112002110910100001001010000010208833449716108141022223327553214486021008115919434338329012701151170030100001010420000100107006470064700647006470064
300247006252410000006901257752552613927005075716919442374300101001020000100102000058829122085496698470067701201366295300101002020000100203000070064533112002110910100001001010000010208710421792108061021623718652454786209722140184493090244012701152370025100001010420000100107006270066700687006770065
300247006452500000006781177972528693287005074014808042384300101001020000100102000058829122259496698170066700741366295300101002020000100203000070064533112002110910100001001010000010208460477735107671020922126454098797210811133153872950020012701152370029100001010420000100107006470065700647006470067
30024701115250000000694025822243232252700507761582108423943001010010200001001020000588291221594966984700687006413662953001010020200001002030000700645331120021109101000010010100000102086804257331084110224247404524112791210511130213822910014012701152170030100001010420000100107006770066700647006670066
300247006552400000006666237982504454007005271818768642352300101001020000100102000058829122053496698370066700721366297300101002020000100203000070064533112002110910100001001010000010208680415736107971023023738253652845209721155244013010010012702151270030100001010420000100107006370067700657006670067
30024700645240000000683120789247246264700507951688854238530010100102000010010200005882912212749669837006870067136629830010100202000010020300007006353311200211091010000100101000001020845043472710821102162283115166088820969115026395341007012701152270028100001010420000100107006570065700657006470066
300247006152500001006746257572224352447005276612911114237630010100102000010010200275882912225049669857006570075136629530010100202000010020300007006553311200211091010000100101000001020864347874710819102042263444287491021070112813384350304012702151170029100001010420000100107006770063700657006770064
3002470065524100000069103378724405823670048750121071044237230010100102000010010200005882912195649669857006770073136629530010100202000010020300007006153311200211091010000100101000001020844340374910779102052153353846880520973115116411289308012702152170029100001010420000100107006570065700667006770065
30024700645251100000676529772250452292700517701895974240030010100102000010010200005882912195449669857005770064136629530010100202000010020300007006653311200211091010000100101000001020879346973910786102132313105207891321061112922421316304012701153370031100001010420000100107006670065700657006370067

Test 3: throughput

Code:

  swpal x0, x1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 18.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | 7b | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2020518005313491110000244691210018003801091601502010010020000100200005001068560491769651800941800872617749820100357520020000200300001800532726111020110099100100001001000001002002424020245401001202143813427114331121212232111716764001600179882000020000100180054180054180054180054180054
20204180053134911100002441412102180038210101598282010010020000100200005001067870491769651800531801102617750720100335420020000200300001800452718211020110099100100001001000001002002323122224216100120114260345191440212120231111716797001600179882099020000100180054180046180054180054180054
202041800451348110010024318120001800301011160035201001002000010020000500107045049176973180053180108211177507201003396200200002003000018005327261110201100991001000010010000010020025251220242701001202142033400814165121212230111716760501600179882099920000100180054180054180054180054180046
20204180053134811100042394812002180038010916065120100100200001002000051110771204917696518015418004826177507201003706200200002003000018005327261110201100991001000010010000110020022241221241261001202139173407914368121212230111716743001600179874090020000100180046180055180054180054180054
202041800531349111000023864120021800383901604972010010020000100200005001076820491769731800531800952617753820100365320020000200300001800942726111020110099100100001001000001002002524122224475100120214369344581459912012220111716785601600179882099920000100180054180054180054180046180054
2020418005313481100000242641200118003821091604392010010020000100200005001075980491769651801511800592617750720100398620020000200300001800532726111020110099100100001001000001002002323022242041001211142753435514191121212230111716763101600179882099020000100180054180054180054180046180054
2020418005313491100000241211200418003120016002820100100200001002000050010653304917697418013818005626177499201003473200200002003000018005327181110201100991001000010010000010020024240222440910012011439334509143841200230111716766101600179882009020000100180054180054180046180054180054
2020418005313491100000242631200018003000915990220100100200001002000050010655204917696518005318005326177507201003601200200002003000018005327261110201100991001000010010000010020025241222241871001200143463447714554121212222111716775801600179874099020000100180054180054180046180046180054
2020418005313491100000242871200218003811010159793201001002000010020000500106532049176976180053180053261774992010035312002000020030000180053272611102011009910010000100100000100200242300244481001200145193456314334121212240111716761501600179874009920000100180054180054180054180054180054
20204180045134811100002418612101180038011016051020100100200001002000050010754704917697318004518005326177499201003779200200002003000018005327261110201100991001000010010000010020023231222241851001201140653413914194121212231111716760101600179882099920000100180046180046180054180054180054

1000 unrolls and 10 iterations

Result (median cycles for code): 18.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | 7b | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002518005313481101026804121118003800716480020010102000010200005010001804917697318005318004523177533200107202000020300001800532744111002110910100001010000110200252412026695100120216815367401671912120230064010116316331798640902000010180046180054180054180054180054
200241800531348100002673912111800382801647992001010200001020000501000170491769651800531800532317752520010720200002030000180045273611100211091010000101000001020023231220267101001201168093675416760121212240064010127316231798729092000010180046180046180054180054180054
200241800451377110102681512121800381871648202001010200001020000501000050491769731800531800532317753320010020200002030000180045274411100211091010000101000001020023231220267591001200167793671016734121212241064010090316331798720992000010180046180046180054180054180054
200241800531349110002680912001800302781648482001010200001020000501000031491769731800891800532317753320010020200002030000180053274411100211091010000101000001020023250026753100120116812367411672812120240064010120316331798720092000010180049180054180099180595180055
2002418007713491020026754121118003808016479720010102000010200005010000404917697318005318005323177533200100202000020300001800452736111002110910100001010000110201262301926740100722116816367431676312120231064010096316331798729002000010180054180122180062180054180054
200241800531349110002673512021800300701647982003410200001020000501000040491769751800531800532317753320010020200002030000180053273611100211091010000101000001020024231220267891001202167403675016789121212241064010162316431798720992000010180054180054180046180046180054
20024180053134810100267621201180030260164796200101020000102000050100005149176973180045180045231775332001002020000203000018005327441110021109101000010100000102002623121926765100120116785367091673112120241064010162316331798729992000010180054180108180083180061180054
200241800451348101012673612021800382071647962001010200001020000501000050491769731800531800532317753320010020200002030000180053274411100211091010000101000001020023231220267601001200167703678716800121212232064010156316311798729992000010180054180054180111180054180046
20024180045134910100267331200180038207164788200101020000102000050100005149176973180053180053231775332001002020000203000018005327441110021109101000010100000102002324120267321001202168183674916765121212231064010097316331798729992000010180046180054180086180600180054
2002418004513491010126774120018003000816479620010102000010200005010000504917696518005318005323177533200100202000020300001800452736111002110910100001010000110200242301926762100120016786367771676212120231064010131316311798649092000010180054180152180079180054180054