Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPAH

Test 1: uops

Code:

  swpah w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 3a | 3f | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d1 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
720053408625521201211100100410526633865029054200020002000100000049311190337653408431020002000300033941535711710011000100002000020100110000042003202200148931060707988355115424151358144521448543300819185167851834020003400934087340983404234011
72004342042551030030000010020053923385002897420002000200010000104931036033811340393102000200230003392653511171001100010000200002210011000004200230200015071106740800436141502427635524455953543306319172166561825520003398834078341343405934085
720043404925520300300000100900536133926029006200020002000100010049309900337723407031020002000300033935538711710011000100002000022100210000062003202200149951054118005360705124132360044541352543299619097165211836320003411234088340513411234054
720043405925500400400000100500524533991028921200020002000100011049310680337933406531020002000300033913539911710011000100002000022100210000022001202200149311063017965358315524209357744451455543301619190168151847120003405334131340603403734057
720043415625620300401111100410534233857028962200020002000100010049310160337763416131020002000300033984539711710011000100002003322100410010172002212221149411071317967358005224111357344531753543301418932167421827520003405734037341163409734009
720043402925501201201000100110532533928028980200020002000100020049309820337653409831020002000300033979537311710011000100002002402100410010232002310020149961054708025358805424227355844571550503320719050167861828420003413734085340443406434104
720043404225501301110000100510538633830028983200020002000100000049311210337503406031020002000300034004535011710011000100002003222100210010110200131222115007105970795935540512423035344450958503302719117168071822420003408634032341193409234098
720043407425601301100000100610530933929029012200020002000100000049309810337613412131020002000300033977539811710011000100002002322100410010262002310220148921062907977357004924092357044591851563296519048167871847520003408534098342183405434112
720043416225601300410100100610530233919029091200020002000100001049310300337543409231020002000300033962536011710011000100002003222100510010232002312022149711065317980354204924206362444491255513305119080167381816620003412134147341163408434109
720043411925501401210100100410538433975029014200020002000100004049310040338213413631020002000300033881539011710011000100002004220100410010132002310221150431049717982352515324079361244501651533301319164168041829820003414834026341143404234065

Test 2: throughput

Code:

  swpah w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0124

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 20 | 22 | 23 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020630210226101013466448101403010507232838583010010100200001010020000592121218131492703530137301291326328301001020020000102003000030119251112020110099100100001010010000110020037659831006710017163140542014734035563400131011611301221000018121220000101003015830152301453015030137
30204301222251110034345431080301012941312383230100101002000010100200005921212175214927052301673014313263433010010200200001020030000301102611120201100991001000010100100000100200354557910095100073737349020092444549474001310116113013210000511720000101003009530102300893010430097
30204301052251110033975261012030086295131238293010010100200271010020000592121217350492704730115301441326327301001020020000102003000030118261112020110099100100001010010000110020030559701007910010162522472010633946254414131011611301271000022141420000101003012130126301173010830110
3020430135226101003420433101203010735513143847301001010020000101002000059212121794049270503014630141132630530100102002000010200300003009626411202011009910010000101001000001002004353960100641000835273670200913353506141013101161130091100001917920000101003012430116301153009030098
302043012922611100342444110141230133248221738413010010100200001010020000592121217790492705830125301561326333301001020020000102003000030148261112020110099100100001010010000010020038462651008910011144524532010533454854403131011611301181000020151120000101003014130105301103012430113
30204301312261100034324311013030094237141138573010010100200001010020000592121217761492704730155301181326327301001020020000102003000030108261112020110099100100001010010000010020031448941007410007363928702008833435837410131011611301101000010121120000101003012930085301233013930111
3020430102225111003396529102103010241416163831301001010020052101002000059212121781149270413014630122132633830100102002000010200300003011026111202011009910010000101001000011002004057366100861001516511280201092515626340313101161130112100001816520000101003012130128301073010030114
30204301012261010034449451014030097195893857301001010020000101002000059212121775149270253011830100132632130100102002000010200300003012025711202011009910010000101001000001002003945560100951000915473674201173354497340013100161130164100002415620000101003016430124301393013330148
30204301512261100134815441020030120498232339053010010100200001010020000592121216761492706330198301291326351301001020020000102003000030133257112020110099100100001010010000010020042585831009210007176036582012534466765411131011611301061000021131220000101003011830113301083011030124
30204301062261001034334311011030130514151238563010010100200001010020000592121217131492701830118301151326337301001020020000102003000030118261112020110099100100001010010000110020023539501004710005153324622010633244946401131011611301621000021131220000101003010030098300953010030087

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0117

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9e | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | c2 | c3 | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30026301132261002113418126101236301112494035384830010100102000010010200005883012173504927043301143014113263493001010020200001002030000301172571120021109101000010010100000110200370117114100941000311056448520106555163770012700116113013110000138720000100103012330253301233011730128
30024301162250000003427138101536301002472212384730010100102000010010200005883012175004927037301263013713263563001010020200001002030157301032571120021109101000010010100000010200440689210083100043164230632009924217776001270011611301231000075620000100103014830126301233012330119
3002430110225100000351634000170301204973924384330010100102000010010200005883012173404927054301333013013263633001010020200001002030000301332571120021109101000010010100000010200380618110094100042836306420164350285550012700116113011710000169720000100103011930114301203013130118
3002430127225000000347844002412323012252113528382130010100102000010010200005883012178704927029301033012613263513001010020200001002030000301392591120021109101000010010100000010200281293137100931000639502261201133551656000127001161130114100001311820000100103013130123301223011730114
30024301222251000003485156001703010738618183830300101001020000100102000058830121738049270193010530113132634830010100202000010020300003011825311200211091010000100101000000102003212857410101100031736345820153156097740012700116113010010000910720000100103012930122301293010530119
3002430137225000000342705610110300955363347383230010100102000010010200005883012173704927018300973009513263353001010020200001002030000300972571120021109101000010010100000010200541255721010610003493644592009625821099800127001161130116100001410720000100103012430118301173011830131
3002430212226000000344725110110300983571827383830010100102000010010200005883012172904927049301113011313263373001010020200001002030000301112571120021109101000010010100000010200390749510104100112103630672009325639312100127001161130114100001712920000100103013730118301193012730116
3002430117225000000345624210142830091526262338493001010010200001001020000588301217380492701230103300961326339300101002020000100203000030092257112002110910100001001010000001020028121141111011610007310404683201411462808200127001161230115100001310820000100103013530131301343013930125
30024301292250000003448138101012300852972522385530010100102000010010200005883012173604927041301293012513263453001010020200001002030000301142571120021109101000010010100001010200330928010092100082930694200822604826101127001161230115100001911720000100103011030132301233012430118
30024301412250000003455436101503008735739193837300101001020000100102000058830121726049270303011730122132634630010100202000010020300003011325911200211091010000100101000000102005006310010094100151040014420090252272690012700116113013610000812620000100103011330104301223010930107

Test 3: throughput

Code:

  swpah w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 13.0051

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | 7b | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bb | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202051300529740000100148470002361301118001102242010010020000100200005001072001491269620130054130055261275092010033752002000020030000130055196911102011009910010000100100001100200000121424296100000014325341971411412120111716767101600129950014101420000100130053130056130056130056130053
202041300559740000000144220009913004500011025220100100200001002000050010654814912697501300511300562612750920100330520020000200300001300561979111020110099100100001001000011002000001202417510000001434034516143911212011171678270160012995101413020000100130053130057130057130057130057
20204130056975000000014141000123130045000110583201001002000010020000500106798149126976013005513005626127496201003103200200002003000013005219791110201100991001000010010000010020000012122448310000001456934307144091212011171676140160012995001013020000100130056130056130056130056130056
20204130042974000000014237010122130029000110694201001002000010020000500106824049126962013005413004226127509201003515200200002003000013004219791110201100991001000010010000110020000012132430310000001425034331144500120111716792401600129950013101320000100130056130056130056130056130057
2020413005597500000001426300010913019700011001120100100200001002000050010683214912697501300541300552612750620100339320020000200300001300521979111020110099100100001001000001002000000142425210000001414234226142450120111716797901600129947010101020000100130056130056130056130043130056
202041300529740000000144270011031300450001102262010010020000100200005001059501491269750130054130042261275092010033182002000020030000130055197911102011009910010000100100000100200000121324372100000014570345011447000011171678610160012995000101020000100130166130053130072130053130043
202041300529740000000145190001221300450001102092010010020000100200005001074101491269620130054130055261274962010035392002000020030000130135197911102011009910010000100100001100200000120241311000000141933447614451120011171676600160012995001310020000100130043130057130057130057130043
2020413004297400000001435800112313016100011015020100100200001002000050010692814912697201300541300552612750920100357220020000200300001300521979111020110099100100001001000011002000001202433810012001441234362143831212011171678400160012993700131020000100130056130043130043130053130056
2020413005597400000001510700112213004500010993120100100200001002000050010673014912697501300411300552612749620100345120020000200300001300521969111020110099100100001001000001002000001214242451000000141443408614190000111716767001600129937010131420000100130043130056130056130056130043
2020413005597401000001426000095130156010110136201001002000010020000500106652049126975013005413004226127509201003164200200002003000013005519691110201100991001000010010000010020076701324217100000014468343301437712001117167525016001299500001020000100130053130053130046130056130043

1000 unrolls and 10 iterations

Result (median cycles for code): 13.0056

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | 7b | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bb | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20025130061974000016669010124130111021148582001010200001020000501000170491269720130055130056231275362001072020000203000013005219921110021109101000010100001102000012162663510000016697366801663512120064010044216221299381410142000010130043130057130057130057130043
200241300429740000166760001071301170011486520010102000010200005010000304912697601300411300812312753620010020200002030000130042199211100211091010000101000001020000121626641100000166703664416682000064010012216221299381410142000010130057130057130043130057130057
20024130042974000016678010127130092001148692001010200001020000501000032491269760130041130042231275222001002020000203000013005619921110021109101000010100000102000012162663710000016669366771667912120064010051216221299421410142000010130057130057130043130057130057
2002413005697400001664301012413003300114869200101020000102000050100003149126972013005113005623127532200100202000020300001300521992111002110910100001010000010200001216266791000001664536672166461200064010005216221299421410142000010130057130057130057130057130057
20024130056974000016645010126130092001148552001010200001020000501000031491269760130055130042231275362001002020000203000013004319921110021109101000010100000102000012162668010000016648366681667012120064010014216221299421410142000010130057130057130053130057130043
200241300429740000166480001241300300011486920010102000010200005010000314912697601300551300562312753620010020200002030000130042199611100211091010000101000001020000121626646100000166783666816677120006401003821622129942014142000010130043130043130053130057130057
2002413004297400001667800012713007320114865200101020000102000050100003149126972013005113005223127536200100202000020300001300521992111002110910100001010000010200001216266471000001667736636166370120064010017216221299421014142000010130057130043130043130043130057
2002413004297400001664100012413007900114869200101020000102000050100003149126976013005513005623127536200100202000020300001300561992111002110910100001010000010200001216266471000001667836636166371212006401004721622129928010142000010130057130057130053130057130057
20024130056974000016645000128130078201148552001010200001020000501000031491269720130041130056231275362001002020000203000013005619921110021109101000010100001102000000266711000001663636645166351212006401000521622129951100102000010130057130043130057130043130043
20024130056974000016680000127130045021148692001010200001020000501000031491269720130055130056231275222001002020000203000013004219821110021109101000010100001102000012162666810000016635366771667812120064010002216221299421414142000010130043130053130043130057130057