Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 1 reg, 8H)

Test 1: uops

Code:

  ld1 { v0.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
610052929521912000010045902883510124292100010001000500020016127285142927331010001000100029137291491161001100010000100041001011001212131109293694531170452163030543814956402848616370136921579510002933129245292532931029371
610042930522000100011046422870210124224100010001000500000015964285202923631010001000100029074290761161001100010000100041001011001212128259096693131881432166730773818749452851516450136231588210002926529362293192938929247
6100429291219000000100465728789101242801000100010005000100159762866629221310100010001000290762914711610011000100001000010010110012121315592966862313504121643306738121248422860616026136111565510002923629180293262930829234
6100429292219000000500470028820101242861000100010005000000159772855229296310100010001000291382911011610011000100001000310010110012121318793966951313404421716314438121045452846316158138121574610002924029289293052933629248
6100429217220000000100475028826100243351000100010005000000159702856629222310100010001000291512914311610011000100001000310010110013121286993596751306904221645306438151445452848216418138261562410002924729245293112931529307
610042924622000000010046452878010124201100010001000500040015970285992924831010001000100029115290651161001100010000100031001011001012131539060683530810372153130953814651452831916011136811560510002921029234291392927429203
6100429274219000000100471728768101242601000100010005000200159782857929228310100010001000291252898211610011000100001000310010110012121288392656941306403821734310538131346442846316482137841569510002919629294293352930629243
6100429310221010000110467128796101242761000100010005000100159382864829206310100010001000291022905111610011000100001000310010110012121289991686848313104321731308138181346412842116031136231592410002919629223292622929729298
61004293842191000004271047602872710124176100010001000500020015977286092930831010001000100029028291281161001100010000100031001011001212131199104696731570402159630393813749452835116375138511537410002924929302292322931229180
6100429232219100000110464528687100242441000100010005000000159742857929235310100010001000290972910711610011000100001000310010110013121324890606876309803921590311738161261452839416319137761567110002931829243293152935829224

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.8h }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0054

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 0e | 0f | 1e | 22 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205120051899000101200391195131094672560103401001000110000301001000010000107900857362366136317012003112005412003511314131136715010030200100001000060200100001000012005112010111502011009910040100100001000001001000001000010100001032102107221196684000213001000040100120036120052120052120055120036
50204120051899000220120020119513109467256010340102100001000030100100001000010790085736380613631711200301200541200351131493113671501003020010000100006020010000100001200511200511150201100991004010010000100000100100000100000010000103210210124119720400001313121000040100120055120036120055120052120155
50204120035900000101200391198121094642560100401021000110000301001000010000107900857363806134461112003012005412005411315417113668501003020010054100006020010000100001200541200511150201100991004010010000100000100100001100000010000113210210722119661400021310121000040100120055120055120052120055120036
502041200548990001012007011951310946725601034010210001100003010010000100001079035573545561363171120011120054120054113141311367150100302001000010000602001000010000120054120051115020110099100401001000010000010010000110000001000001321021072211966240004130121000040100120055120036120055120036120036
502041200358990001012003911949310946425601034010210001100003010010000100001078862573623661363171120030120054120054113149311365850100302001000010000602001000010000120035120051115020110099100401001000010000010010000110000101000010321021072211965840002131391000040100120055120055120036120052120036
50204120054899000101200391195131094672560100401021000110000301001000010000107886257363806136470112002712016512003811314631136585010030200100001000060200100001000012005412003511502011009910040100100001000001001000011000003100001132102107221196584000213001000040100120055120052120052120052120052
50204120051899000101200391195101094492560103401021000010000301001000010000107905357363806144888112003012005112003511314931136685010030200100001000060200100001000012005412003511502011009910040100100001000001001000001000000100001132102101251196614000001301000040100120055120036120036120036120055
50204120051899000101200391194931094642560103401021000110000301001000010000107903557354556136470112003012005512003511314631136725010030200100001000060200100001000012003512005111502011009910040100100001000001001000011000000100000132102107221196584000213001000040100120055120055120055120036120055
5020412003589900011120039119510109467256010040102100011000030100100001000010790355736380613446111200111200541200351131493113671501003020010000100006020010000100001200541200511150201100991004010010000100000100100001100000010000113210210722119661400021313121000040100120055120228120057120055120154
502041200548990001012002011951010946725601034010010000100003010010000100001079035573545561363171120030120054120051113149311366850100302001000010000602001000010000120037120035115020110099100401001000010000010010000110000001000011321021072211964640002131301000040100120055120036120036120052120055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 0e | 0f | 1e | 22 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50025120047900011200120020119508109463256001040012100011000030010100001000010795235736044613349901200231200501200501131683113689500103002010000100006002010000100001200531201081150021109104001010000100000101000001100006001000011003140171071112119662400009681000040010120051120048120048120051120051
500241200508990001001200321195081095162560013400121000110000300101000010000107954857361886135759012002712005012004711316831136895001030020100001000060020100001000012005012005211500211091040010100001000001010000011000051210000112031409171513119666400009651000040010120056120051120048120076120051
500241200508990001001200351195081094492560013400121000110000300101000010000107954857360446133499112002612005112005011316831136895001030020100001000060020100001000012004712011011500211091040010100001000001010000011000010100001100314012107105119662400026951000040010120051120051120400120051120051
50024120047899000100120035119508109463256001340012100011000030010100001000010795235736044613349911200111200471200501131683113689500103002010000100006002010000100001200501201111150021109104001010000100000101000001100000010000110031409107711119665400029681000040010120053120036120052120048120051
500241200508990001001200351195081094632560013400121000110000300101000010000107951757361886133652112002612005012005011317331136895001030020100001000060020100001000012005012004711500211091040010100001000001010000011000000100000100314013107712119665400029981000040010120051120051120051120051120051
5002412005089900010012003511955610946925600134001210001100003001010000100001079548573604461338660120028120126120051113153311369250010300201000010000600201000010000120050120114115002110910400101000010000010100060110001001000011003292191451212119665400029981000040010120053120051120036120051120051
50024120050899000100120040119508109463256001340044100011000030010100001009910912875736188613365211205041200501200531133001611368950010306681005310000600201000010161120050120047115002110910400101000010000010100382110036231000011003140111191312119650400180981000040010120054120051120051120051120051
50024120050899000281012002111950810946425600134002010000100003001010000100001079566573633261337031120027120161120050113165311368650010300201000010000600201000010000120050120047115002110910400101000010000110100000010000061000011003140131071012119665400029981000040010120058120051120053120051120051
5002412004793100018100120035119512109463256001340012100011000230010100001000010840445736284613349901200231200501200501131653113689500103002010000100006002010000100001200501200941150021109104001010000100000101000001100000010000110031409107912119662400026681000040010120051120051120048120051120051
500241200358990001001200351195131094632560024400121000110000300101000010000107954857365246133499112002612005012004711316831136895001030020100001000060020100001000012010312006511500211091040010100001000001010000011000000100001000314010107129119665400029981000040010120051120051120048120051120051

Test 3: throughput

Count: 8

Code:

  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802052672720010013710126716211025801001008000010080000500116712702668226717267171666431668980100200800002008000026727267271180201100991001008000080000010080000080040008000061000511011611267280107800001002673226708267082673226732
80204267312001009900126715012121925801001008000010080000500116555602668226736267271665431670180100200800002008000026727267071180201100991001008000080000110080000438003803880038613844051101161226728007800001002673226708267322670826708
8020426731200011600002669221211925801001008000010080000500116831202670226799267101663631666780100200800002008000026731267071180201100991001008000080000010080000438003903880039600430511011611267280140800001002673226732267082670826708
8020426731200000107001266922010258010010080000100800005001166525026706268142673216660316665801002008000020080000267272672711802011009910010080000800000100800004380038008000061000511011611267241404800001002673226732267322672826732
80204267312000006101267120101925801001008000010080000500116831202668226822267371674731668980100200800002008000026731267271180201100991001008000080000010080000438003900800380138005110116112670414100800001002673226732267322670826732
8020426731200000741012671620102580100100800001008000050011665250266822684226716166563166658010020080000200800002672726727118020110099100100800008000001008000043800380448000061044051101161126704007800001002673226728267322670826732
802052697620000044001267130111925801001008000010080000500116652502668226745268741664631668580100200800002008000026731267271180201100991001008000080000010080000438003903880038603900511011611267241007800001002670826728267282670826732
8020426727200000590002671621019258010010080000100800005001166525026706268392673316660316665801002008000020080000267272672711802011009910010080000800000100800004380039038800006139440511011611267041407800001002670826708267322672826708
802042673120000090101267162001925801001008000010080000500116652502670626707267311663031667680100200800002008000026731267271180201100991001008000080000010080000438003803880038013844051101161126728005800001002673226708267322673226729
80204267072000116010126716200192580100100800001008000050011665250266822684526743166733166658010020080000200800002673126727118020110099100100800008000001008000043800000548000061384405110116112672814104800001002673326728267082673226732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252672420011801126707218181225800101080000108000050117224012669726730267251666731670280010208000020800002672226722118002110910108000080000110800003908003503580035613539502013168132671966280000102672326723267232672326723
8002426722200005011267072181816258001010800001080000501167605126697267282673316674316702800102080000208000026722267221180021109101080000800000108000039080035035800356135395020121614122671966280000102672326723267232672326723
8002426722200009511267072181812258001010800001080000501167605126702267292672816669316702800102080000208022026727267411180021109101080000800000108000043080035041800356135395020131612112671966280000102672326723267232672326723
800242672220000410126707218181225800101080000108000050116760512683326764268731667831670280010208000020800002672226722118002110910108000080000010800003908003503580035610395020111612112671966280000102672326723267232672326723
8002426749200009811267072181811258001010800001080000501167605126697268282672816673316702800102080000208000026722267221180021109101080000800000108000039080035035800356135395020121611122671966280000102672326723267232672326723
800242672220000591126707218181225800101080000108000050116760512669726838267291675931670280010208000020800002672226708118002110910108000080000010800003908003503580035613539502012168122671966280000102672326723267232672326723
8002426722200009811267072181212258001010800001080000501172240126697268202673116675316702800102080000208000026722267221180021109101080000800000108000039080035012580035613539502013161282671960280000102672326723267232672326723
8002426722200008911267072181811258001010800001080000501167605126697267592672816669316702800102080000208000026722267221180021109101080000800000108000039080035035800356135395020121610112671966280000102672326723267232672326723
80024267222000011311267072181812258001010800001080000501168843126697268382673016674316702800102080000208000026722267221180021109101080000800000108000039080035035800396135395020101612122672166280000102672326723267232672326723
8002426722200005601267072181812258001010800001080000501168843126697267412673316675316702800102080000208000026722267221180021109101080000800000108000039080035035800356135395020121612102671966280000102672326723267232672326723