Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, one register, 8H)

Test 1: uops

Code:

  ld1 { v0.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 04 | 07 | 08 | 0a | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
610052929521912000010045902883510124292100010001000500020016127285142927331010001000100029137291491161001100010000100041001011001212131109293694531170452163030543814956402848616370136921579510002933129245292532931029371
610042930522000100011046422870210124224100010001000500000015964285202923631010001000100029074290761161001100010000100041001011001212128259096693131881432166730773818749452851516450136231588210002926529362293192938929247
6100429291219000000100465728789101242801000100010005000100159762866629221310100010001000290762914711610011000100001000010010110012121315592966862313504121643306738121248422860616026136111565510002923629180293262930829234
6100429292219000000500470028820101242861000100010005000000159772855229296310100010001000291382911011610011000100001000310010110012121318793966951313404421716314438121045452846316158138121574610002924029289293052933629248
6100429217220000000100475028826100243351000100010005000000159702856629222310100010001000291512914311610011000100001000310010110013121286993596751306904221645306438151445452848216418138261562410002924729245293112931529307
610042924622000000010046452878010124201100010001000500040015970285992924831010001000100029115290651161001100010000100031001011001012131539060683530810372153130953814651452831916011136811560510002921029234291392927429203
6100429274219000000100471728768101242601000100010005000200159782857929228310100010001000291252898211610011000100001000310010110012121288392656941306403821734310538131346442846316482137841569510002919629294293352930629243
6100429310221010000110467128796101242761000100010005000100159382864829206310100010001000291022905111610011000100001000310010110012121289991686848313104321731308138181346412842116031136231592410002919629223292622929729298
61004293842191000004271047602872710124176100010001000500020015977286092930831010001000100029028291281161001100010000100031001011001212131199104696731570402159630393813749452835116375138511537410002924929302292322931229180
6100429232219100000110464528687100242441000100010005000000159742857929235310100010001000290972910711610011000100001000310010110013121324890606876309803921590311738161261452839416319137761567110002931829243293152935829224

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.8h }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0054

retire (01) | cycle (02) | 03 | 07 | 0e | 0f | 1e | 22 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50205120051899000101200391195131094672560103401001000110000301001000010000107900857362366136317012003112005412003511314131136715010030200100001000060200100001000012005112010111502011009910040100100001000001001000001000010100001032102107221196684000213001000040100120036120052120052120055120036
50204120051899000220120020119513109467256010340102100001000030100100001000010790085736380613631711200301200541200351131493113671501003020010000100006020010000100001200511200511150201100991004010010000100000100100000100000010000103210210124119720400001313121000040100120055120036120055120052120155
50204120035900000101200391198121094642560100401021000110000301001000010000107900857363806134461112003012005412005411315417113668501003020010054100006020010000100001200541200511150201100991004010010000100000100100001100000010000113210210722119661400021310121000040100120055120055120052120055120036
502041200548990001012007011951310946725601034010210001100003010010000100001079035573545561363171120011120054120054113141311367150100302001000010000602001000010000120054120051115020110099100401001000010000010010000110000001000001321021072211966240004130121000040100120055120036120055120036120036
502041200358990001012003911949310946425601034010210001100003010010000100001078862573623661363171120030120054120054113149311365850100302001000010000602001000010000120035120051115020110099100401001000010000010010000110000101000010321021072211965840002131391000040100120055120055120036120052120036
50204120054899000101200391195131094672560100401021000110000301001000010000107886257363806136470112002712016512003811314631136585010030200100001000060200100001000012005412003511502011009910040100100001000001001000011000003100001132102107221196584000213001000040100120055120052120052120052120052
50204120051899000101200391195101094492560103401021000010000301001000010000107905357363806144888112003012005112003511314931136685010030200100001000060200100001000012005412003511502011009910040100100001000001001000001000000100001132102101251196614000001301000040100120055120036120036120036120055
50204120051899000101200391194931094642560103401021000110000301001000010000107903557354556136470112003012005512003511314631136725010030200100001000060200100001000012003512005111502011009910040100100001000001001000011000000100000132102107221196584000213001000040100120055120055120055120036120055
5020412003589900011120039119510109467256010040102100011000030100100001000010790355736380613446111200111200541200351131493113671501003020010000100006020010000100001200541200511150201100991004010010000100000100100001100000010000113210210722119661400021313121000040100120055120228120057120055120154
502041200548990001012002011951010946725601034010010000100003010010000100001079035573545561363171120030120054120051113149311366850100302001000010000602001000010000120037120035115020110099100401001000010000010010000110000001000011321021072211964640002131301000040100120055120036120036120052120055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire (01) | cycle (02) | 03 | 04 | 0e | 0f | 1e | 22 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50025120047900011200120020119508109463256001040012100011000030010100001000010795235736044613349901200231200501200501131683113689500103002010000100006002010000100001200531201081150021109104001010000100000101000001100006001000011003140171071112119662400009681000040010120051120048120048120051120051
500241200508990001001200321195081095162560013400121000110000300101000010000107954857361886135759012002712005012004711316831136895001030020100001000060020100001000012005012005211500211091040010100001000001010000011000051210000112031409171513119666400009651000040010120056120051120048120076120051
500241200508990001001200351195081094492560013400121000110000300101000010000107954857360446133499112002612005112005011316831136895001030020100001000060020100001000012004712011011500211091040010100001000001010000011000010100001100314012107105119662400026951000040010120051120051120400120051120051
50024120047899000100120035119508109463256001340012100011000030010100001000010795235736044613349911200111200471200501131683113689500103002010000100006002010000100001200501201111150021109104001010000100000101000001100000010000110031409107711119665400029681000040010120053120036120052120048120051
500241200508990001001200351195081094632560013400121000110000300101000010000107951757361886133652112002612005012005011317331136895001030020100001000060020100001000012005012004711500211091040010100001000001010000011000000100000100314013107712119665400029981000040010120051120051120051120051120051
5002412005089900010012003511955610946925600134001210001100003001010000100001079548573604461338660120028120126120051113153311369250010300201000010000600201000010000120050120114115002110910400101000010000010100060110001001000011003292191451212119665400029981000040010120053120051120036120051120051
50024120050899000100120040119508109463256001340044100011000030010100001009910912875736188613365211205041200501200531133001611368950010306681005310000600201000010161120050120047115002110910400101000010000010100382110036231000011003140111191312119650400180981000040010120054120051120051120051120051
50024120050899000281012002111950810946425600134002010000100003001010000100001079566573633261337031120027120161120050113165311368650010300201000010000600201000010000120050120047115002110910400101000010000110100000010000061000011003140131071012119665400029981000040010120058120051120053120051120051
5002412004793100018100120035119512109463256001340012100011000230010100001000010840445736284613349901200231200501200501131653113689500103002010000100006002010000100001200501200941150021109104001010000100000101000001100000010000110031409107912119662400026681000040010120051120051120048120051120051
500241200358990001001200351195131094632560024400121000110000300101000010000107954857365246133499112002612005012004711316831136895001030020100001000060020100001000012010312006511500211091040010100001000001010000011000000100001000314010107129119665400029981000040010120051120051120048120051120051

Test 3: throughput

Count: 8

Code:

  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  ld1 { v0.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802052672720010013710126716211025801001008000010080000500116712702668226717267171666431668980100200800002008000026727267271180201100991001008000080000010080000080040008000061000511011611267280107800001002673226708267082673226732
80204267312001009900126715012121925801001008000010080000500116555602668226736267271665431670180100200800002008000026727267071180201100991001008000080000110080000438003803880038613844051101161226728007800001002673226708267322670826708
8020426731200011600002669221211925801001008000010080000500116831202670226799267101663631666780100200800002008000026731267071180201100991001008000080000010080000438003903880039600430511011611267280140800001002673226732267082670826708
8020426731200000107001266922010258010010080000100800005001166525026706268142673216660316665801002008000020080000267272672711802011009910010080000800000100800004380038008000061000511011611267241404800001002673226732267322672826732
80204267312000006101267120101925801001008000010080000500116831202668226822267371674731668980100200800002008000026731267271180201100991001008000080000010080000438003900800380138005110116112670414100800001002673226732267322670826732
8020426731200000741012671620102580100100800001008000050011665250266822684226716166563166658010020080000200800002672726727118020110099100100800008000001008000043800380448000061044051101161126704007800001002673226728267322670826732
802052697620000044001267130111925801001008000010080000500116652502668226745268741664631668580100200800002008000026731267271180201100991001008000080000010080000438003903880038603900511011611267241007800001002670826728267282670826732
8020426727200000590002671621019258010010080000100800005001166525026706268392673316660316665801002008000020080000267272672711802011009910010080000800000100800004380039038800006139440511011611267041407800001002670826708267322672826708
802042673120000090101267162001925801001008000010080000500116652502670626707267311663031667680100200800002008000026731267271180201100991001008000080000010080000438003803880038013844051101161126728005800001002673226708267322673226729
80204267072000116010126716200192580100100800001008000050011665250266822684526743166733166658010020080000200800002673126727118020110099100100800008000001008000043800000548000061384405110116112672814104800001002673326728267082673226732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800252672420011801126707218181225800101080000108000050117224012669726730267251666731670280010208000020800002672226722118002110910108000080000110800003908003503580035613539502013168132671966280000102672326723267232672326723
8002426722200005011267072181816258001010800001080000501167605126697267282673316674316702800102080000208000026722267221180021109101080000800000108000039080035035800356135395020121614122671966280000102672326723267232672326723
8002426722200009511267072181812258001010800001080000501167605126702267292672816669316702800102080000208022026727267411180021109101080000800000108000043080035041800356135395020131612112671966280000102672326723267232672326723
800242672220000410126707218181225800101080000108000050116760512683326764268731667831670280010208000020800002672226722118002110910108000080000010800003908003503580035610395020111612112671966280000102672326723267232672326723
8002426749200009811267072181811258001010800001080000501167605126697268282672816673316702800102080000208000026722267221180021109101080000800000108000039080035035800356135395020121611122671966280000102672326723267232672326723
800242672220000591126707218181225800101080000108000050116760512669726838267291675931670280010208000020800002672226708118002110910108000080000010800003908003503580035613539502012168122671966280000102672326723267232672326723
8002426722200009811267072181212258001010800001080000501172240126697268202673116675316702800102080000208000026722267221180021109101080000800000108000039080035012580035613539502013161282671960280000102672326723267232672326723
8002426722200008911267072181811258001010800001080000501167605126697267592672816669316702800102080000208000026722267221180021109101080000800000108000039080035035800356135395020121610112671966280000102672326723267232672326723
80024267222000011311267072181812258001010800001080000501168843126697268382673016674316702800102080000208000026722267221180021109101080000800000108000039080035035800396135395020101612122672166280000102672326723267232672326723
8002426722200005601267072181812258001010800001080000501168843126697267412673316675316702800102080000208000026722267221180021109101080000800000108000039080035035800356135395020121612102671966280000102672326723267232672326723