Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, one register, 4S)

Test 1: uops

Code:

  ld1 { v0.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
61005292072190300014011003010453228769102433510001000100050003015950286102927631010001000100029081292561161001100010000100003100000010012030012830913868503074104921691303338201547462831516505139311567110002933929271293512931329240
61004292852200240011000001700046302873500242981000100010005000601595328636293183101000100010002907629125116100110001000010000310000001000203001292291336886306874721768306438151346412832616420139331578710002927029262293762933229218
6100429289220024001800000300045552873600243361000100010005000601594528608292483101000100010002908729139116100110001000010000310010001001212001287491506824302074521746305938211646472838216365139791565610002923829202293652929729282
610042922622002611171110050104533288661024243100010001000500000159552867229214310100010001000291112909511610011000100001000031001000100020300128489160682830519452171630553822745442837516440138371570710002933529232293112933029310
6100429234219024001201100301045762880701242691000100010005000701598828620293583101000100010002914629140116100110001000010010210000001000213001286591886857305394221659309138211043442838516340136811579010002933429267293352925929311
6100429247219021001401100301046292871800242671000100010005000461596028704293163101000100010002911629169116100110001000010000310000001001300001295093886835304995021612305438181744442833116356138851575910002926729315293192938129348
6100429256220021001301110901045432878800242941000100010005000101594728577292973101000100010002905229170116100110001000010000010010001000002001289191566845307584721644308938192148462836416426137031569310002925729315292982936429312
6100429295219025001400000201046322878410242991000100010005000001594228738292823101000100010002909029130116100110001000010000310000041001202001281391176832308694321675306238211642432837416218137871558110002922929270292722917729283
61004292592200280016000003000452228776102421910001000100050002015963286252884831010001000100029171291421161001100010000100132100200410002030012894931968643061104521714307738171049462835516312137961566810002921129290292182928829264
6100429239219021001500000300045432880000242771000100010005000301595228598292303101000100010002909729100116100110001000010000310000011000203001292490646877306384721627307338231948472834616319137251567010002929329411293262933429288

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.4s }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50205120051899000112000120042119520109470256010640102100011000030100100001000010790895735750613662301200361200601200571131550311363050100302001000010000602001000010000120057120057115020110099100401001000010000010010000011000100110000110110032101107111196644000200121000040100120042120061120058120058120058
50204120057899100111001120042119519109455256010640104100021000030100100001000010790625736524613345811200331200571200571131520311363050100302001000010000602001000010000120041120057115020110099100401001000010000110010001001000000010000101000032101101111196614000201091000040100120052120036120052120052120052
50204120051900001002001120042119516109470256010340104100021000030100100001000010790625736668613662311200171200601200411131520311367450100302001000010000602001000010000120041120057115020110099100401001000010000010010000011000000010000111100032101107111196644000401391000040100120058120042120058120058120058
502041200578991100020001200361195131094492560103401021000110000301001000010000107900857362366136317112003312005712005711315203113674501003020010000100006020010000100001200571200571150201100991004010010000100000100100000110000000100001111000321011351111966440004101091000040100120092120058120055120055120055
502041200548990000001001200361194931094952560103401021000110000301001000010000107886257362366136971112003012003512005111313703113668501003020010000100006020010000100001200511200351150201100991004010010000100000100100000110000000100001010000321011011111965840000101091000040100120036120052120052120056120038
50204120051900101001100120039119510109467256010340102100011000030100100001000010798445736764613662301200331200581200611131370311367750100302001000010000602001000010000120057120041115020110099100401001000010000010010000011000000010000111100032101135111196644000401091000040100120058120058120058120061120042
50204120057900111002000120042119516109470256010640104100021000030100100001000010793855737340613693001200331200411200601131520311367750100302001000010000602001000010000120057120041215020110099100401001000010000010010000011000000010002011100132101107111196544000401391000040100120058120058120058120058120058
502041200579001000020001200421195131094732560103401041000210000301001000010000107912457365246136623012003612005712005711315503113674501003020010000100006020010062100001200511200511150201100991004010010000100000100100000110000000100001101000321011071111966440004101391000040100120042120042120061120058120058
50204120041899110002001120039119510109464256010340102100011000030100100001000010794585737244613698701200331200601201101131520311363050100302001000010000602001000010000120060120057115020110099100401001000010000010010002111000201110000111100032101135111196674000410091000040100120042120042120042120061120061
502041200418991001021011200421195131094702560103401041000210000301001000010000107957957369086136623012003312005712005711315203113674501003020010000100006020010000100001200541200511150201100991004010010000100000100100000010000000100001111000321011071111965840002101091000040100120052120055120052120037120052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | interrupt pending (6c) | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | c5 | cf | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
500251200478990010001200351195081094632560013400101000110000300101000010000107951757361886133499012002601200501200501131683113689500103002010000100006002010000100001200501200471150021109104001010000100001101000001100000010000101031401107011119665400029681000040010120048120051120051120051120051
5002412005089900100012003511950810946125600134001210001100003001010000100001079548573618861340631120026012004712004711316531136895001030020100001000060020100001000012005012004711500211091040010100001000011010000001000000100001010314011070111196654000091601000040010120051120051120036120051120051
500241200508990010001200351194951094552560025400331000110000300101000010000107954857354556133499012002601200501200551132703113689500103002010000100006002010000100001200501200471150021109104001010000100000101000001100000010000100031401107011119665400029981000040010120051120051120051120051120036
5002412005089900120001200321195051094632560016400101000110000300101000010000107954857361886133499012002301200511200501131533113689500103002010000100006002010000100001200541200501150021109104001010000100001101000001100000010000101031401107011119662400006901000040010120051120051120051120051120051
500241200509000061001200351194951094492560013400121000110000300101000010000107952357361886133499012002301200501200501131533113689500103002010000100006002010000100001200471200351150021109104001010000100001101000000100000010000101031401107021119662400029981000040010120048120048120048120048120048
500241200478990010001200321195051094612560013400121000010000300101000010000107952357360446133499012002301200471200471131653113686500103002010000100006002010000100001200501200471150021109104001010000100000101000001100000010000100031401107011119665400029651000040010120048120048120048120036120048
500241200358990010001200351195081094612560013400121000110000300101000010000107954857361886133499012002701200351200501131653113689500103002010000100006002010000100001200501200471150021109104001010000100001101000001100002010000101031401107011119665400020981000040010120049120048120048120048120048
500241200478990010001200351195051094612560013400121000110000300101000010000107951757360446133499012002601201581202891133573113694500103002010106100006002010000100001200501203801150021109104001010000100000101000001100000732510000101031401107012119662400006651000040010120480120206120049120049120048
500241200478990170001200331195051094612560013400121000110000300101000010000107952357360446132757012001101200471200471131653113686500103002010000100006002010000100001200471200471150021109104001010000100000101000001100010010000101031401107011119650400009601000040010120051120051120051120056120051
500241200538991000001200351195081094632560013400121000010000300101000010000107952357361886133499012002301200471200501131653113689500103002010000100006002010000100001200351200471150021109104001010000100000101000001100000110000100031401107011119665400020081000040010120051120051120048120051120051

Test 3: throughput

Count: 8

Code:

  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 19 | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802052673220011111065132669920021258010010080000100800165001167763026689267142673216642616684801162008002420080024267322671411802011009910010080000800000100800003980000204280039605742192511021611267290994800001002670826728267282670826728
80204267222000000004512267120181216258010010080000100800005001167231126702267272672716650316665801002008000020080000267272670711802011009910010080000800001100800003980000000800390135000511011611267040060800001002672826710267282670826708
8020426727200000000710026712212062580100100800001008000050011665251266972671126726166503166858010020080000200800002672726707118020110099100100800008000001008000008003900398003961043005132116112672400100800001002700626731267322673026876
8020426737200000000450226712200025801001008000010080000500117703812669726726267261665031666580100200800002008000026727267221180201100991001008000080000010080000398003510398003961043005110116112672401064800001002680626730267372672826708
80204267272000000004512267120121202580100100800001008000050011666601267022672726727166303166858010020080000200800002672926722118020110099100100800008000001008000039800351008003501394300511011611267240060800001002673126719267232671826708
802042673120000000045022669220016258010010080000100800005001166525026702267322671116630316685801002008000020080000267072672211802011009910010080000800001100800003980000103980040613939005110116112671901060800001002672826708267232672326714
8020426727200000000012266923121212258010010080000100800005001167231126702267072671116630316665801002008000020080000267072672211802011009910010080000800000100800003980039104280000013539005110116112672401062800001002673826755267332673126732
802042672920000000157002671001801625801001008000010080000500116723102670226730267221665491693380100202800002008019326731268571180201100991001008000080000110080000398000000358000061390005110116112672401060800001002673926738267282672826728
8020426727200000000451226707012024258010010080000100800005001166818126706267272672216649316685801002008000020080000267072672211802011009910010080000800001100800000800000008000061390005110116112670401060800001002672826708267282672826708
8020426727200000000410226712218181625801001008000010080000500117703812669726722267071665031668080100200800002008000026722267261180201100991001008000080000110080000398003900398003961350005110116112700101000800001002673226732267282672826733

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 09 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | d5 | d6 | da | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002526730200100000046101267002121162580010108000010800005011675011126702026731267311667631670780010208000020800002670826728118002110910108000080000110800000430800390478003861394405020816075267241410480000102673226732267322672826732
80024267312010000000440012671621119258001010800001080000501167201002670302673126731166763167078001020800002080000267312670811800211091010800008000001080000043080038038800396139440502091607526724140780000102673226732267322673226732
80024267282000000100440012672521102580010108000010800005011685550126706026732267311667631671180010208000020800002673126727118002110910108000080000010800000430800390388003861394305020716057269571410480000102672926735267322672926728
80024267272000000000440012671621119258001010800001080000501167501012670302673126731166723167118001020800002080000267312672711800211091010800008000001080000043080038039800386139440502071606726724014780000102673226728267282672826728
80024267082000000000450012671321119258001010800001080000501167201002670602673126727166893167118001020800002080000267272672711800211091010800008000001080000043080039038800386139440502051605726725014780000102673226732267322673226732
80024267312000000000440012671321102580010108000010800005011671240026706026727267311667631671180010208000020800002672726728118002110910108000080000010800000430800380388003861394405020716057267241010480000102672826709267322673226728
800242672720100000004400126716011192580010108000010800005011671240126706026731267311667631671180010208000020800002672726727118002110910108000080000010800000430800380388003861394405020816089267251410780000102670926709267092672926709
80024267312000000000451012671221116258001010800001080000501167124012670602673526731166523167118001020800002080000267272672811800211091010800008000001080000043080038008003861394405020716087269851410780000102673826729267352674826729
800242673120000000004400126716212119258001010800001080000501168843012668302670826708166763167118001020800002080000267312672811800211091010800008000001080000043080038038800396104405020716058267241414780000102673226732267322673226732
8002426731200100100053101267162121219258001010800001080000501167501012670602672726728166723166888001020800002080000267312672811800211091010800008000011080000000801690428003901394405020716058267251414780000102673226732267282673226732