Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, one register, 16B)

Test 1: uops

Code:

  ld1 { v0.16b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
61005281682121701900310512427985001233871000100010005000515967028112282443101000100010002813728092116100110001000110040210000101100123301393210199723334638572063834663813846452816914188119981376410002858828087281482852828708
6100428141212240160041049642813401123495100010001000500161598002788228451310100010001000280552802911610011000100001000021001000010002120134171005371913150104620547320238151746452785614286121281376010002865428625283252862028596
6100428668215140140020051332785301123091100010001000500031597002789528514310100010001000281102820611610011000100001000031000000110012130141161033169253242104620442338138111138402787115069121021361710002861028505286292825728144
610042819721117013004104808279211012316610001000100050005159520278492829231010001000100028118281801161001100010000100102100111211003212013295956872263462104320947324038111741432802214283128921491010002816428317283982828128113
61004286252121401900410482527886000231781000100010005001515997027976281343101000100010002815628493116100110001000010000310010001100121301321510415723934487482044334703813840442778914409128411351710002817728523285262845928191
61004282912112001900400519828194111234381000100010005000215989027832280893101000100010002808328023116100110001000010000210010001100121201306096636952340794820486339438101242392778115182122651364410002807328320284722869228203
6100428296212170150021051862793011123535100010001000500081596602808028616310100010001000282222829511610011000100001000031003000010012030138281005072253367104420595316738101246482782315460129561466510002849928418286012837428508
610042814421213020004005095281590012316710001000100050013159840281882859631010001000100028014284641161001100010000100003100100001001213013919103957311315194521069339038111349412804114094120781371110002858728515285672824328234
6100428104210170170031050522810511123187100010001000500091598402785328076310100010001000284942814711610011000100001000031000010110011130132891049571603385103920579322338111043392781515569122381472810002810928608286382833928222
610042845921313019003105136279401002312610001000100050006159720279122859934010001000100028050284531161001100010000100003100100001001213013719103367124332574421045342238111245412774814073121601330510002806628517283982821228213

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.16b }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0054

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50205120051900001117910012003611951010946425601034010210001100023010010000100001079008573638061363170120030012003512005411314631136685010030200100001000060200100001000012005412005111502011009910040100100001000001001000001100000001000010103210210722119661400021310121000040100120055120061120061120055120052
502041200608990110710012003911951310946725601034010210001100003010010000100001079008573545561363170120030012003512005411314631136685010030200100001000060200100001000012005112003511502011009910040100100001000001001000001100000001000010103210210122119646400001013121000040100120055120102120058120058120052
502041200518990000100012003611951510946725601004010210000100003010010000100001078862573638061363170120030312005412005411314631136695010030200100001000060200100001000012005112005111502011009910040100100001000001001001241100000001000000103210210122119658400021310121000040100120036120036120055120055120055
50204120035899000011001200361195101094642560100401021000110000301001000010000107900857362366136317012002701200541200541131463113668501003020010000100006020010000100001200511200511150201100991004010010000100001100100000010000000100001010321021072211966140002101091000040100120055120055120036120055120036
502041200518990000100012003911949310946725601034010210001100003010010000100001079035573638061363170120027012005412005411314631136715010030200100001000060200100001000012005412005111502011009910040100100001000001001000001100000001000010103210210722119658400021010121000040100120036120055120055120055120052
502041200548990000251001200361195131094672560103401021000110000301001000010000107903557363806136317012003401200351200351131463113671501003020010000100006020010000100001200541200511150201100991004010010000100000100100000110000000100001010321021072211966140002130121000040100120055120036120055120055120055
5020412003589900001310012003911951310946725601034010210001100003010010000100001081102573545561363170120027012005412005411314131136715010030200100001000060200100001000012005112005111502011009910040100100001000001001000001100000001000010003210210722119775400021313121000040100120055120055120055120055120055
50204120054899000010001200391195101094642560103401021000310000301001000010000107900857363806136317012002701200541200541131463113668501003020010000100006020010000100001200541200511150201100991004010010000100000100100000110000000100001010321021072211965840002131301000040100120052120036120055120036120052
502041200548990000110012003911951310946725601034010210001100003010010000100001079035573638061364700120027012005112005411319031136715010030200100001000060200100001000012005112005111502011009910040100100001000001001000001100000001000000103210210722119661400021310121000040100120055120055120036120055120052
502041200548990000110012003911951310946725601034010210001100003010010000100001078862573638061344610120030012003812005411321531136585010030200100001000060200100001000012005112003511502011009910040100100001000001001000000100000001000010103210210722119661400021013121000040100120036120055120055120055120055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5002512005189900110010100120020119509109464256001340012100011000030010100001000010796295736236613366201200271200511200511131693113690500103002010000100006002010000100001200511200511150021109104001010000100001101000001100000001000011003140111079911967140002101091000040010120052120052120052120052120052
50024120051899000000101001200361195091094642560013400121000110000300101000010000107994457369566133873012002712003512005111315331136905001030020100001000060020100001000012005112005111500211091040010100001000001010000011000000010000110031401110710811966640002101091000040010120052120052120052120052120052
50024120051900000000101001200411195091094642560013400121000110000300101000010000107988157362366133662112002712005112005111316931136905001030020100001000060020100001000012005112005111500211091040010100001000001010000011000010010000110031401010710811966640002101091000040010120104120052120055120056120052
500241200518990000001010012003611950910946425600134001210001100003001010000100001080106573690861337651120027120051120051113153311369050010300201000010000600201000010000120051120051115002110910400101000010000010100000110000000100001100314071078711966640002101001000040010120052120052120052120052120052
500241200518990000001010012003611950910946425600134001210001100003001010000100001079944574607661337130120027120145120051113169311369050010300201000010000600201000010000120051120287715002110910400101000010000010100000110000000100001100314081078711966640002101091000040010120052120054120052120052120098
50024120051900000000101001200361195091094642560013400121000110000300101000010000107955757362366133662012001112005112005111316931136905001030020100001000060020100001000012005112005211500211091040010100001000001010000011000000010000110031401310710811966640002101091000040010120036120052120052120052120052
500241200358990000000000012002011950910946425600134001210001100003001010000100001080070573690861336621120027120051120051113169311369050010300201000010000600201000010000120051120051115002110910400101000010000010100000110000000100001100314010107101211966640002101091000040010120052120052120052120052120052
500241200548990000001010012003611962010947825600134001010001100003001010000100001079809573839661337640120027120051120051113169311369050010300201000010000600201000010000120051120051115002110910400101000010000010100000110000101810000110031401210781011966640002101091000040010120036120052120052120052120052
5002412005190000110010100120036119509109464256001340012100011000030010100001000010799715737628613376801200271201461201101133071411379450010300201000010000600201000010000120051120051115002110910400101000010000010100000110000007281100001100314010107131011966640002101091000040010120052120052120053120055120052
5002412005189900000013000012003611950910946625600134001210001100003001010000100001090924573681261338680120028120051120257113250281139085049430020100001000060020100001010812024012005131500211091040010100001000001010000011000010010000110031401210710711966740002101091000040010120052120052120052120052120052

Test 3: throughput

Count: 8

Code:

  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3343

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80205267362001011167022672137719258010010080000100800005001184702026711267362673616659316695801002008000020080000267362673611802011009901001008000080000010080020204380058029060800406159431915110116112673413135800001002673726737267382673726852
80204267382001110067132672137719258010010080000100800005001167762026711267362673616659316695801002008000020080000267372673611802011009901001008000080000010080019204380060024164800406159431915110116112673413135800001002673726737267382673726864
8020426746208100006602267212702025801001008000010080000500117243702671126736267361665931669480100200800002008000026736267371180201100990100100800008000001008002119438006000161800406158431925110116112673313135800001002673826737267372674026753
80204267432011010067032672127719258010010080000100800005001168818026711267362673616658316694801002008000020080000267362673611802011009901001008000080000010080021204380059037163800426158431915110116112673313135800001002673726737267372673826880
8020426744200100006602267212772025801001008000010080000500116852902671126736267361665931669580100200800002008000026736267361180201100990100100800008000001008002020438006000161800406158451915110116112673313135800001002673826737267372673826738
80204267992011001167022672229722258010010080000100800005001169868026711267362673616659316694801002008000020080000267372673711802011009901001008000080000010080020194380059024161800396158431915110116112673313135800001002673726737267372673826849
80204267432001110069032672127720258010010080000100800005001169868026711267362673616659316672801002008000020080000267372673611802011009901001008000080000010080020194380059141066800396159431905110116112673313135800001002673726737267372673726873
80204267432001100067132672427720258010010080000100800005001167644026711267362673616659316694801002008000020080000267362673611802011009901001008000080000010080019194380059022164800406158431925110116112673413135800001002673726737267382673726845
8020426743200111007803267213771925801001008000010080000500116736602671126736267361665831669480100200800002008000026736267361180201100990100100800008000001008001919438005900160800406159451915110116112673313135800001002673726737267382673726880
8020426742200110116603267223770258010010080000100800005001169868026711267362673616659316694801002008000020080000267372673711802011009901001008000080000110080020204380059039063800406159431915110116112673313135800001002673726737267372673726861

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 09 | 0e | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800252672220010450022669320121225800101080000108000050116907902670226708267271666731668880010208000020800002670826708118002110910108000080000108000043800001819880035610050201161126719106080000102670926709267282672926776
80024267282000100012669321212025800101080000108000050116622412668326722267221667210167028001020800002080000267222670811800211091010800008000010800003980039211928003561354350201161126724010080000102672926709267092670926787
80024268572000045100267132121216258001010800001080000501167754126683267282672816672316708800102080000208000026722267081180021109101080000800001080000398000018458000061350502011611267051010480000102672826723267232670926803
8002426816200004500226713012016258001010800001080000501166750126683267272670816672316708800102080000208000026708267221180021109101080000800001080000398000045448003960354350201161126705010480000102670926729267232670926887
800242674220000010026693212002580010108000010800005011667500266832672826708166523166888001020800002080000267282670811800211091010800008000010800003980000494180000603543502011611267241010080000102670926729267092672826723
800242672220100450022671220016258001010800001080000501170881126683267272672716667316707800102080000208000026727267081180021109101080000800001080000080039320800390035050201161126725100480000102672926709267092672826767
8002426727204004500226712212121625800101080000108000050116684612668326727267271678231670780010208000020800002672226708118002110910108000080000108000043800353214780039600435020116112672400480000102672926728267092670926744
800242670820000451002671320180258001010800001080000501168216126702267282672216672316702800102080000208000026727267221180021109101080000800001080000080039244280039013543502011611267241010480000102672926709267282670926790
80024267282000045002267130121216258001010800001080000501167230126702267272672216680316707800102080000208000026728267221180021109101080000800001080000398003923380039613505020116112670560480000102672826709267292672826794
800242685520000000226712012121625800101080000108000050116762102670326708267221665231670880010208000020800002672226708118002110910108000080000108000008003923458000061043502011611267251010480000102670926729267292672826845