Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 1 reg, 16B)

Test 1: uops

Code:

  ld1 { v0.16b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
61005281682121701900310512427985001233871000100010005000515967028112282443101000100010002813728092116100110001000110040210000101100123301393210199723334638572063834663813846452816914188119981376410002858828087281482852828708
6100428141212240160041049642813401123495100010001000500161598002788228451310100010001000280552802911610011000100001000021001000010002120134171005371913150104620547320238151746452785614286121281376010002865428625283252862028596
6100428668215140140020051332785301123091100010001000500031597002789528514310100010001000281102820611610011000100001000031000000110012130141161033169253242104620442338138111138402787115069121021361710002861028505286292825728144
610042819721117013004104808279211012316610001000100050005159520278492829231010001000100028118281801161001100010000100102100111211003212013295956872263462104320947324038111741432802214283128921491010002816428317283982828128113
61004286252121401900410482527886000231781000100010005001515997027976281343101000100010002815628493116100110001000010000310010001100121301321510415723934487482044334703813840442778914409128411351710002817728523285262845928191
61004282912112001900400519828194111234381000100010005000215989027832280893101000100010002808328023116100110001000010000210010001100121201306096636952340794820486339438101242392778115182122651364410002807328320284722869228203
6100428296212170150021051862793011123535100010001000500081596602808028616310100010001000282222829511610011000100001000031003000010012030138281005072253367104420595316738101246482782315460129561466510002849928418286012837428508
610042814421213020004005095281590012316710001000100050013159840281882859631010001000100028014284641161001100010000100003100100001001213013919103957311315194521069339038111349412804114094120781371110002858728515285672824328234
6100428104210170170031050522810511123187100010001000500091598402785328076310100010001000284942814711610011000100001000031000010110011130132891049571603385103920579322338111043392781515569122381472810002810928608286382833928222
610042845921313019003105136279401002312610001000100050006159720279122859934010001000100028050284531161001100010000100003100100001001213013719103367124332574421045342238111245412774814073121601330510002806628517283982821228213

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.16b }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0054

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205120051900001117910012003611951010946425601034010210001100023010010000100001079008573638061363170120030012003512005411314631136685010030200100001000060200100001000012005412005111502011009910040100100001000001001000001100000001000010103210210722119661400021310121000040100120055120061120061120055120052
502041200608990110710012003911951310946725601034010210001100003010010000100001079008573545561363170120030012003512005411314631136685010030200100001000060200100001000012005112003511502011009910040100100001000001001000001100000001000010103210210122119646400001013121000040100120055120102120058120058120052
502041200518990000100012003611951510946725601004010210000100003010010000100001078862573638061363170120030312005412005411314631136695010030200100001000060200100001000012005112005111502011009910040100100001000001001001241100000001000000103210210122119658400021310121000040100120036120036120055120055120055
50204120035899000011001200361195101094642560100401021000110000301001000010000107900857362366136317012002701200541200541131463113668501003020010000100006020010000100001200511200511150201100991004010010000100001100100000010000000100001010321021072211966140002101091000040100120055120055120036120055120036
502041200518990000100012003911949310946725601034010210001100003010010000100001079035573638061363170120027012005412005411314631136715010030200100001000060200100001000012005412005111502011009910040100100001000001001000001100000001000010103210210722119658400021010121000040100120036120055120055120055120052
502041200548990000251001200361195131094672560103401021000110000301001000010000107903557363806136317012003401200351200351131463113671501003020010000100006020010000100001200541200511150201100991004010010000100000100100000110000000100001010321021072211966140002130121000040100120055120036120055120055120055
5020412003589900001310012003911951310946725601034010210001100003010010000100001081102573545561363170120027012005412005411314131136715010030200100001000060200100001000012005112005111502011009910040100100001000001001000001100000001000010003210210722119775400021313121000040100120055120055120055120055120055
50204120054899000010001200391195101094642560103401021000310000301001000010000107900857363806136317012002701200541200541131463113668501003020010000100006020010000100001200541200511150201100991004010010000100000100100000110000000100001010321021072211965840002131301000040100120052120036120055120036120052
502041200548990000110012003911951310946725601034010210001100003010010000100001079035573638061364700120027012005112005411319031136715010030200100001000060200100001000012005112005111502011009910040100100001000001001000001100000001000000103210210722119661400021310121000040100120055120055120036120055120052
502041200548990000110012003911951310946725601034010210001100003010010000100001078862573638061344610120030012003812005411321531136585010030200100001000060200100001000012005112003511502011009910040100100001000001001000000100000001000010103210210722119661400021013121000040100120036120055120055120055120055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002512005189900110010100120020119509109464256001340012100011000030010100001000010796295736236613366201200271200511200511131693113690500103002010000100006002010000100001200511200511150021109104001010000100001101000001100000001000011003140111079911967140002101091000040010120052120052120052120052120052
50024120051899000000101001200361195091094642560013400121000110000300101000010000107994457369566133873012002712003512005111315331136905001030020100001000060020100001000012005112005111500211091040010100001000001010000011000000010000110031401110710811966640002101091000040010120052120052120052120052120052
50024120051900000000101001200411195091094642560013400121000110000300101000010000107988157362366133662112002712005112005111316931136905001030020100001000060020100001000012005112005111500211091040010100001000001010000011000010010000110031401010710811966640002101091000040010120104120052120055120056120052
500241200518990000001010012003611950910946425600134001210001100003001010000100001080106573690861337651120027120051120051113153311369050010300201000010000600201000010000120051120051115002110910400101000010000010100000110000000100001100314071078711966640002101001000040010120052120052120052120052120052
500241200518990000001010012003611950910946425600134001210001100003001010000100001079944574607661337130120027120145120051113169311369050010300201000010000600201000010000120051120287715002110910400101000010000010100000110000000100001100314081078711966640002101091000040010120052120054120052120052120098
50024120051900000000101001200361195091094642560013400121000110000300101000010000107955757362366133662012001112005112005111316931136905001030020100001000060020100001000012005112005211500211091040010100001000001010000011000000010000110031401310710811966640002101091000040010120036120052120052120052120052
500241200358990000000000012002011950910946425600134001210001100003001010000100001080070573690861336621120027120051120051113169311369050010300201000010000600201000010000120051120051115002110910400101000010000010100000110000000100001100314010107101211966640002101091000040010120052120052120052120052120052
500241200548990000001010012003611962010947825600134001010001100003001010000100001079809573839661337640120027120051120051113169311369050010300201000010000600201000010000120051120051115002110910400101000010000010100000110000101810000110031401210781011966640002101091000040010120036120052120052120052120052
5002412005190000110010100120036119509109464256001340012100011000030010100001000010799715737628613376801200271201461201101133071411379450010300201000010000600201000010000120051120051115002110910400101000010000010100000110000007281100001100314010107131011966640002101091000040010120052120052120053120055120052
5002412005189900000013000012003611950910946625600134001210001100003001010000100001090924573681261338680120028120051120257113250281139085049430020100001000060020100001010812024012005131500211091040010100001000001010000011000010010000110031401210710711966740002101091000040010120052120052120052120052120052

Test 3: throughput

Count: 8

Code:

  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  ld1 { v0.16b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3343

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267362001011167022672137719258010010080000100800005001184702026711267362673616659316695801002008000020080000267362673611802011009901001008000080000010080020204380058029060800406159431915110116112673413135800001002673726737267382673726852
80204267382001110067132672137719258010010080000100800005001167762026711267362673616659316695801002008000020080000267372673611802011009901001008000080000010080019204380060024164800406159431915110116112673413135800001002673726737267382673726864
8020426746208100006602267212702025801001008000010080000500117243702671126736267361665931669480100200800002008000026736267371180201100990100100800008000001008002119438006000161800406158431925110116112673313135800001002673826737267372674026753
80204267432011010067032672127719258010010080000100800005001168818026711267362673616658316694801002008000020080000267362673611802011009901001008000080000010080021204380059037163800426158431915110116112673313135800001002673726737267372673826880
8020426744200100006602267212772025801001008000010080000500116852902671126736267361665931669580100200800002008000026736267361180201100990100100800008000001008002020438006000161800406158451915110116112673313135800001002673826737267372673826738
80204267992011001167022672229722258010010080000100800005001169868026711267362673616659316694801002008000020080000267372673711802011009901001008000080000010080020194380059024161800396158431915110116112673313135800001002673726737267372673826849
80204267432001110069032672127720258010010080000100800005001169868026711267362673616659316672801002008000020080000267372673611802011009901001008000080000010080020194380059141066800396159431905110116112673313135800001002673726737267372673726873
80204267432001100067132672427720258010010080000100800005001167644026711267362673616659316694801002008000020080000267362673611802011009901001008000080000010080019194380059022164800406158431925110116112673413135800001002673726737267382673726845
8020426743200111007803267213771925801001008000010080000500116736602671126736267361665831669480100200800002008000026736267361180201100990100100800008000001008001919438005900160800406159451915110116112673313135800001002673726737267382673726880
8020426742200110116603267223770258010010080000100800005001169868026711267362673616659316694801002008000020080000267372673711802011009901001008000080000110080020204380059039063800406159431915110116112673313135800001002673726737267372673726861

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 09 | 0e | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252672220010450022669320121225800101080000108000050116907902670226708267271666731668880010208000020800002670826708118002110910108000080000108000043800001819880035610050201161126719106080000102670926709267282672926776
80024267282000100012669321212025800101080000108000050116622412668326722267221667210167028001020800002080000267222670811800211091010800008000010800003980039211928003561354350201161126724010080000102672926709267092670926787
80024268572000045100267132121216258001010800001080000501167754126683267282672816672316708800102080000208000026722267081180021109101080000800001080000398000018458000061350502011611267051010480000102672826723267232670926803
8002426816200004500226713012016258001010800001080000501166750126683267272670816672316708800102080000208000026708267221180021109101080000800001080000398000045448003960354350201161126705010480000102670926729267232670926887
800242674220000010026693212002580010108000010800005011667500266832672826708166523166888001020800002080000267282670811800211091010800008000010800003980000494180000603543502011611267241010080000102670926729267092672826723
800242672220100450022671220016258001010800001080000501170881126683267272672716667316707800102080000208000026727267081180021109101080000800001080000080039320800390035050201161126725100480000102672926709267092672826767
8002426727204004500226712212121625800101080000108000050116684612668326727267271678231670780010208000020800002672226708118002110910108000080000108000043800353214780039600435020116112672400480000102672926728267092670926744
800242670820000451002671320180258001010800001080000501168216126702267282672216672316702800102080000208000026727267221180021109101080000800001080000080039244280039013543502011611267241010480000102672926709267282670926790
80024267282000045002267130121216258001010800001080000501167230126702267272672216680316707800102080000208000026728267221180021109101080000800001080000398003923380039613505020116112670560480000102672826709267292672826794
800242685520000000226712012121625800101080000108000050116762102670326708267221665231670880010208000020800002672226708118002110910108000080000108000008003923458000061043502011611267251010480000102670926729267292672826845