Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 1 reg, 4S)

Test 1: uops

Code:

  ld1 { v0.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
61005292072190300014011003010453228769102433510001000100050003015950286102927631010001000100029081292561161001100010000100003100000010012030012830913868503074104921691303338201547462831516505139311567110002933929271293512931329240
61004292852200240011000001700046302873500242981000100010005000601595328636293183101000100010002907629125116100110001000010000310000001000203001292291336886306874721768306438151346412832616420139331578710002927029262293762933229218
6100429289220024001800000300045552873600243361000100010005000601594528608292483101000100010002908729139116100110001000010000310010001001212001287491506824302074521746305938211646472838216365139791565610002923829202293652929729282
610042922622002611171110050104533288661024243100010001000500000159552867229214310100010001000291112909511610011000100001000031001000100020300128489160682830519452171630553822745442837516440138371570710002933529232293112933029310
6100429234219024001201100301045762880701242691000100010005000701598828620293583101000100010002914629140116100110001000010010210000001000213001286591886857305394221659309138211043442838516340136811579010002933429267293352925929311
6100429247219021001401100301046292871800242671000100010005000461596028704293163101000100010002911629169116100110001000010000310000001001300001295093886835304995021612305438181744442833116356138851575910002926729315293192938129348
6100429256220021001301110901045432878800242941000100010005000101594728577292973101000100010002905229170116100110001000010000010010001000002001289191566845307584721644308938192148462836416426137031569310002925729315292982936429312
6100429295219025001400000201046322878410242991000100010005000001594228738292823101000100010002909029130116100110001000010000310000041001202001281391176832308694321675306238211642432837416218137871558110002922929270292722917729283
61004292592200280016000003000452228776102421910001000100050002015963286252884831010001000100029171291421161001100010000100132100200410002030012894931968643061104521714307738171049462835516312137961566810002921129290292182928829264
6100429239219021001500000300045432880000242771000100010005000301595228598292303101000100010002909729100116100110001000010000310000011000203001292490646877306384721627307338231948472834616319137251567010002929329411293262933429288

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.4s }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205120051899000112000120042119520109470256010640102100011000030100100001000010790895735750613662301200361200601200571131550311363050100302001000010000602001000010000120057120057115020110099100401001000010000010010000011000100110000110110032101107111196644000200121000040100120042120061120058120058120058
50204120057899100111001120042119519109455256010640104100021000030100100001000010790625736524613345811200331200571200571131520311363050100302001000010000602001000010000120041120057115020110099100401001000010000110010001001000000010000101000032101101111196614000201091000040100120052120036120052120052120052
50204120051900001002001120042119516109470256010340104100021000030100100001000010790625736668613662311200171200601200411131520311367450100302001000010000602001000010000120041120057115020110099100401001000010000010010000011000000010000111100032101107111196644000401391000040100120058120042120058120058120058
502041200578991100020001200361195131094492560103401021000110000301001000010000107900857362366136317112003312005712005711315203113674501003020010000100006020010000100001200571200571150201100991004010010000100000100100000110000000100001111000321011351111966440004101091000040100120092120058120055120055120055
502041200548990000001001200361194931094952560103401021000110000301001000010000107886257362366136971112003012003512005111313703113668501003020010000100006020010000100001200511200351150201100991004010010000100000100100000110000000100001010000321011011111965840000101091000040100120036120052120052120056120038
50204120051900101001100120039119510109467256010340102100011000030100100001000010798445736764613662301200331200581200611131370311367750100302001000010000602001000010000120057120041115020110099100401001000010000010010000011000000010000111100032101135111196644000401091000040100120058120058120058120061120042
50204120057900111002000120042119516109470256010640104100021000030100100001000010793855737340613693001200331200411200601131520311367750100302001000010000602001000010000120057120041215020110099100401001000010000010010000011000000010002011100132101107111196544000401391000040100120058120058120058120058120058
502041200579001000020001200421195131094732560103401041000210000301001000010000107912457365246136623012003612005712005711315503113674501003020010000100006020010062100001200511200511150201100991004010010000100000100100000110000000100001101000321011071111966440004101391000040100120042120042120061120058120058
50204120041899110002001120039119510109464256010340102100011000030100100001000010794585737244613698701200331200601201101131520311363050100302001000010000602001000010000120060120057115020110099100401001000010000010010002111000201110000111100032101135111196674000410091000040100120042120042120042120061120061
502041200418991001021011200421195131094702560103401041000210000301001000010000107957957369086136623012003312005712005711315203113674501003020010000100006020010000100001200541200511150201100991004010010000100000100100000010000000100001111000321011071111965840002101091000040100120052120055120052120037120052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | interrupt pending (6c) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500251200478990010001200351195081094632560013400101000110000300101000010000107951757361886133499012002601200501200501131683113689500103002010000100006002010000100001200501200471150021109104001010000100001101000001100000010000101031401107011119665400029681000040010120048120051120051120051120051
5002412005089900100012003511950810946125600134001210001100003001010000100001079548573618861340631120026012004712004711316531136895001030020100001000060020100001000012005012004711500211091040010100001000011010000001000000100001010314011070111196654000091601000040010120051120051120036120051120051
500241200508990010001200351194951094552560025400331000110000300101000010000107954857354556133499012002601200501200551132703113689500103002010000100006002010000100001200501200471150021109104001010000100000101000001100000010000100031401107011119665400029981000040010120051120051120051120051120036
5002412005089900120001200321195051094632560016400101000110000300101000010000107954857361886133499012002301200511200501131533113689500103002010000100006002010000100001200541200501150021109104001010000100001101000001100000010000101031401107011119662400006901000040010120051120051120051120051120051
500241200509000061001200351194951094492560013400121000110000300101000010000107952357361886133499012002301200501200501131533113689500103002010000100006002010000100001200471200351150021109104001010000100001101000000100000010000101031401107021119662400029981000040010120048120048120048120048120048
500241200478990010001200321195051094612560013400121000010000300101000010000107952357360446133499012002301200471200471131653113686500103002010000100006002010000100001200501200471150021109104001010000100000101000001100000010000100031401107011119665400029651000040010120048120048120048120036120048
500241200358990010001200351195081094612560013400121000110000300101000010000107954857361886133499012002701200351200501131653113689500103002010000100006002010000100001200501200471150021109104001010000100001101000001100002010000101031401107011119665400020981000040010120049120048120048120048120048
500241200478990010001200351195051094612560013400121000110000300101000010000107951757360446133499012002601201581202891133573113694500103002010106100006002010000100001200501203801150021109104001010000100000101000001100000732510000101031401107012119662400006651000040010120480120206120049120049120048
500241200478990170001200331195051094612560013400121000110000300101000010000107952357360446132757012001101200471200471131653113686500103002010000100006002010000100001200471200471150021109104001010000100000101000001100010010000101031401107011119650400009601000040010120051120051120051120056120051
500241200538991000001200351195081094632560013400121000010000300101000010000107952357361886133499012002301200471200501131653113689500103002010000100006002010000100001200351200471150021109104001010000100000101000001100000110000100031401107011119665400020081000040010120051120051120048120051120051

Test 3: throughput

Count: 8

Code:

  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  ld1 { v0.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802052673220011111065132669920021258010010080000100800165001167763026689267142673216642616684801162008002420080024267322671411802011009910010080000800000100800003980000204280039605742192511021611267290994800001002670826728267282670826728
80204267222000000004512267120181216258010010080000100800005001167231126702267272672716650316665801002008000020080000267272670711802011009910010080000800001100800003980000000800390135000511011611267040060800001002672826710267282670826708
8020426727200000000710026712212062580100100800001008000050011665251266972671126726166503166858010020080000200800002672726707118020110099100100800008000001008000008003900398003961043005132116112672400100800001002700626731267322673026876
8020426737200000000450226712200025801001008000010080000500117703812669726726267261665031666580100200800002008000026727267221180201100991001008000080000010080000398003510398003961043005110116112672401064800001002680626730267372672826708
80204267272000000004512267120121202580100100800001008000050011666601267022672726727166303166858010020080000200800002672926722118020110099100100800008000001008000039800351008003501394300511011611267240060800001002673126719267232671826708
802042673120000000045022669220016258010010080000100800005001166525026702267322671116630316685801002008000020080000267072672211802011009910010080000800001100800003980000103980040613939005110116112671901060800001002672826708267232672326714
8020426727200000000012266923121212258010010080000100800005001167231126702267072671116630316665801002008000020080000267072672211802011009910010080000800000100800003980039104280000013539005110116112672401062800001002673826755267332673126732
802042672920000000157002671001801625801001008000010080000500116723102670226730267221665491693380100202800002008019326731268571180201100991001008000080000110080000398000000358000061390005110116112672401060800001002673926738267282672826728
8020426727200000000451226707012024258010010080000100800005001166818126706267272672216649316685801002008000020080000267072672211802011009910010080000800001100800000800000008000061390005110116112670401060800001002672826708267282672826708
8020426727200000000410226712218181625801001008000010080000500117703812669726722267071665031668080100200800002008000026722267261180201100991001008000080000110080000398003900398003961350005110116112700101000800001002673226732267282672826733

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526730200100000046101267002121162580010108000010800005011675011126702026731267311667631670780010208000020800002670826728118002110910108000080000110800000430800390478003861394405020816075267241410480000102673226732267322672826732
80024267312010000000440012671621119258001010800001080000501167201002670302673126731166763167078001020800002080000267312670811800211091010800008000001080000043080038038800396139440502091607526724140780000102673226732267322673226732
80024267282000000100440012672521102580010108000010800005011685550126706026732267311667631671180010208000020800002673126727118002110910108000080000010800000430800390388003861394305020716057269571410480000102672926735267322672926728
80024267272000000000440012671621119258001010800001080000501167501012670302673126731166723167118001020800002080000267312672711800211091010800008000001080000043080038039800386139440502071606726724014780000102673226728267282672826728
80024267082000000000450012671321119258001010800001080000501167201002670602673126727166893167118001020800002080000267272672711800211091010800008000001080000043080039038800386139440502051605726725014780000102673226732267322673226732
80024267312000000000440012671321102580010108000010800005011671240026706026727267311667631671180010208000020800002672726728118002110910108000080000010800000430800380388003861394405020716057267241010480000102672826709267322673226728
800242672720100000004400126716011192580010108000010800005011671240126706026731267311667631671180010208000020800002672726727118002110910108000080000010800000430800380388003861394405020816089267251410780000102670926709267092672926709
80024267312000000000451012671221116258001010800001080000501167124012670602673526731166523167118001020800002080000267272672811800211091010800008000001080000043080038008003861394405020716087269851410780000102673826729267352674826729
800242673120000000004400126716212119258001010800001080000501168843012668302670826708166763167118001020800002080000267312672811800211091010800008000001080000043080038038800396104405020716058267241414780000102673226732267322673226732
8002426731200100100053101267162121219258001010800001080000501167501012670602672726728166723166888001020800002080000267312672811800211091010800008000011080000000801690428003901394405020716058267251414780000102673226732267282673226732