Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 1 reg, 2S)

Test 1: uops

Code:

  ld1 { v0.2s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
610052923122012110700017104470288400024213100010001000500010015948285862929731010001000100029120291221161001100010001100233100200111000213111294394206846314511552162731933818646422837116185134501575810002929229403294032925829240
6100429206220118011310011045262873400241901000100010005000300159432849729223310100010001000291002917411610011000100011003301002002110003131212757939769333033114621525312138171241472857016364137311572010002929229243292682924529294
6100429280219115111511031104600288010024308100010001000500040015967286182927731010001000100029040290601161001100010000100132100200011001223111324092226836307084921621314538111139502835016214137551577410002922329297292932923529323
610042931921911611150008004543288340024275100010001000500020015964285432927331010001000100029034291291161001100010000100322100201111000213111287693636807303610362163030903815342402852716219135771578210002926729356293392926929256
6100429220219113111011028104580286720024180100010001000500030515964286992930031010001000100029204291251161001100010000100333100300111000210111282492216832306713502166530763817847472833216166134911576010002921629329293022930029219
610042920621811811151001104726286980024178100010001000500020015945286342924431010001000100029014291071161001100010000100232100200011000310121280590326821304084021592312338221144352837216373136681511910002920929316291962919929237
610042929722011311121001310465328697002423210001000100050004001595128526292823101000100010002902929117116100110001000010033210020121100001310128459181687030676382159330463816841432840616175138961568710002924929283292292920629322
6100429325218112111610038104551287990024284100010001000500020015948286082926131010001000100029166290621161001100010000100220100200111000212131313694596839311110422150930803817542392842116373137101551810002929429302291902925829240
61004291712191201115000710456828770002419810001000100050006001594628541292633101000100010002911129086116100110001000010022310011011100032312132279460685930399422156230473817941422846015983136111575110002924929303293112921729219
610042924221911311161114004625287860024260100010001000500040015950286042922231010001000100029106290471161001100010001100223100100211100001311131459517694330019382159732083814838342837816386134001544410002910929217292132928429196

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.2s }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205120051899100011001200361195101094642560103401021000110000301001000010000107900857362366136317112003201200511200511131413113671501003020010000100006020010000100001200511200511150201100991004010010000100000100100001100000010000110032372107221196584000201091000040100120036120055120055120055120055
50204120054899000071001200391195131094672560103401021000110000301001000010000107903557362366136317112003301200511200511131493113671501003020010000100006020010000100001200541200511150201100991004010010000100000100100001100000010000010032102107221196584000201391000040100120052120055120055120052120054
502041200518990000110012003611951010944925601034010210001100003010010000100001079008573623661363170120031012005412005111314931136715010030200100001000060200100001000012005112005111502011009910040100100001000001001000001000006100001100321021012211964640002101091000040100120036120052120055120052120055
502041200548990000010012002011951010946725601034010210001100003010010000100001079035573638061344611120045012005412005111314631136685010030200100001000060200100001000012003512005111502011009910040100100001000001001000011000000100000100321021072211965840002101091000040100120052120055120052120055120052
502041200359000000100012003911951310946725601034010210001100003010010000100001079035573545561363171120056012005412005111314131136685010030200100001000060200100001000012005412005111502011009910040100100001000001001000011000000100001100321021072211964640002101091000040100120055120052120055120055120052
5020412005489900001000120039119513109464256010340102100011000030100100001000010790355736236613631711200330120054120051113149311366850100302001000010000602001000010063120051120051115020110099100401001000010000010010000110000001000011003210210722119661400021310121000040100120036120055120052120055120036
502041200518990000110012002011951310946725601034010210001100003010010000100001079008573638061363171120040012005112005111314131136715010030200100001000060200100001000012005412005111502011009910040100100001000001001000011000000100001100321021072211965840002101391000040100120055120052120055120052120052
502041200519000000110012003911951010944925601034010010001100003010010000100001079035573638061363171120032012005112005111315331136715010030200100001000060200100001000012005112005111502011009910040100100001000001001000011000000100001100321021072211966140002131391000040100120052120055120052120055120036
502041200548990000100012003911951310946725601034010210002100003010010000100001079035573545561363171120054012005112005111314631136685010030200100001000060200100001000012005112005111502011009910040100100001000001001000001000010100001100321021072211965840002101091000040100120055120052120052120055120052
502041200548990000100012003911951310946425601034010210001100003010010000100001079035573545561344611120060012005112005411314631136685010030200100001000060200100001000012005112005111502011009910040100100001000001001000011000050100001100321021072211965840002101091000040100120052120055120055120052120055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 49 | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500251200478991110112100012003211119513109461256001640012100011000030010100001000010794835736044613337401200191200431200431131733113682500103002010000100006002010000100001200471200431150021109104001010000100001101000001100070041000111100031401210778119665400026601000040010120056120056120044120056120048
50024120047900000100210001200321111951310946825600134001410001100003001010000100001079593573604461333730120019120055120135113173311368650010300201000010000600201000010000120055120055115002110910400101000010000010100010210001001100010110003140610778119658400026651000040010120056120059120427120056120044
500241200478990011002000012003200119501109521256001640012100011000030010100001000010795935736428613332201200311200551200471131613113682500103002010000100006002010000100001200471200471150021109104001010000100001101000002100070011000001000031408107139119658400026651000040010120054120435120056120044120044
5002412005990200100011000120028101195081094612560016400121000210000300101000010000107952357358486133499012002312004312004711316553113682500103002010000100006002010000100001200551200471150021109104001010000100000101000002100010012100011110003140810788119670400046651000040010120044120044120048120056120056
500241200489020011001310001200321011950510946825600164001210002100003001010000100001079483573604461334990120019120043120047113165311368650010305011000010000600201000010000120055120043115002110910400101000010000010100000110000001100010100003140810788119670400020651000040010120056120048120048120048120044
50024120043899001100110001200281111951310955325600164001410001100003001010000100001079483573604461338660120019120047120047113174311368250010300201000010000600201000010000120055120047115002110910400101000010000010100060010001000100011110003140810778119660400026601000040010120056120048120048120044120048
500241200579020011002100012002811119513109468256001340012100011000030010100001000010795235736188613386601200191200551200431131735411368750010300201000010000600201000010000120047120047115002110910400101000010000010100000210000001100001110003140810798119658400029651000040010120048120044120048120048120044
5002412004389900010041000120040101195051094612560013400141000210000300101000010000107952357364286133322012002312004712005511317317114064512223002010000100006002010215100001200491200551150021109104001010000100000101001122100000041000011100031409107109119658400040981000040010120161120044120048120048120048
500241200438990010001000012003510119508109457816001640012100011000030010100001000010795935735848613386611200231200431200471132153113689500103002010000100006002010000100001200431200921150021109104001010000100000101000000100011012100011010003140810778119658400040651000040010120044120048120048120048120048
50024120043899001000100001200280011950110946825600134001210002100003001010000100001079483573618861338661120023120047120043113161311368650010300201000010000603981000010000120052120152115002110910400101000010000110100000010001000100011100003140810788119670400026651000040010120048120056120044120048120049

Test 3: throughput

Count: 8

Code:

  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802052672320100004101012670721818122580100100800001008000050011678081267112672226722166453166808010020080000200800002670726722118020110099100100800008000011008000003908003500035800356135390051101161126719662800001002670826723267232672326723
802042672220000004100012670721818122580100100800001008000050011678081266972684126729166453166808010020080000200800002672226722118020110099100100800008000001008000003908003500035800356135390051101161126719662800001002672326723267232672326723
8020426707200000000101267120181812258010010080000100800005001167808126698267072672216645316680801002008000020080000267222672211802011009910010080000800000100800000390800350003580035613500051101161126719662800001002672326723267232672326723
80204267222000011410000267072181812258010010080000100800005001167808126698267222672216645316680801002008000020080000267222670711802011009910010080000800000100800000008003500038800356135390051101161126719602800001002672426728267232672326725
80204267222000000410101267083018025801001008000010080000500116652512669826722267221664531666580100200800002008000026722267221180201100991001008000080000010080000039080035000358000061353900511011611267041062800001002672326708267232672326723
8020426722200000042000026707201212258010010080000100800005001167808126697267222672216630316665801002008000020080000267332673311802011009910010080000800000100800201942080057101588003861574219151101161126729992800001002673326733267342673326733
802042671420011006501032671801818162580100100800001008000050011674011267072673226732166553166908010020080000200800002673326733118020110099100100800008000001008000003908003500035800356135390051101161126704662800001002672326723267232672326723
8020426722200001141000226707201812258010010080000100800005001167808126697267222672216645316680801002008000020080000267222672211802011009910010080000800000100800000390800350003580035613500051101161126719662800001002672826708267232672326723
80204267222000001410001267070180122580100100800001008000050011657891266972672226722166453166808010020080000200800002672226722118020110099100100800008000001008000003908003500035800356135390051101161126719662800001002672326723267232672326723
80204267222000000410001267072181802580100100800001008000050011665251266972672226707166453166808010020080000200800002670726722118020110099100100800008000001008000003908003500035800356135390051101161126719062800001002672326708267232672326723

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267222000000004101002670720016258001010800001080000501167605126697267272672716652316702800102080000208000026708267221180021109101080000800000108000008003903980039610390050202162226719106480000102670926709267092672326729
8002426708200000000000022671221801625800101080000108000050116884312669726727267081666731670780010208000020800002672226722118002110910108000080000010800003980039039800356135390050202162226705106480000102670926709267092672926709
80024267272000011008301002671201201625800101080000108000050116675012670226708267271665231670880010208000020800002670826722118002110910108000080000010800003980000039800390139430050202162226719106480000102672926723267282672926728
8002426708200000100450102267122121816258001010800001080000501168843126702267082672716668316708800102080000208000026728267221180021109101080000800000108000039800390428003961354300502021622267251010280000102670926728267292672926728
8002426727199000000450001266930012025800101080000108000050116884312668326708267271666731670880010208000020800002672826708118002110910108000080000010800003980035039800396135000502021622267301010280000102672926709267282672926729
80024267272000000000000226889212016258001010800001080000501168843126702267272672716652316688800102080000208000026722267221180021109101080000800000108000008000000800350039000502021622267241010480000102670926729267232670926729
8002426727200000000450102267132121216258001010800001080000501166750126702267282672716652316708800102080000208000026708267221180021109101080000800000108000039800350398003961350005020216222670506080000102672926709267292672926728
80024267272000010001200102266932120162580010108000010800005011676050266832672726727166723167088001020800002080000267282670811800211091010800008000001080000398000003980000613943005020216222672460480000102672926728267092672826709
800242670820000000041000226693012121625800101080000108000050116688612669726728267281665231670780010208000020800002670826722118002110910108000080000010800003980039043800006104300502021622267241010480000102670926728267282672326709
800242670820000110000002266932121203580010108000010800005011668861267022670826727166723166888001020800002080000267272672211800211091010800008000001080000080039035800390135000502021622267051010080000102670926723267282672826729