Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, one register, 4H)

Test 1: uops

Code:

  ld1 { v0.4h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
61005293182200160121000500046132874900243001000100010005001500159602856329327310100010001000290892902511610011000100011004131000001100021300128509212684030309452158830343811737372836616290135651553110002923429418291482922129356
61004292312201201014111110045972870500242801003100010005001300159592872429172310100010001000290282922111610011000100001002131001214100021012137189237685531277432162830923812639392835216494136571560810002930929434292352920829371
61004293162191150014100410047002870900241851000100010005000400159712855929170310100010001000290372906711610011000100001003131002011100031311134469255688033657442158432843819941412833116095137001573010002931629370291262920929212
6100429215219117101810017100460728671002426510001000100050004001597728853293163101000100010002904029051116100110001000010021310010011000213121284294056873304484121512307638111140412840016362137591554210002927529474292682934629160
61004293792191171018100510050022879600241911000100010005000500159702855729239310100010001000290742909211610011000100001000031001000100020200127619199685630929482170230903814739342837116258137191573010002923629502291792920429260
61004291812190170014000200047282872510242321000100010005000400159672890429237310100010001000290702918111610011000100001000031000000100121300129479318685731118422161730953813439362877316255131131573610002931329375291612927629276
6100429249219017001700000005000287731024213100010001000500050015949284402917731010001000100029041290941161001100010000100003100100010002030012930917071633095113721561309738101136352832216314137921574610002924329451293082930229238
610042927421801700150003100455928729002423910001000100050002001595628554292283101000100010002920728987116100110001000010000210001001000213001366793156855308311422165531263811737382847916299138911509010002927829498292302934429170
61004292592190180019000310045642872300242381000100010005000600159532853129212310100010001000291242904011610011000100001000031000300100020300129109307687633348392150331593811739382834216260136571562910002924429436292452913829150
61004292732190180019011310046062875900242071000100010005000600159572861529268310100010001000291072908111610011000100011000021000000100020200129989156690331299362153230723809740342832716185137351548910002924029357292042924129255

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.4h }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0057

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5020512005789911110001111120042119516109470256010640104100021000030100100001000010790625736524613662311200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000110010002311000101110000111120321011071111966440004101091000040100120058120058120058120058120058
50204120057899111110021112004211951610947025601064010410002100003010010000100001079062573652461366231120034012011312006011315231136745010030200100001000060200100001000012004112005711502011009910040100100001000001001000221100040111000011110032101107111196644000401091000040100120058120058120058120061120058
502041200578991120000211120042119516109470256010640104100021000030100100001000010790625736524613662311200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000010010002111000200110000111110321011071111966440004101091000040100120058120058120058120058120058
502041200578991010000211120042119517109470256010640104100021000030100100001000010791165736524613662301200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000010010003211000101110000111120321011071111966440004101091000040100120058120058120058120058120058
50204120057899111000021112004211951610947025601064010410002100003010010000100001079062573652461366231120033012005712005711315231136745010030200100001000060200100001000012005712005711502011009910040100100001000011001000221100020011000011112032101107111196644000401091000040100120058120058120058120058120058
502041200578991110000211120042119516109470256010640104100021000030100100001000010790625736524613662301200333120041120057113178311367450100302001000010000602001000010000120057120057115020110099100401001000010000110010002211000201110000111100321011071111966440004101091000040100120058120058120058120058120058
502041200578991110000211120042119516109470256010640104100021000030100100001000010790625736524613662311200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000010010002111000101110000111110321011071111966440004101091000040100120058120058120058120058120058
502041200578991100000211120042119516109470256010640104100021000030100100001000010790625736524613662301200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000010010002211000200110000111100321011071111966440004101091000040100120058120058120058120058120058
502041200578991000000211120042119516109470256010640104100021000030100100001000010790625736524613662301200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000010010002211000300110000111110321011071111966440004101091000040100120058120058120058120058120058
50204120057899111000021112004211951610947025601064010410002100003010010000100001079062573652461366231120033012005712005711315231136745010030200100001006460200100001006412006412005711502011009910040100100001000011001000121100030111000011110032101171111966440004101091000040100120058120058120058120058120058

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire (01) | cycle (02) | 03 | 07 | 08 | 09 | 0a | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
500251200478990010010060264000120417119492109463256001340012100011000030010100001000010795485736188613349901200261200471200501131683113674500103002010000100006002010000100001200501200471150021109104001010000100001101000000100000001000511003140510724119941400029001000040010120051120036120048120051120051
500241200508990001000000000120330119508109461256001340010100001000030010100001000010795485736188613349901200231200501200351131683113689500103002010000100006002010000100001200351200471150021109104001010000100000101000001100001061000311003140410744119669400009651000040010120036120051120036120036120036
500241200508990000111410100120035119492109463256001340012100001000030010100001000010795175735455613349901200261200501200501131683113674500103002010000100006002010000100001200351200471150021109104001010000100001101000001100001091000011003140410724119650400029051000040010120051120051120048120051120036
50024120050899011000004540000120037119505109449256001040012100011000030010100001000010795235735455613349901200231200501200501131533113689500103002010000100006002010000100001200351200351150021109104001010000100001101000001100001001000000003140310726119662400029981000040010120039120048120417120051120051
50024120050899000000000000012006211950810946325600104001210001100003001010000100001079517573618861334990120026120050120035113168521136885001030020100001000060020100001000012042912004811500211091040010100001000001010000011000000143341000001203140410743119665400020901000040010120051120051120051120051120051
500241200508990000010000100120032119492109463256001040012100011000030010100001000010795175736044613365201200231200351200501131683113686500103002010000100006098810000100001200351200471150021109104001010000100001101000001100000031000011003140410734119665400029901000040010120036120036120036120039120036
500241200358990000000000100120037119615109463256001340012100001000030010100001000010795485735455613349901200121200501200351131683113686500103002010000100006002010000100001200501200471150021109104001010000100000101000001100000031000011003140410745119652400029981000040010120051120051120048120036120051
500241200508990000000010100120056119752109449256001040012100001000030010100001000010795575744395613375901200111200501200351131683113689500103002010000100006002010000100001200501200471150021109104001010000100001101000001100000001000011003140210724119971400029951000040010120051120051120051120052120036
5002412005089901000000100001204071194921094492560061400121000010000300101000010000107954857361886132757012001112005012003511315331136895001030020100001000060020100001000012040512004711500211091040010100001000011010000001000010105871000001003140410734119662400029681000040010120051120051120052120038120036
500241200508990000000010100120070119505109463256001340010100011000030010100001000010795175736188613275701200261200501200501131653113674500103002010000100006002010000100001200501201352150021109104001010000100000101000000100001031000001003140210744119662400026651000040010120036120036120051120051120051

Test 3: throughput

Count: 8

Code:

  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020526736200001145001267332112192580100100800001008001350011671981267062672726727166596166838011520080024200800242672726731118020110099100100800008000001008000043800380039800386139431115118016002672814104800001002673226736267322670826732
8020426727200000044001267260121192580100100800001008001650011683801267062673126731166596166838011520080024200800242672726730118020110099100100800008000001008000043800380053800396139441115118016002670410147800001002673226732267082673226728
80205267272000000440012671601119258010010080000100800155001167303026706267312673116659616683801152008002420080024267312672711802011009910010080000800000100800000800390039800386139431115118016002672814107800001002672826732267282673226732
802042673120000004400026716212119258010010080000100800155001166596026706267312670716659616683801132008002420080024267312672711802011009910010080000800000100800004380038003880038613901115118016002672410147800001002673226732267322673226732
802042672720000004400126824312119258010010080000100800155001166596026706267312673116654316689801002008000020080000267312672711802011009910010080000800000100800004380038003880038610440005110116112673114147800001002673226743267322673226728
8020426727200000031002672201119258010010080000100800005001167127026706267312673116654316689801002008000020080000267072672711802011009910010080000800000100800004380038004180039610440005110116112672814140800001002673226728267322673226732
8020426731200000044101267160121192580100100800001008000050011671271266822673126707166543166858010020080000200800002673126707118020110099100100800008000001008000043800385047800386139440005110116112672814104800001002673226732267322672826732
8020426731200000064310126722212119258010010080000100800005001167231026706267272673116654316689801002008000020080000267272670711802011009910010080000800000100800004380039110800396139440005110116112672814107800001002670826732267282673226730
802042673120000004410126715011219258010010080000100800005001168312026706267312673116654316685801002008000020080000267312670711802011009910010080000800000100800014380038003880038600440005110116112672810140800001002672826732267322673226732
8020426731200000044101267232110258010010080000100800005001165789026706267312672716650316689801002008000020080000267312672711802011009910010080000800000100800004380038003880039613900005110116112672814107800001002673226732267322673226732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 08 | 0a | 0e | 0f | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d2 | icache miss (d3) | d5 | d6 | d9 | da | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002526722200000104500026713201816258001010800001080000501166750112668326728267281665231670280010208000020800002672726722118002110910108000080000010800003980039035800396035050200015160010132672406280000102670926723267092670926709
80024267222000001041002266942120025800101080000108000050116884300267032672826728166523167078001020800002080000267082670811800211091010800008000001080000398003903980035600050200013160012122671900080000102672926728267092672926728
800242672720000010000026712012016258001010800001080000501166750012670226708267271665231670780010208000020800002672826708118002110910108000080000110800003980000168003901353950200012160011122671960080000102672926709267232670926729
800242670820000000450012683221812025800101080000108000050116760500266972672726728166723167078001020800002080000267082672211800211091010800008000001080000398003510800350100502000111600121226724010280000102672926729267092670926709
8002426729200000000102267160001225800101080000108000050116675001266972670826708166723167088001020800002080000267082671211800211091010800008000001080000438000003580039610435020001216001062671966280000102672826709267282670926729
800242670820000000234001266930121216258001010800001080000501168843012668326728267081667231670780010208000020800002672226722118002110910108000080000010800000800000398003901390502000121600111226719610280000102672826728267232670926728
80024267282000000000022671301212025800101080000108000050116884301266832672826727166523166888001020800002080000267272672211800211091010800008000001080000398003503980039003943502000121600121026719100080000102670926709267282672326709
800242670820000100000026693001211258001010800001080000501166894002670326708267221665231670280010208000020800002672726708118002110910108000080000010800003980000108003961043502000121600121026719106280000102672926709267092670926728
8002426727200000004500226713001216258001010800001080000501167605002670326727267081665231670280010208000020800002670826722118002110910108000080000010800003980000039800350035395020009160011926812106480000102670926730267092672626729
80024267082000000041002267202181292580010108000010800005011668960026703267082672216672316707800102080000208000026727267161180021109101080000800000108000039800350398003560354350200010160010626719610480000102672826728267282670926709