Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 1 reg, 4H)

Test 1: uops

Code:

  ld1 { v0.4h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
61005293182200160121000500046132874900243001000100010005001500159602856329327310100010001000290892902511610011000100011004131000001100021300128509212684030309452158830343811737372836616290135651553110002923429418291482922129356
61004292312201201014111110045972870500242801003100010005001300159592872429172310100010001000290282922111610011000100001002131001214100021012137189237685531277432162830923812639392835216494136571560810002930929434292352920829371
61004293162191150014100410047002870900241851000100010005000400159712855929170310100010001000290372906711610011000100001003131002011100031311134469255688033657442158432843819941412833116095137001573010002931629370291262920929212
6100429215219117101810017100460728671002426510001000100050004001597728853293163101000100010002904029051116100110001000010021310010011000213121284294056873304484121512307638111140412840016362137591554210002927529474292682934629160
61004293792191171018100510050022879600241911000100010005000500159702855729239310100010001000290742909211610011000100001000031001000100020200127619199685630929482170230903814739342837116258137191573010002923629502291792920429260
61004291812190170014000200047282872510242321000100010005000400159672890429237310100010001000290702918111610011000100001000031000000100121300129479318685731118422161730953813439362877316255131131573610002931329375291612927629276
6100429249219017001700000005000287731024213100010001000500050015949284402917731010001000100029041290941161001100010000100003100100010002030012930917071633095113721561309738101136352832216314137921574610002924329451293082930229238
610042927421801700150003100455928729002423910001000100050002001595628554292283101000100010002920728987116100110001000010000210001001000213001366793156855308311422165531263811737382847916299138911509010002927829498292302934429170
61004292592190180019000310045642872300242381000100010005000600159532853129212310100010001000291242904011610011000100001000031000300100020300129109307687633348392150331593811739382834216260136571562910002924429436292452913829150
61004292732190180019011310046062875900242071000100010005000600159572861529268310100010001000291072908111610011000100011000021000000100020200129989156690331299362153230723809740342832716185137351548910002924029357292042924129255

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.4h }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0057

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020512005789911110001111120042119516109470256010640104100021000030100100001000010790625736524613662311200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000110010002311000101110000111120321011071111966440004101091000040100120058120058120058120058120058
50204120057899111110021112004211951610947025601064010410002100003010010000100001079062573652461366231120034012011312006011315231136745010030200100001000060200100001000012004112005711502011009910040100100001000001001000221100040111000011110032101107111196644000401091000040100120058120058120058120061120058
502041200578991120000211120042119516109470256010640104100021000030100100001000010790625736524613662311200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000010010002111000200110000111110321011071111966440004101091000040100120058120058120058120058120058
502041200578991010000211120042119517109470256010640104100021000030100100001000010791165736524613662301200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000010010003211000101110000111120321011071111966440004101091000040100120058120058120058120058120058
50204120057899111000021112004211951610947025601064010410002100003010010000100001079062573652461366231120033012005712005711315231136745010030200100001000060200100001000012005712005711502011009910040100100001000011001000221100020011000011112032101107111196644000401091000040100120058120058120058120058120058
502041200578991110000211120042119516109470256010640104100021000030100100001000010790625736524613662301200333120041120057113178311367450100302001000010000602001000010000120057120057115020110099100401001000010000110010002211000201110000111100321011071111966440004101091000040100120058120058120058120058120058
502041200578991110000211120042119516109470256010640104100021000030100100001000010790625736524613662311200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000010010002111000101110000111110321011071111966440004101091000040100120058120058120058120058120058
502041200578991100000211120042119516109470256010640104100021000030100100001000010790625736524613662301200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000010010002211000200110000111100321011071111966440004101091000040100120058120058120058120058120058
502041200578991000000211120042119516109470256010640104100021000030100100001000010790625736524613662301200330120057120057113152311367450100302001000010000602001000010000120057120057115020110099100401001000010000010010002211000300110000111110321011071111966440004101091000040100120058120058120058120058120058
50204120057899111000021112004211951610947025601064010410002100003010010000100001079062573652461366231120033012005712005711315231136745010030200100001006460200100001006412006412005711502011009910040100100001000011001000121100030111000011110032101171111966440004101091000040100120058120058120058120058120058

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500251200478990010010060264000120417119492109463256001340012100011000030010100001000010795485736188613349901200261200471200501131683113674500103002010000100006002010000100001200501200471150021109104001010000100001101000000100000001000511003140510724119941400029001000040010120051120036120048120051120051
500241200508990001000000000120330119508109461256001340010100001000030010100001000010795485736188613349901200231200501200351131683113689500103002010000100006002010000100001200351200471150021109104001010000100000101000001100001061000311003140410744119669400009651000040010120036120051120036120036120036
500241200508990000111410100120035119492109463256001340012100001000030010100001000010795175735455613349901200261200501200501131683113674500103002010000100006002010000100001200351200471150021109104001010000100001101000001100001091000011003140410724119650400029051000040010120051120051120048120051120036
50024120050899011000004540000120037119505109449256001040012100011000030010100001000010795235735455613349901200231200501200501131533113689500103002010000100006002010000100001200351200351150021109104001010000100001101000001100001001000000003140310726119662400029981000040010120039120048120417120051120051
50024120050899000000000000012006211950810946325600104001210001100003001010000100001079517573618861334990120026120050120035113168521136885001030020100001000060020100001000012042912004811500211091040010100001000001010000011000000143341000001203140410743119665400020901000040010120051120051120051120051120051
500241200508990000010000100120032119492109463256001040012100011000030010100001000010795175736044613365201200231200351200501131683113686500103002010000100006098810000100001200351200471150021109104001010000100001101000001100000031000011003140410734119665400029901000040010120036120036120036120039120036
500241200358990000000000100120037119615109463256001340012100001000030010100001000010795485735455613349901200121200501200351131683113686500103002010000100006002010000100001200501200471150021109104001010000100000101000001100000031000011003140410745119652400029981000040010120051120051120048120036120051
500241200508990000000010100120056119752109449256001040012100001000030010100001000010795575744395613375901200111200501200351131683113689500103002010000100006002010000100001200501200471150021109104001010000100001101000001100000001000011003140210724119971400029951000040010120051120051120051120052120036
5002412005089901000000100001204071194921094492560061400121000010000300101000010000107954857361886132757012001112005012003511315331136895001030020100001000060020100001000012040512004711500211091040010100001000011010000001000010105871000001003140410734119662400029681000040010120051120051120052120038120036
500241200508990000000010100120070119505109463256001340010100011000030010100001000010795175736188613275701200261200501200501131653113674500103002010000100006002010000100001200501201352150021109104001010000100000101000000100001031000001003140210744119662400026651000040010120036120036120051120051120051

Test 3: throughput

Count: 8

Code:

  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  ld1 { v0.4h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526736200001145001267332112192580100100800001008001350011671981267062672726727166596166838011520080024200800242672726731118020110099100100800008000001008000043800380039800386139431115118016002672814104800001002673226736267322670826732
8020426727200000044001267260121192580100100800001008001650011683801267062673126731166596166838011520080024200800242672726730118020110099100100800008000001008000043800380053800396139441115118016002670410147800001002673226732267082673226728
80205267272000000440012671601119258010010080000100800155001167303026706267312673116659616683801152008002420080024267312672711802011009910010080000800000100800000800390039800386139431115118016002672814107800001002672826732267282673226732
802042673120000004400026716212119258010010080000100800155001166596026706267312670716659616683801132008002420080024267312672711802011009910010080000800000100800004380038003880038613901115118016002672410147800001002673226732267322673226732
802042672720000004400126824312119258010010080000100800155001166596026706267312673116654316689801002008000020080000267312672711802011009910010080000800000100800004380038003880038610440005110116112673114147800001002673226743267322673226728
8020426727200000031002672201119258010010080000100800005001167127026706267312673116654316689801002008000020080000267072672711802011009910010080000800000100800004380038004180039610440005110116112672814140800001002673226728267322673226732
8020426731200000044101267160121192580100100800001008000050011671271266822673126707166543166858010020080000200800002673126707118020110099100100800008000001008000043800385047800386139440005110116112672814104800001002673226732267322672826732
8020426731200000064310126722212119258010010080000100800005001167231026706267272673116654316689801002008000020080000267272670711802011009910010080000800000100800004380039110800396139440005110116112672814107800001002670826732267282673226730
802042673120000004410126715011219258010010080000100800005001168312026706267312673116654316685801002008000020080000267312670711802011009910010080000800000100800014380038003880038600440005110116112672810140800001002672826732267322673226732
8020426731200000044101267232110258010010080000100800005001165789026706267312672716650316689801002008000020080000267312672711802011009910010080000800000100800004380038003880039613900005110116112672814107800001002673226732267322673226732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | 0e | 0f | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526722200000104500026713201816258001010800001080000501166750112668326728267281665231670280010208000020800002672726722118002110910108000080000010800003980039035800396035050200015160010132672406280000102670926723267092670926709
80024267222000001041002266942120025800101080000108000050116884300267032672826728166523167078001020800002080000267082670811800211091010800008000001080000398003903980035600050200013160012122671900080000102672926728267092672926728
800242672720000010000026712012016258001010800001080000501166750012670226708267271665231670780010208000020800002672826708118002110910108000080000110800003980000168003901353950200012160011122671960080000102672926709267232670926729
800242670820000000450012683221812025800101080000108000050116760500266972672726728166723167078001020800002080000267082672211800211091010800008000001080000398003510800350100502000111600121226724010280000102672926729267092670926709
8002426729200000000102267160001225800101080000108000050116675001266972670826708166723167088001020800002080000267082671211800211091010800008000001080000438000003580039610435020001216001062671966280000102672826709267282670926729
800242670820000000234001266930121216258001010800001080000501168843012668326728267081667231670780010208000020800002672226722118002110910108000080000010800000800000398003901390502000121600111226719610280000102672826728267232670926728
80024267282000000000022671301212025800101080000108000050116884301266832672826727166523166888001020800002080000267272672211800211091010800008000001080000398003503980039003943502000121600121026719100080000102670926709267282672326709
800242670820000100000026693001211258001010800001080000501166894002670326708267221665231670280010208000020800002672726708118002110910108000080000010800003980000108003961043502000121600121026719106280000102672926709267092670926728
8002426727200000004500226713001216258001010800001080000501167605002670326727267081665231670280010208000020800002670826722118002110910108000080000010800003980000039800350035395020009160011926812106480000102670926730267092672626729
80024267082000000041002267202181292580010108000010800005011668960026703267082672216672316707800102080000208000026727267161180021109101080000800000108000039800350398003560354350200010160010626719610480000102672826728267282670926709