Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 1 reg, 4S)

Test 1: uops

Code:

  st1 { v0.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | l1d cache miss st nonspec (c0) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
6100529142235320000000100045642851101241021000100110005000715948283062908731010001000200028876289371161001100010001000020100000010002001322291156881313815921383319338093561662830515966131121542810002889129086291682908829130
6100429053234010001000000047432863910240761000100010005000915953283452916631010001000200028893288271161001100010001000030100000010000001296794836947311605321357322638083462672838215756130451512210002902229042291692898328952
61004290972330100200060000473528557002412510001000100050001215928281392909531010001000200028850288991161001100010001000030100000010003001331494876855316116621478320038053558632836416014135411538510002904629061288692909428978
610042897023401111000000004717286270023977100010001000500091593328086290273101000100020002888328877116100110001000100003010000031000201901312692786965315506121276315838043757622834816016133681531210002903329173291172901929075
61004290892341100100012010047852853201240261000100010005000715962282772910031010001000200028865288991161001100010001000030100000010000001314494106892313606321446317738123660662835916034133261523210002920729159290962909428981
6100429200234000010000000046692857410240931000100010005000115932283582907331010001000200028870288941161001100010001000030100000310003001332394736933310105621518318038144360552835715960135831501810002913729089289222903429122
610042905023401001000120000457728469002413510001000100050001015960282402898231010001000200028924288771161001100010001000020100000010002001340193676873317506521482325038193954642841716070130631539510002907029029292052919628872
610042906823411001000138000047182857200240551000100010005000215941282312908831010001000200028889289241161001100010001000000100000010002001318792286929317815721426322638184157622836415899134831501110002909829178290282903829169
6100429050233010010000000045922856600240471000100010005000015943283942904131010001000200028936289521161001100010001000030100000010002001328191776888321106021423321038104167672838316085132671533410002898329158291002916328945
61004290382331200100000000460728627002414010001000100050001315944282992900731010001000200028932288561161001100010001000020100000010002001311493536931313125521416321738114160652834315885133791540910002895629185291082919129095

Test 2: throughput

Count: 8

Code:

  st1 { v0.4s }, [x6]
  st1 { v0.4s }, [x6]
  st1 { v0.4s }, [x6]
  st1 { v0.4s }, [x6]
  st1 { v0.4s }, [x6]
  st1 { v0.4s }, [x6]
  st1 { v0.4s }, [x6]
  st1 { v0.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5007

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 18 | 19 | 1e | 1f | 23 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802054004732111000015180140040161622580100100800001008000050018399810400304005240054299673300128010020080000200160000400544005411802011009910010080000800001008001415440180016111980002164814051101161240061800001004005540064400654004940055
8020440052322101000317014003901652580100100800001008000050018399340400294005240054299683300128010020080000200160000400524006411802011009910010080000800001008001414440080014011680002164414051101161140050800001004005440054400554005540053
8020440054322110000317014003901612580100100800001008000050018404600400304005240055299683300228010020080000200160000400544006311802011009910010080000800001008001414440080016011680002164414051101161140051800001004005540055400554005540053
80204400543221000003221140040161622580100100800001008000050018399330400294005240054299673300228010020080000200160000400534005611802011009910010080000800001008001414480180016111980002164414051101161140051800001004005540055400554005540053
80204400543221010000200140039161612580100100800001008000050018400760400274005440052299653300128010020080000200160000400544005411802011009910010080000800001008001515441080016001880002164414151101161140051800001004005540055400554005540057
802044005532210000018170140039161662580100100800001008000050018399560400384005440063299763300128010020080000200160000400544005311802011009910010080000800001008001516440180016011980002164414151101161140044800001004005540055400554005540056
80204400543211000001517114003801622580100100800001008000050018400040400284005440053299653300128010020080000200160000400544005411802011009910010080000800001008001514440180016011680002164414151101161140051800001004005540055400554005640055
8020440063321111000168170140037161622580100100800001008000050018399560400384005440063299763300108010020080000200160000400544005211802011009910010080000800001008001414440180016102280002164414151101161140051800001004004840048400554005540055
802044005232210000015170140039161612580100100800001008000050018400520400294005640054299673300128010020080000200160000400524005411802011009910010080000800001008001415440080016001780002164214151101161240049800001004005340053400524005340055
80204400533221010003170140039161602580100100800001008000050018398840400304005240054299683300108010020080000200160000400544005211802011009910010080000800001008001415440180016022280002164414051101161140049800001004005340054400544005340053

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025400433210000004002701602580010108000010800005018394481400174004340042299773300228001020800002016000040042400421180021109101080000800001080000042008000200280002242050201216114003980000104004340043400434004340043
80024400423100000034002716168258001010800001080000501839472140018400404004229977330022800102080000201600004004240042118002110910108000080000108000000008000200280002242050201116114004080000104004340043400434004340043
8002440042311000003400280002580010108000010800005018394961400174004340042299773300238001020800002016000040043400441180021109101080000800001080000042008000200280002242050200216114003980000104004340043400434004340043
800244004231000000340025161602580010108000010800005018394481400184004340040299773300228001020800002016000040042400441180021109101080000800001080000042008000200280000242050200116214003980000104004340043400434004340043
800244004331000001213540027161602580010108000010800005018394481400174004040042299773300228001020800002016000040042400461180021109101080000800001080000042008000200580002242050200216114004080000104004440043400434004340043
800244004231000001234002716163302580010108018010804325018537431402554018240042303461630486803342080242201604844018340182318002110910108000080000108006000600801820279280180242250570233314003980000104004440043400444018240182
80024407393250133402443403051616219115801901080180108032450184429314034040458404593025542301518001020800002016000040043400421180021109101080000800001080000042008000200280002242050200216114003980000104004140041400434004440043
80024400423220000123400271616025800101080000108000050183947214001540044400402997833002380010208000020160000400434004211800211091010800008000010800602443008012200208080002242250200216114003980000104004640043400444004140043
800244004232100000340027161612580010108000010800005018394481400174005440043299783300238001020800002016000040042400431180021109101080000800001080000048008006222280002244050200216124041380000104004140192400434043740044
800244004031100000340027161602580010108000010803245018394481400174004240043299773300238001020800002016000040043400421180021109101080000800001080000042008000200580002042050200216214003980000104004440043400444004340043