Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 1 reg, 2S)

Test 1: uops

Code:

  st1 { v0.2s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 1e | 1f | 22 | 23 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | l1d cache miss st nonspec (c0) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
61005294992380201012110464029011102426210001000100050005159762829929407310100010002000291072916211610011000100010000001000001000001274691576865302505121677308238161359602832416321135071572510002925829304291462923029238
6100429234219010100100454628786012427710001000100050006159592827429340310100010002000291162904111610011000100010000001000001000201287790056829307005621712304438161456522840416304136171575410002930029178293012933529276
6100429230219010000100454828824012430710001000100050006159732839129271310100010002000291942920111610011000100010000301000061000001280490906843302515021677303238131957532839916409138411559410002931129293292582898329265
6100429319219020100100453228875002428610001000100050001159532833029381310100010002000290592907611610011000100010000001000001000201293691206857309025821705305538221260572836316328137121596010002929529422293902926829360
6100429267220020100100458228776002428210001000100050005159632837929382310100010002000291442912211610011000100010000201000001000201297389856885306025421746301438151358572845916476137521576810002934429172293092927429366
6100429300219010100110449228848002442410001000100050001159532837929355310100010002000291822919511610011000100010000201000001000201286692036853305415421722304738181157502845716327137521581310002932729349293892934429338
6100429245220020300100460628905002440910001000100050003159552843429289310100010002000292102914611610011000100010000201000001000201288789656816303815521702308438211353532841016179138901574410002925029321293472938329409
6100429252219030200100463528804002433410001000100050001159662839429322310100010002000291412924411610011000100010000201000001000201281691116863306216021654300738201157592843016429137991575710002933129408292652938929275
6100429242219010100100459228825002433810001000100050000159652834229309310100010002000292112908911610011000100010000201000001000001277990346843305915421686301038231656562843516349137971576810002933129342293512929629385
6100429306220020100100454828813002429110001000100050000159702839829370310100010002000291282915411610011000100010000201000001000001284090366841308805821803303638181162512837916401138721567910002929829355293562934729299

Test 2: throughput

Count: 8

Code:

  st1 { v0.2s }, [x6]
  st1 { v0.2s }, [x6]
  st1 { v0.2s }, [x6]
  st1 { v0.2s }, [x6]
  st1 { v0.2s }, [x6]
  st1 { v0.2s }, [x6]
  st1 { v0.2s }, [x6]
  st1 { v0.2s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 24 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205400543100000000153004002816012580160100800001008000050018394480400174004240043299553300108010020080000200160242400424004211802011009910010080000800001008000004200800021058000224205110316334004000800001004004440043400444004340041
80204400433100000100000040027161612580100100800001008000050018393520415564004240042299553300018010020080000200160000400434004211802011009910010080000800001008000004200800020038000224205150516654003900800001004004340043400434004340043
80204400403110000000030040028161674258010010080000100800005001839448040017400424004229955330001801002008000020016000040042400421180201100991001008000080000100800000000800020028000224205110325324004000800001004004340044400434004440043
80204400403100000000030040028161612580100100800001008000050018394480400174004240043299673300008010020080000200160000400424004011802011009910010080000800001008000004200800020038000224205110316334004000800001004004540041400434004140044
8020440042310000000003004002716161258010010080000100800005001839448040018400424004329955330000801002008000020016000040054400401180201100991001008000080000100800000440080002002800022005110316334003900800001004004340043400434004440043
80204400433100000011030040025161602580100100800001008000050018394720400174005440042299553300008010020080000200160000400424004311802011009910010080000800001008000004200800020028000024205110216234003700800001004004440043400444004340055
8020440042310000000003004002716169155802201008006010080108500184765304015440256401983006116302018020820080125200160480402524024831802011009910010080000800001008012004231080122048828006224205130525454003900800001004004540044400454004140044
8020440040310000000003004002801612580100100800001008000050018394720400174004340042299553300008010020080000200160000400424004011802011009910010080000800001008000004200800020028000224205110216354003700800001004004340045400434004340041
802044004031000000000000400251600258010010080000100800005001839472040017400424004029955330002801002008000020016000040054400401180201100991001008000080000100800000000800020058000204205110216334003900800001004004340043400434004140041
8020440042310000000007004002716002580100100800001008000050018393520400174004240042299533300018010020080000200160000400434004211802011009910010080000800001008000004200800020068000224205110316324003700800001004004440043400414004340041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst simd store (99) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e8 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800254005731110111000190014003216165258001010800001080000501839692400254005340047299853300388001020800002016000040051400571180021109101008000080000010800141436008001600178000216361400502004164340049080000104005340059400484006240060
80024400533101000100019001400371616425800101080000108000050183969240033400524005029992330030800102080000201600004004740052118002110910100800008000001080014153620800161023800021601400502013164340050080000104005340061400534005340059
800244004931110000000140014004316010258001010800001080000501839812400344005140059299943300408001020800002016000040052400511180021109101008000080000010800141536008001621198000216361400502105164340049080000104005340059400514019940059
800244004731010100000170014003816165258001010800601080000501839932400224004740047299943300278001020800002016000040197400521180021109101008000080000010800151436008001600188000214361400502003163440044080000104005440061400524005440060
800244004731110010000190014004416166258001010800001080000501839812400234005040048299823300278001020800002016000040052400501180021109101008000080000010800151434008000210280002234000502104163440039080000104004340041400434004440052
800244004231100000006300040027016825800101080000108000050183969240033401884005829992330029800102080000201600004005240047118002110910100800008000001080016143400800141120800001601410502002164240050080000104004840048400414004340043
80024400423110000000030004002716160258001010800001080000501839424400174004240042299773300228001020800002016000040051400421180021109101008000080000010800001436018007610148000216361410502003163340044080000104006040052400544005340050
8002440197311100110001810140044016025800101080000108000050183942440017400424004229977330022800102080000201600004004040042118002110910100800008000001080000034008000220280002234000502003163340039080000104004340061400514005140049
800244005231010001000140014004316162258001010800001080000501839692400364005340047299943300308001020800002016000040058400521180021109101008000080000010800151434008000200280002234000502003164340039080000104004340059400634004140043
8002440051310000000003000400270160258001010800001080000501839424400254004240040299863300228001020800002016000040042400421180021109101008000080000010807801436008000210080002240000503903164340039080000104004340051400414005140043