Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 1 reg, 8H)

Test 1: uops

Code:

  st1 { v0.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 1e | 1f | 22 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
610052858121402901260020501827940012344710001000100050005159142786028237310100010002000282522827511610011000100010023401001341110001012139181007370173337116720747333138051966712789114183123801366110002836028312283612845828595
61004284882140240025100051532823601233651000100010005000515968280102831533210001000200028308282071161001100010001002200100139141000101213922101917074341197320631339838101673662795814605123431371510002847528330283212831828611
61004288612131271026101051432804800235861000100010005000716036280322827431010001000200028263281821161001100010001002240100137111000000013791101117189340797320815335038151868712792514940123391401810002830928542282032863428296
610042825321212400250000518027929112341310001000100050005159492791028329310100010002000281752833011610011000100010032001001361410001000139121022771853216116820657342138111663712786814583121121379310002808428374283892849728637
61004284222140330028001051622809110234781000100010005000515945280042852031010001000200028257283771161001100010001002340100136141000101113925103027114342196720610339438151171652790814232121771364410002821928237282302850828540
61004282372121210023102051322798600236611000100010005000515956278592816831010001000200028210282311161001100010001002200100131410000300140161034871053346107220689338638101869712794013857120931349810002814828330283932828928433
610042839721212710230020515028117002337010001000100050006159612785728308310100010002000281802820911610011000100010012001001371710000311139961023472103424126620620328238061967682790414370120581385410002827428403284782840128466
610042819521102810250010513028027102340510001000100050005159552779828237310100010002000282822838111610011000100010023001001011010000300141101013672473405157320673346538181367692785614333120601348710002835728236282972824528666
61004284732111280024000050502812501234351000100010005000715972277992833431010001000200028102281811161001100010001002200100136171000130013985104207195346767120748330538172367652790613900121831403910002833428420282432841928613
61004283852121261027101051252807701234881000100010005000715975278492842431010001000200028192283071161001100010001001200100140471000000013928100727202342187020640338638212467762796613977123571374810002848028543282602831728625

Test 2: throughput

Count: 8

Code:

  st1 { v0.8h }, [x6]
  st1 { v0.8h }, [x6]
  st1 { v0.8h }, [x6]
  st1 { v0.8h }, [x6]
  st1 { v0.8h }, [x6]
  st1 { v0.8h }, [x6]
  st1 { v0.8h }, [x6]
  st1 { v0.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 24 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205400523210000000300040030161602580100100800001008000050018394724001704004240042299553300008010020080000200160000400424004311802011009910010080000800001008000004200800021008000224200511011611400400800001004004440043400434004440043
8020440042321000000330004002716162258010010080000100800005001839472400150400434004229956330000801002008000020016000040042400421180201100991001008000080000100800000420080002002800020000511011611400510800001004004440043400444004340043
80204400423220000000300040029161602580100100800001008000050018394484001704004240042299553300018010020080000200160000400424004311802011009910010080000800001008000004200800020028000224200511011611400390800001004004340041400434004340041
80204400423210000000000040025161612580100100800001008000050018400004001704004440043299563300008010020080000200160000400434004211802011009910010080000800001008000004200800020028000224200511011611400400800001004004340043400434004340044
80204400423210000000000040027161602580100100800001008000050018398084001704004940042299553300088010020080000200160000400424004211802011009910010080000800001008000003400800020028000223400511011611400390800001004004340043400434004340041
80204400423100000000900040027161602580100100800001008000050018398324001704004240042299533300098010020080000200160000400404004211802011009910010080000800001008000003400800020008000223400511011611400390800001004005140052400514005240043
8020440042310000000030104002816160258010010080000100800005001839784400170400504004229955330007801002008000020016000040042400511180201100991001008000080000100800000000800022028000223400511011610400470800001004005140041400524005140041
8020440042311000000030004003516002580100100800001008000050018398084002404004240040299553300008010020080000200160000400424004211802011009910010080000800001008000003400800001028000223600511011611400390800001004005240050400514005140043
80204400423100000000300040027161602580100100800001008000050018394244001704004240042299553300008010020080000200160000400424004211802011009910010080000800001008000003400800020088000223400511011611400510800001004004140041400524005140043
8020440050311000000030004002516160258010010080000100800005001839352400240400424012829967330000801002008000020016000040042400421180201100991001008000080000100800000000800020028000223400511011611401590800001004005140043400434004340050

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 24 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025400503220000000150004002716162825800101080000108000050183980804001740050400422997716301508001020800002016000040180401811180021109101080000800001080000034008000200080002234005020216334015780000104018040051400504005140136
80024400423210000000140004031116160258001010800001080000501844197040017400424018129977330022800102080000201600004004240042118002110910108000080000108000003400800020279780002234025020316334003980000104004340041401814004140043
800244004232200000001800040027161605580010108000010800005018398560400174005040042299773300318001020800002016000040040400491180021109101080000800001080000034008000210880002234025038316324003980000104004340043400434004340043
8002440050322000000017000400271600258001010800001080000501839424040017400514004230071330022800102080000201600004004240042118002110910108000080000108000003400800020028000220005020316324004780000104018340041400524005140043
8002440050323000000030004002716160258001010800001080000501839352040025400404018929984330030800102080000201600004004240042118002110910108000080000108000000008000200280002234005020416344003980000104005140052400524005140043
80024400423220000000300040027161625580010108000010800005018394240400174004240042299753300318001020800002016000040042400501180021109101080000800001080000034008000201080002234005020316324003980000104005140051400504005140043
800244004232200000006010004043601622580010108000010800005018396920400224005740187299943300438001020800002016000040050400421180021109101080000800001080000042008006211208280122242005020416444003980000104019440055400434004340044
800244004232110100012170014003201602580010108000010800005018399340400174004240040299753300328001020800002016000040042400421180021109101080000800001080000044008000210580002242005020316234003980000104004340043400434004440041
800244004332100000012000040028161612580010108000010800005018394480400194004240042299773300228001020800002016000040042400431180021109101080000800001080000042108000200580002242005020316434003980000104004140044400434004140044
8002440054321000000030004002716161258001010800001080000501839448040017400424004329978330022800102080000201600004004040043118002110910108000080000108000000008000200580000242005020316334003980000104004340041400434004340043