Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 1 reg, 4H)

Test 1: uops

Code:

  st1 { v0.4h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 1e | 1f | 23 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5e | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | l1d cache miss st nonspec (c0) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
610052919722623001800004687283841023884100010001000500000001597228368291493101000100020002895528908116100110001000100020100000100021358696966889314565421271324738154447482822315532132581534510002878428848287212884528884
610042896022322002302700476428721102410810001000100050000100915941282512920731010001000200028855287201161001100010001000001000001000313410983569953237125421505322738164947482826715843133631535810002906129096288882910128950
610042880322419002200004762285880024010100010001000500006001596628344291213101000100020002880828780116100110001000100020100000100031332198516887321874621525321138224764572837415739130381509210002911629024289482884829102
610042888022522001600004535285690124181100010001000500000101595928431291303101000100120002893028951116100110001000100000100000100021315391786830312995021577322738164959532863515904135451557310002917929192291672922430182
61004292352351610230000459828762102433710001001100050000410159622834329172310100010002000288952890811610011000100010002010000455100001313093826843315395121539321138194454572839416015134281568810002921529135291812915229168
610042917822621001910104639286110024273100010001000500000001595628289291973101000100020002896328834116100110001000100020100200100021309692546874304574721500310938174551512845316072132411538010002920429145291242913029140
6100429025227190016008804675286090024168100010011000500004081596928303291563101000100020002894129017116100110001000100020100000100021315291796897309094321441321438154249522834415957134011507910002909029095291532920928980
6100429174225220017101045982859201241301000100010005000010015962283382922331010001000200029026288941161001100010001000301000001000213252946568063121145021504326838184652482844915742132491539410002915428963289882912529108
6100429144225190119000046132864400242081000100010005005020015952283382906881010001000200028994290441161001100010001000301000001000213162930368583132135221520326238104853502849916030134631551210002906729074292012917029126
61004291122262100190132004591286750024086100010001000500000081595228347290623101000100020002893128815116100110001000100032100000100001318191866832315684721489321338163844572833815806133131537910002908529094290532904429154

Test 2: throughput

Count: 8

Code:

  st1 { v0.4h }, [x6]
  st1 { v0.4h }, [x6]
  st1 { v0.4h }, [x6]
  st1 { v0.4h }, [x6]
  st1 { v0.4h }, [x6]
  st1 { v0.4h }, [x6]
  st1 { v0.4h }, [x6]
  st1 { v0.4h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5007

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802054006330010000019001400391616225801001008000010080000500184043804002940054400552996433002180100200800002001600004005240063118020110099100100800008000010080015144200800160217800021644140051103162440061800001004005540055400554005540055
802044005430010000919001400361616525801001008000010080000500184046004002940054400532996533001280100200800002001600004005440054118020110099100100800008000010080015154402800160017800021644140051102163340051800001004005340048400524005340055
80204400533001110001400140037151612580100100800001008000050018400280400294005440053299653300128010020080000200160000400544005411802011009910010080000800001008001414440280016011980002160140051103163340051800001004005540055400554005540054
80204400543001011051180014004916161258010010080000100800005001840028040028400544005229976163024080752200800002001600004005440052118020110099100100800008000010080014144401800160118800021644141051103163340061800001004005540056400554005540055
80204400633001000036019001400371616625801001008000010080000500184043804002940052400632996833001080100200800002001600004005240055118020110099100100800008000010080014144400800162019800021642140051103163340051800001004005540055400554005540052
80204400543001111001800140037016425801001008000010080000500183995604002940064400542996733001280100200800002001600004005240054118020110099100100800008000010080014144400800160116800001644140051104163340051800001004005540055400554005540053
802044005430011010014001400391616425801001008000010080000500183990804002940063400552996733001280100200800002001600004005540047118020110099100100800008000010080014144411800160118800021644140051103164440051800001004005340052400484004840064
8020440054300100000190014003616022580100100800001038000050018400281400304005440055299673300128010020080000200160000400634005411802011009910010080000800001008001414000800160116800021644141051103163340051800001004005340054400534005340064
802044004730011000022001400391616425801001008000010080000500183995604002940063400542996733001280100200800002001600004006440054118020110099100100800008000010080014144410800162121800021644140051103163240051800001004005340054400544005340064
8020440054300100103619101400391616325801001008000010080000500183995604002940054400522996133000580100200800002001600004004740052118020110099100100800008000010080015164400800160018800021644140051103163240051800001004005240053400534005440055

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5007

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 24 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025400583221010010030040039161622580010108000010800005018394480400294005540054299893300358001020800002016000040054400521180021109010108000080000108001404201800160016800021644140050203316333440051080000104004540044400434004840043
80024400433101010100000140027161612580010108006010800005018400280400274005440042299773300338001020800002016000040054400551180021109010108000080000108000014430280016002800021644140050201316333440039080000104004940044400434004440048
800244005431100000000001400281616025800101080000108000050183944804001840054400522997733003480010208000020160000400544005211800211090101080000800001080000042008001600280002044140050203316363740040080000104004340043400434004340045
800244005431010001000170040036161652580010108000010800005018394480400174019240055299893300238001020800002016000040042400631180021109010108000080000108001514440080016001880001244141050204416363640051080000104004340043400554004340055
8002440052311001010009101400401616025800101080000108000050183944804002740055400522999816300328001020800002016000040054400531180021109010108000080000108001504200800160117800021644141050203316143440051080000104004440043400464019440047
800244005431010000000170140030161615580010108000010800005018395920400214005240054299903300208001020800002016000040040400401180021109010108000080000108001414420080000102180002242140050201516143340039080000104013040044400444004340055
80024400543100000100030140048161612580010108000010800005018400280400294004240054299893300348001020800002016000040063400431180021109010108000080000108001514460280002001680002164200050203316343540051180000104006440055400654006440043
8002440051310100000001700400361616525800101080000108000050183993404001740052400472998933002280010208000020160000400404006411800211090101080000800001080000042118001600280002164200050203416333340039080000104004440043401874006440043
800244004231010000001217014003701692580010108000010800005018394480400384005340063299983300328001020800002016000040054400521180021109010108000080000108001404240800020014800021646140050203124363740051080000104005640053400524005440044
800244005431110000000170140028161622580010108000010800005018394720400274005440042299783300348001020800002016000040042400421180021109010108000080000108001515420080002001880002242140050203416133340040080000104005340043400414004340055