Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 1 reg, 16B)

Test 1: uops

Code:

  st1 { v0.16b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | df | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
6100529136234415161001322004663285182023899100010001000500024001594328271289843281000100120002890428989116100110001000100112110010011000121012936923869033138042214043322380015434542833515789128571538410002902028974289592893928987
61004288912330121011009000464428511002414210001000100150052218159052815328743310100010002000286342856411610011000100010021401001011100010111324693716866313614421193322038037374322815515818130921525010002878028820287332872028677
61004287252310142400001004787284201223902100010001000500020001592528167287703101000100020002872728671216100110001000100513310011011000141213243943069163153046212593163380314373942831015552131771503610002884928822289062880828797
61004288652320122400001004670284490223838100010001000500019001589828116287943101001100020002884628658116100110001000100312110010111000101113013935268803228046211083207380114484432816415705130841513210002886528901288832881428723
61004287492310131111002004656284431123775100010001000500029001595328043288433101000100020002868428708116100110001000100114210010111000141013152929169063168039212173248380515444132818415579129171502210002878128881288672887428759
61004288202310131300002004729284742223808100010001000500024081594128209288763101000100020002875528848116100110001000100214010010111000101012951927068713149145213033202380314464842822515538131171518910002896728846289182885828768
610042884123101312100089004704283520023806100010001000500017081592628202289053101000100020002874128678116100110001000100144010010211000101113085958669703111241210843235380719464832820915721132991505610002881228765287352878428851
61004288632320102210001004703285420224235100010001000500017081593128236290463101000100020002874628877116100110001000100324010011111000101113075919068883139139212213266380018404232832015568131751493010002874128561287032876428683
6100428623230013141001351004602285090023703100010001000500018001594628052288323101000100020002869228739116100110001000100210110010111000121013245950768793080043215773212380415424522806315541131241519310002875828827289292866128637
61004287922310120210002004724283652223849100010001000500019001595828027286023101000100020002878928694116100110001000100230110010011000101013182929269563214141209933326380312423932808415915132471551010002890428668288032875128660

Test 2: throughput

Count: 8

Code:

  st1 { v0.16b }, [x6]
  st1 { v0.16b }, [x6]
  st1 { v0.16b }, [x6]
  st1 { v0.16b }, [x6]
  st1 { v0.16b }, [x6]
  st1 { v0.16b }, [x6]
  st1 { v0.16b }, [x6]
  st1 { v0.16b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205400433220000000030040029161602580100100800001008000050018394720400174156342259299553300008010020080000200160238404354043621802011009910010080000800001008000004408000220880002242051101416151640413800001004004340044400434004340044
802044004231100000001830040027161612580100100800001008000050018394720400174004340043299563300008010020080000200160000400404004211802011009910010080000800001008000004208000000280002242051101316131340040800001004004340044400414004440043
80204400433100000000030040028161612580100100800001008000050018394480400184004240042299553300008010020080000200160000400424004311802011009910010080000800001008000004208000200580002200511061612840039800001004004440041400444004340055
8020440040310000000012300400280161258010010080000100800005001840000040017400424004329956330000801002008000020016000040040400401180201100991001008000080000100800000008000200080002242051101316132541467800001004004340044400434004440043
802044004332100000001200040030161602580100100800601008010850018400280400174004340043299553300128010020080000200160000400424004211802011009910010080000800001008000004208000200280002242051101316131340039800001004004440043400414004340043
80204400433220000000330040027161612580100100800001008000050018394720400174004340042299533300008010020080000200160000400434004211802011009910010080000800001008000004208000200080002242051101316131040039800001004004340043400414004440043
802044004231000000001230040027161612580100100800001008000050018394480400174004340042299563300128010020080000200160000400404004211802011009910010080000800001008000004231800020008000020051101316111440039800001004004340044400434004440044
8020440043310000000012001400270161258010010080000100800005111839448040029400544005429955330001801002008000020016000040042400421180201100991001008000080000100800000420800021038000224205110131612740039800001004004340043400434004440044
80204400423100000000123004002516160258010010080000100800005001839496040015400424004229956330001801002008000020016000040042400421180201100991001008000080000100800000422880002002380002242051101516121340040800001004004340041400434004140043
80204400423110000000120004002516160258010010080000100800005001839448040017400424004029955330001801002008000020016000040042400421180201100991001008000080000100800000008000010880002242051101516101440040800001004004340041400434004340043

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 4c | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | st unit uop (a7) | l1d cache writeback (a8) | ab | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | cd | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025400423100006300140584161601258007010800001080216501856581040017400444004229977330022800102080000201602784004240042218002110910108000080000108000003480002102800022340050203316032324003980000104004340043400434004140043
80024400493210001830004003516000258001010800001080000501839424040015400424004229977330029800102080000201600004004240042118002110910108000080000108000003480002108800022380050201316032124003980000104005240051400524005140183
800244004032100012900040025161600258001010800001080000501844245040017400494004229977330030801182080000201600004004240042118002110910108000080000108000000800021014800022340050203216033334004880000104004340043400434004340043
8002440042331000070004016601600258007010800001080000501839448040015400424004029977330022800102080000201600004004240043118002110910108000080000108000004280000102800622420050203716036364004080000104004140182401844005540043
80024400423310001410000400271600055800101080000108000050183944804001740045400432997733002380010208000020160000400424004211800211091010800008000010800000428000232392800022420050393316033334003980000104060142219422554211640044
8002440042321010129100040027161601258001010800001080000501839448140017400424004229977330028800102080000201600004005140042118002110910108000080000108006003480002005800622342050203316033154005680000104005140052400514005240043
800244004232801009400140034161600558001010800001080000501839424040015400424004229977330022800102080000201600004004940042118002110910108000080000108000003480002102800022340050203316033334004880000104004340043400434004340051
80024400423220002160014003416160055800101080000108000050183942404001740042400422997733002280010208000020160242400514004211800211091010800008000010800000348006210580000234005020331650415324004080000104006040043400434004340180
8002440051321100129100040027161600258001010800001080000501839424040022400424005129977330029800102080000201600004004240042118002110910108000080000108006003480002101608800002340050203516035234005080000104004340043400434004340052
800244018132201512132300040027161600258001010803601080972501894939040017400504004229977330022801182080000201600004004240042118002110910108000080000108000003480002102800022340050203216032214003980000104005040041400604005040183