Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 2 regs, 4S)

Test 1: uops

Code:

  st1 { v0.4s, v1.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 46 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
620052953022812311271000345872873622435120002000200010000916064284982938331020002000400029136290021161001100010002004281200201220002622012943924069093135115220791306438111259582856816559139381555920002958429382293592946729513
62004292852271171124100034572288810244122000200020001000011605728572292943102000200040002966629518116100110001000200224120020022000242101319692586863309465121008321638141257542859016457135461564920002945029389293732936129324
62004295412281180022100034633289110242322000200020001000001607028709295863102000200040002949729358116100110001000200454020022022000242201342692826934312995421854336038131755512879716348134841563620002956929655297232967129579
62004296552371221123200034706291000245712000200020021000041606928849296363102000200040002946229356116100110001000200334120020216772000262050513128925769523123115621382331038071649502879016349137191539520002943829574296552959329618
620042968323811600211010346542893602423620002000200010000216081285422946631020002000400029263292301161001100010002004341200211520002420013200934869313082115320813325538101353572858015904132931539120002934929486293352942029420
6200429369228117112210009147062894402420920002000200010000616068284732929831020002000400029273292082161001100010002002461200201220002620013400937369093132105420805328638101949522848116298137151547220002946629518294492938029288
620042936922811910221100346422894002425420002000200010000016053285192948531020002000400029256292521161001100010002010241200200220002422013274956069483081125620895321538131458612859016252134371544020002939729387294322934329313
6200429393228125011911003473128945024286200020002000100006160732859029421310200020004000291912927411610011000100020113412002435200026210131929389695031198572076632783812761582865616176135181544820002937329408293292938629512
620042945422712301270000346802893502426420002000200010000516059284902934531020002000400029282292351161001100010002003241200200220002420013071928268743123125520920321538121550562860216167136171535920002968829638296742985429767
6200429711239119002110009146302887502438920002000200010000116057287052928831020002000400029202292401161001100010002002240200202220002421013083946869273112115520859335938131556552857616418135711556920002942929239293682925329320

Test 2: throughput

Count: 8

Code:

  st1 { v0.4s, v1.4s }, [x6]
  st1 { v0.4s, v1.4s }, [x6]
  st1 { v0.4s, v1.4s }, [x6]
  st1 { v0.4s, v1.4s }, [x6]
  st1 { v0.4s, v1.4s }, [x6]
  st1 { v0.4s, v1.4s }, [x6]
  st1 { v0.4s, v1.4s }, [x6]
  st1 { v0.4s, v1.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 46 | 49 | 4f | 50 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160205800586200000000003001800271616002516010010016000010016000050036794248001708004280049599533600001601002001600002003200008004080051118020110099010010080000800001001600000340016000210016000023400511241694800391600001008004380043800418004380043
160204800426200000000003002800441602025160100100160000100160000500367988480033080061800585996636001816010020016000020032000080047800471180201100990100100800008000010016001414360116001410141600021636141511251695800481600001008005480059800608005980058
160204800536201010000001900280044016902516010010016000010016000050036801998002708005880058599713600161601002001600002003200008004980060118020110099010010080000800001001600000340016000200216000223400511231665800371600001008004180043800438005180051
16020480042620000000000000180027161620251601001001600001001600005003679352800170800428004259962360000160100200160000200320000800428004211802021009901001008000080000100160000000016000200216000023400511251685800471600001008004380043800438005180041
1602048004262000000000060018002516160025160100100160000100160000500367942480017080042800425996336000016010020016000020032000080049800501180201100990100100800008000010016001515000160000005160002234005112316125800391600001008005080043800438005080041
1602048004262100000000030018002716040251601001001600001001600005003679424800260800428004259955359998160100200160000200320000800428004211802011009901001008000080000100160000000016000200216000223400511251665800391600001008004380043800518004380043
16020480042620000000000300180025016004016010010016000010016000050036797608001708004280042599633600001601002001600002003200008004280049118020110099010010080000800001001600000340016000000216000223400511241664800391600001008004380041800518004380043
160204800426200000000012300180025016002516010010016000010016000050036794248001508004080042599643599981601002001600002003200008004280042118020110099010010080000800001001600000340016000210216000223400511241655800371600001008004380043800418004380043
1602048004262100000000000018002701600251601001001600001001600005003679424800170800508018759955156012116010020216000020032024080179801811180201100990100100800008000010016006203431016006210800160000034005327615985804221600001008031880182803208032080330
160204801826220000011113219700280182016148085160221100160120100160108511368462080135080182800865995327601131603162001602402003202408004280049118020110099010010080000800001001600000340016000200516006223400511251695800391600001008005180043800418004180043

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002580042620100000003000800271616525160010101600001016000050367935208001780042800425997736002216001020160000203200008004080042118002110910108000080000101600000421016000200216000024200502001161180051160000108004480041800418004180043
160024800426211001000123001800271616025160010101600001016000050367944808001780042800425997736002216001020160000203200008004280040118002110910108000080000101600000420016000210216000224200502001162280051160000108004480043800438004380044
1600248004362100010000300080109016025160010101600001016000050367944808001780043800405997736002016001020160000203200008004280042118002110910108000080000101600000420016000200516000224200502202161280039160000108004380044800558004380043
16002480042620100100003000800271616055160010101600001016000050367947208001580043800425997736002316001020160000203202408004080042118002110910108000080000101600000420016000270216000224200502001162180039160000108004480041800438004380044
160024800426210000000030008002716002516001010160000101600005036793520800188004080040599753600201600102016000020320000800428004311800211091010800008000010160060042001600000001600002000502202161280037160000108004380043800438004480044
160024800446211000001030008002716002516007010160000101600005036794480800178004280040599773600221600102016000020320000800428004211800211091010800008000010160000042001601800081600000000502002162280051160000108004380044800418004480043
16002480040620100100003001800281616225160010101600001016000050367969208009780040800425997536002316001020160000203200008004080178118002110910108000080000101600000420016000200516000204200502201161280039160000108004380043800448004180043
1600248004262000000000310080027016125160010101600001016000050368438008001780043800425997736002316001020160000203200008004380042118002110910108000080000101600000420016000210216000024200502001161180039160000108004180055800438004380043
16002480042622000000003001800271616025160010101600001016000050367944808001780044800435997536002216011820160000203200008004280054118002110910108000080000101600000420016000202216000204200502002162280040160000108004380043800438004380041
16002480054620010100003000800251616125160010101600001016000050367944808001780042800405997736002016001020160000203200008004280042118002110910108000080000101600000420016000210216000204200502201162180037160000108004380182800448004380043