Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 4 regs, 8H)

Test 1: uops

Code:

  st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 1f | 22 | 23 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d cache miss st (a2) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bc | l1d cache miss st nonspec (c0) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
64005285122121880300506328132402212540004000400021615171703327812283193104000400080002818128336116100110001000400012400210040022013576101827234336537319391332738081968662795214706119981319440002841628306285502844328446
6400428157213036001052292799500219924000400040002162611170452789728238310400040008000280382816911610011000100040008400043040000814175101897270340236619357334638142566672798514499120231264840002837228245281322840128481
6400428214210032000050182808500221734000400040002163410170562796628183310400040008000282512834211610011000100040008400042040000813999103637267349016719153324638091965652784214151120981340940002807628391281792817228518
6400428239213056040051502810200220374000400040002160910170462790628054310400040008000281532837411610011000100040000400044040000813939102817224352706619078333238091163692799714223119651277040002831528075282652834428429
6400428445217084011050062825644221304000400040002161417170292795428251310400040008000281352818111610011000100040008400045040000013827100457274345326719235331738091468632790614720117621320940002831428205283322831728376
640042844021107601005231281550022172400040004000216171417067278632833931040004000800028357283461161001100010004000040007340000014036103597184336236819301327338092067652790714219119721296340002841328207283602833828416
6400428161212035000052672810300221944000400040002163011170462797528164310400040008000280962832311610011000100040008400243040000813919100727097339546419242330338101168622800114355121011346640002843328264281572819928463
6400428381211065010049412800400221484000400040002163011170492779928190310400040008000282102800411610011000100040000400046040000813995103237218348416019103341138122068632789414465120101337240002827828203282202830528392
6400428377212026000053232813100221284000400040002162912170452778928065310400040008000282992807911610011000100040008400083640000813978101647246343136219295337538121361692790414147118141281940002824828179283412825828461
640042846421208800005024280530022290400040004000216321017048279942827931040004000800028198283891161001100010004000840004704000081382597777189340827019258328338171468672788915202124421422940002897228968288942908328867

Test 2: throughput

Count: 8

Code:

  st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3202051600581240100000012180116003716161025320100100320000100320000500735990811601061604631600587999938003532010020032000020064000016005216005811802011009910010080000800001003200141436403200160060132000214401410051100031733160056320000100160053160190160060160059160057
3202041600581242100000001801160037016525320100100320000100320000500736505601600271600591600537999038003532010020032000020064000016005216005111802011009910010080000800001003200141434290320016001832000216361400051100221722160055320000100160048160060160059160060160053
32020416005912401010100661901160032161655532016010032000010032000050073602460160145160059160052799903801403201002003200002006400001601971600511180201100991001008000080000100320015143602320076031432000216361400051100421733160049320000100160051160051160338160050160059
3202041600521241101002339628301160311161622656320220100320120100320216500737463501602641603331604698022027802323205322003203602006409601605611603334180201100991001008000080000100320202143600320016001832000216361400051100011722160054320000100160061160051160053160053160053
320204160060124010001000190116004301492532010010032000010032000050073602680160027160059160052799903800403201002003201402006400001600601600581180201100991001008000080000100320016153600320016011732000216361400051100031733160058320000100160053160059160060160064160054
320204160060124110001000200116003701622532010010032000010032000050073602680160034160057160138799973800343201002003200002006400001600581600521180201100991001008000080000100320014153600320016102032000216361400051100011732160055320000100160052160051160052160053160050
3202041600591241100000001801160032161692532010010032000010032000050073599320160027160059160058799853800403201002003200002006400001600471600601180201100991001008000080000100320014143601320016001832000216341400051100011722160044320000100160063160060160049160059160059
3202041600521241101010001901160045161622532010010032000010032000050073602440160033160052160061799973800353201002003200002006400001600581600591180201100991001008000080000100320015153601320016002232000216361410051100021733160055320000100160053160055160053160053160053
3202041600591241100000001701160033161642532010010032000010032000050073602440160034160060160058799963800313201002003200002006400001600511600581180201100991001008000080000100320014143600320016001932000216361400051100011722160055320000100160051160052160053160054160069
32020416005812411010000122101160044161652532010010032000010032000050073598360160027160059160053799913800403201002003200002006400001600601600591180201100991001008000080000100320014143600320016002032000216361400051100021732160056320000100160616160475160328160605160612

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 24 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ld nt uop (e6) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3200251600421240000000609000160035161602532001010320000103200005073594240016001501600501600427997838003432001020320000206400001600491600421180021109101080000800001032000003400320000001432000216000502000317033160039032000010160053160053160043160043160061
3200241600421241100000241900016003516169253200101032000010320000507359424101600170160047160052799803800343201282032000020640000160051160042118002110910108000080000103200000000320014000320002163400502000417032160039032000010160053160053160043160043160052
3200241600421241000000918000160044161602532001010320000103200005073593520016001701600401600477999038002932001020320000206400001600421600491180021109101080000800001032001403400320016002320002163600502000317023160037032000010160041160048160048160043160043
320024160042124001000092000016002716160253200101032000010320000507359856101600260160454160049799893800243202262032000020640000160042160040118002110910108000080000103200000028032000000532000023400502000317033160039032000010160043160041160041160043160041
320024160042124100000003000160027160025320010103200001032000050735935210160017016004216004079980380024320010203200002064000016005016004211800211091010800008000010320000034003200020053200022000502000217032160039032000010160043160043160043160043160052
3200241606221248001101126890001600341616025320010103200001032000050735942410160015016004016004279980380022320010203200002064000016005716005811800211091010800008000010320000153600320002115320002234140502000317033160157832000010160043160044160043160061160053
32002416004212411000001224151400116003816162253200101032000010320000507360316001600176160040160042799803800293200102032000020640000160054160052118002110910108000080000103200141444013200160129320002042015020007881722843160051032000010160053160052160053160053160064
320024160047124111000001400116003716160253203101032000010320000507359692001600380160435160063799923840103200102032036020640000160331160044118186310910108000080000103200000440032000200032000224200502000317033160039032000010160044160043160043160048160043
32002416004212131000000170001600251616025320010103200001032000050735944810160027016004216004079980380025320118203200002064000016004016004711800211091010800008000010320000042003200020118320002144200502000317023160039032000010160053160044160043160043160044
3200241600421199100000014001160025160025320010103200001032000050736002810160017016005216004379980380045320010203200002064000016004016004311800211091010800008000010320000042023200020016320002164200502000317033160039032000010160053160048160041160043160055