Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 4 regs, 4S)

Test 1: uops

Code:

  st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | l1d cache miss st nonspec (c0) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
640052938623821009000010466329162442344840004000400021609217009028740293773104000400080002930229275116100110001000040000004000000400012001324193986920313843820449334638091144412869916328131051497140002948629492295392988129597
640042947923808090000104640293054023283400040004000216015170520287492963331040004000800029243294461161001100010000400001204000000400012001336895117010314754120420329438211341392875716105131131508340002955229655295652942229568
6400429600237080600001046462924300233774000400040042159211707202863529511310400040008000293862950511610011000100004000000400000040008001321195516965318054120411334638221144372871516192132921461340002950929559294852948629493
640042949623601018000181045912940800230984000400040002160941705602872729333310400040008000292342932611610011000100004000080400010040000001311593686945317764420385338438181242422866416214132311484040002937829359297643019629410
6400429423236060600001046872916700232334000400040002161841702902864429403310400040008000292062923711610011000100004000000400000040008201310794877006319424220383332138141041362866016198129091481340002936529462295062936529437
640042934923601001401001046652926200232744000400040002161831704402870429414310400040008008293482928011610011000100004000080400000040008001325696476953309763720485328838141134432862515962131171461740002929529470293752945829359
640042948223706070000104724291400023248400040004000216050170370286292936931040004000800029320292631161001100010000400008040000004000000131539598692131623392053532683819739392861816033132501468640002945729418293612942629433
6400429307236011060000104746291700023384400040004000216072170580285612951391040004000800029329293501161001100010000400008240000038040008001338194176949317234020508336338181741392872916183130671503640002947029450293792944529450
6400429512237090900001046342922600233904000400040002160921705102867929417310400040008008292712935611610011000100004000000400000340008001311595586958316353820449333738131240432875016021129941465540002945629422294442929629491
64004294442360110800091147602928400233394000400040002160301705302868929450310400040008000293552932311610011000100004000000400000040000001329695226933317663820533333138171345412878416107133111479240002944029555294522954429474

Test 2: throughput

Count: 8

Code:

  st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3202051600421241003001201000160027161602532010010032024010032000050073593521605661600421600427997838002432010020032000020064000016004016004211802011009910010080000800001003200000000320000900632000223400511011711160047320000100160050160051160051160050160043
320204160042124000000001000160025161602532010010032000010032000050073594241600171600421600427998038002432010020032000020064000016004216005011802011009910010080000800001003200000000320000980232000223400511011711160039320000100160043160051160043160041160041
32020416004212410000009000016002716160253201001003200001003200005007359424160134160042160040799803801223201002003200002006400001600421601791180201100991001008000080000100320000034003200021062232000223400511011711160048320000100160051160180160050160051160043
3202041600421242000001231000160027161602532010010032000010032000050073594241600261600421600427997815800243201002003200002006400001601791600421180201100991001008000080000100320000034003200021440232000003400511011711160037320000100160043160041160041160041160043
3202041600421241000000300101600250002532010010032000010032000050073593521600261600421600507998038002232010020032000020064024016004216004211802011009910010080000800001003200000000320002970332000223400511011711160039320000100160051160043160041160051160043
32020416004212410000000000016003616160253201001003200001003200005007364188160015160042160042799803800243201002003200002006400001600421600491180201100991001008000080000100320000034003200021500332000203400512811711160039320000100160041160043160043160043160043
3202041600401241000106000001600271616025320100100320000100320000500735942416001716004216004080056380024320100200320000200640000160042160050118020110099100100800008000010032000003400320002100023200622000511011711160037320000100160041160041160041160043160051
32020416004212400000012000001600251602253201001003200001003200005007359424160017160187160042799803800223201002003200002006400001600401600421180201100991001008000080000100320062034003200026407973200020000511011711160039320000100160043160043160041160181160062
3202041600421241000122643000016030316162951453203401003202401003202165117374244160490160672160597801425180330320316200320240200640720160745160318418020110099100100800008000010032024003458032024224319632024223400511011711160046320000100160043160043160043160043160058
32020416005012410000003001016002716164253201001003200001003200005007359424160017160042160042799783800243201002003200002006400001600401600401180201100991001008000080000100320000000032000096023200002000511011711160037320000100160041160043160043160043160044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0006

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
32002516004212410000000030001600271616025320010103200001032000050735942416002601600421600517998938003232001020320000206400001600421600421180021109101080000800001032000003403200020027832000423400050201317513160048032000010160052160051160051160058160043
3200241600421240001000003000160035016025320010103200001032000050735985616001701600511600427998038002432001020320000206400001600421601131180021109101080000800001032000003403200000038032000223400050201317145160039032000010160043160043160043160043160043
3200241600421245000000043826710016002516160253200101032000010320000507359856160017016004216004279980380032320010203200002064000016004216004211800211091010800008000010320000000320002004073200022340005020517145160039032000010160043160043160043160043160051
320024160042124000000000300016003516160253200101032000010320000507359784160017016004116004279980380033320010203200002064000016004216005011800211091010800008000010320000034032000230468320002234000502013171413160046032000010160050160052160051160050160043
32002416006212400000000030001600271616025320010103200601032000050735942416001701600511601807998038002432001020320000206400001600421600401180021109101080000800001032000003403200020043432000223400050206171313160039032000010160043160043160181160043160051
32002416004512400000000488446670001600271616025320070103200001032000050735992816001701600481600427998038002232011820320000206400001600511600422180021109101080000800001032000003403200020025732000223400050201317614160037032000010160181160043160043160043160043
32002416018012410000000030001600271616225320010103200001032000050736457216001701600491600427997838002232001020320000206400001600421600501180021109101080000800001032000000293200000044032000223400050201426511160039032000010160043160043160041160043160043
3200241600431245001001072300016002716160253200701032000010320000507359352160017016012016004279980380022320010203200002064000016005016004221800211091010800008000010320000034032000202368320002234000503210171510160048032000010160052160051160052160041160043
3200241600421240000000036300016002716162231153202501032024010320216507378696164042016031516045680212158012332022620320240206404801603261604563180021109101080000800001032030043810863200621026933201222340005055744155160397032000010160041160043160043160043160041
3200241600771240000000000000160027016025320010103200001032000050735978416001701600421600407998038003332001020320000206400001600421600401180021109101080000800001032000003403200020038032000220000502013171514160039032000010160043160043160043160043160052