Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 4 regs, 2D)

Test 1: uops

Code:

  st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 1e | 1f | 22 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
64005292662181190115100104587290300023140400040004000216055170282836829254310400040008000290852910211610011000100040000120400000040000120001269991676827304193620264307538131038362844916436132891515940002930029282291842919329323
64004292392190150015000104645291490023205400040004000216193170282836529287310400040008000290762910311610011000100040000804000000400008000129089282684530448342029131013812531362834716296132791496640002919829257292882928129271
64004292392190180021000104578290810023154400040004000216155170422836529252310400040008000291582909111610011000100040000804000200400008002071325094956957312611442038033323812639452877116176132971513840002927229266292352950929207
640042928521901600190001045472914000231564000400040002160951702328406292173104000400080002915229110116100110001000400008040000004000080001315291066843307874020212308238181139402845416268132351486740002944329302293262930029204
64004292682190170016000104591290790023079400040004000216227170292840629150310400040008000291372901211610011000100040000120400000040000800012897921368083093124020374312938081043382835216259133591484840002926629312292832932529296
6400429236219115001600650466729134002316440004000400021607217043284282926931040004000800029098291881161001100010004006512140040144000416400128979173685430979332027930793814641392838516391129651519240002929729303292732929729177
6400429274220114101510050466029120002323040004000400021621317019284542924531040004000800029153292601161001100010004004516140040244000484101287292146881303183520219309238121237432836916148133821508640002928129270292942927329243
64004292912190150018000104670290904023185400040004000216215170112843429227310400040008000291182916911610011000100040000804000000400008000127799070683130777432027930643814640392840616350129751498040002926629273292192927029242
6400429235220016001500010448829009002322240004000400021622517050283582927731040004000800029131291841161001100010004004512240040244002484101292690826853304411392031130883815834442837616162132821484440002929529306293742931729254
640042925121902300150001046472903900231414000400040002161051704028405292573104000400080002914429133116100110001000400558240040044000484101279991946879303710422026930823816739392844016376132801472740002934129279291952933929230

Test 2: throughput

Count: 8

Code:

  st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | bc | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
32020516005312400000000000116002716161253201001000320000100320011500735986301600940160120160042799857800163201112003200202006400401600401600401180201100991001008000080000100320000034003200000053200020023401115119116111600460320000100160050160051160052160050160043
32020416004512401001000030016003501632532010010003200001003200005007359424016001701600421600407998038002432010020032000020064000016004216005011802011009910010080000800001003200000360032000200183200020023400005110117211600390320000100160043160050160051160051160043
32020416004212400001000030116002716160253201001000320000100320000500735942401600240160040160042799803800243201002003200002006400001601201600421180201100991001008000080000100320000034003200020023200021640523401005110217111600370320000100160043160041160043160043160051
3202041600501241100000009011600271616225320100100032000010032000050073594240160017016005016012079978380024320100200320000200640000160042160042118020110099100100800008000010032000000003200021023200020023400005112117221600510320000100160114160043160043160043160043
3202041600421240000100003001600271600253201001000320000100320000500735942401600150160050160042799803800243201002003200002006400001600421600421180201100991001008000080000100320000034003200020023200020003400005112217221601170320000100160043160043160043160043160043
32020416004212401001000030116002716163529532250210013200001003223465007359352016001731600501601207998038003132010020032000020064000016004916004011802011009910010080000800001003200000000320000001133200020023400005112217211600460320000100160041160041160043160041160043
32020416004212851001000126011600361616025320100100032000010032000050073593520160390016043416004284526380377320208200320000200640280160042160127318020110099100100800008000010032000003428032000200143200020023400005112217211600370320000100160041160043160043160043160043
320204160042124100010000301160035161602532010010003200001003200005007359856116002401600491600427998038002532010020032000020064000016004216004211802011009910010080000800001003200000360032000210333200020023400005110217211600390320000100160043160043160043160043160051
32020416005912401001100126011600251616025320100100032000010032000050073594240160017016004216004280057380024320100200320120200640000160040160040118020110099100100800008000010032000000003200000023200020023400005130217221600390320000100160043160043160041160181160043
320204160042124100000001329001600271607425320100100032000010032000050073594240160142016004916004079980158002432010020032000020064000016004216004211802011009910010080000800001003200000340032000200203200020043400005112117111600390320000100160043160043160043160044160051

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | branch mispred nonspec (cb) | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
320025160043124100000003000160027016161253200101032000010320000507359448160017160040160043799813800363200102032000020640000160042160046118002110910108000080000103200000420032000200232000224400502001417513160039032000010160043160043160044160043160043
32002416004212400000000300016002801601253200101032000010320000507360000160017160042160040799803800243200102032000020640000160043160048118002110910108000080000103200000420032000200232000204200502005171312160039032000010160044160043160043160044160182
32002416004012410000100300016002701616125320010103200001032000050735944816001816004216004279980380024320010203201202064000016004016004211800211091010800008000010320000000032000000232006200005020014171312160037032000010160043160043160043160043160043
32002416004312400000000300016002701616125320010103200001032000050735944816001716004216004379981380025320010203200002064000016004216004211800211091010800008000010320000000032000200232000224200502005171212160040032000010160182160043160043160043160043
320024160181124100000003000160028016161253200701032000010320000507364284160017160043160043799813800253200102032000020642880166270160053118002110910108000080000103200000420032000010232000224200502007171213160039032000010160043160043160043160043160182
32002416018012850000001230001601660161602532001010320000103201085073594481600181600421600427998138002432001020320000206400001600421600421180021109101080000800001032000004200320002002320002244005020014171212160039032000010160043160043160043160044160044
32002416004212860000000300016003901616025320070103200001032000050735935216001816004016004279980380036320118203200002064000016004316008521800211091010800008000010320000042003200020023200022440050200517127160051032000010160182160044160043160043160043
3200241601801286000000930001600270161612532001010320000103200005073642361600171600421600427997838002432001020320000206400001600431600461180021109101080000800001032006004229232006200811320062242205448014441313160277032000010160449160324160460160457160598
3200241604551289410024264267000160442016161253200101032000010320000507359472160017160042160043799813800253200102032000020640000160042160046118002110910108000080000103200000420032000200232000224200502001217106160040032000010160044160043160043160043160044
3200241600421285000000030001600250161602532001010320000103200005073594481600171600421600427998038002232001020320000206400001600401600431180021109101080000800001032000004200320002000320002242005020012171415160039032000010160044160043160043160043160043