Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, post-index, 4 regs, 4H)

Test 1: uops

Code:

  st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 5.000

Integer unit issues: 1.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 2.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5e | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
640062950223602600250000100460429221221819750001000200020001000200020005000218021600005021928290262940731050002000200050004000293302944911610011000100020000602000100020000600013145942169223180136020446324238141359602885510001598312912142152000200010002950729531295952941329486
640042948623602501190006000463929274201821450001000200020001000200020005000218071600005021955290442934631050002000200050004000293282941711610011000100020000402000000020000600013400940569573190115920468325138151955632872210001614613338146062000200010002939129290293442950629388
64004294332370230027000010047032922200183025000100020002000100020002000500021812160000502189629018295613105000200020005000400029476294021161001100010002000040200000002000040001327795776923308485920569326038071255552884710001649213381143592000200010002944229418294392966429603
640042939923702100231000100477529166001835650001000200020001000200020005000218071600005021890292452950131050002000200050004000292732945411610011000100020000602000000020000400013327947669043078136020302338038092066562866710001619013172140522000200010002945929386293532939929380
640042948523602200210019100467129241001824550001000200020001000200020005000218111600005021959290612949432750052000200050004000294072932911610011000100020040632000000215520040602013162940569833191185820550323138162058522883310001596413144143342000200010002938129417294092946629528
6400429336236021003100013210046882916300182005000100020002000100120022002500021805160000502202229035300342213950122000200050004000294502930511610011000100020034402001120020000400013130940268953116106120484330938122965612885010011612813005145032000200010002935029419294872941429467
64004293442360211018011089004758292330218178500010002000200210002002200050002180416000050219252903129430291285011200820085012401630026298676161001100010002002040200202048520000400013033940069453167136020479325338121558642887810001607413062143302000200010002952329374292642941729497
64004295092360200122010017700464329226001829850101000200020001001200020005005218001600005021988290802944891050002000200050054004294352942211610011000100020022422000020320000400013173939468943175135920687320038152560562875810011614513283146092000200010002940529486294482940829539
640042938723702700230000100467329234001830150001000200020001000200020005000218101600002021951291322940931050002000200050004000293602938811610011000100020000402000000020000400013000941368743149136220409330638141860662876010001645513223141752000200010002942329398294182946929419
640042940323602100250000100458829275001823450001000200020001000200020005000218111600015021977291402937431050002000200050004000293582934411610011000100020000402000000620000400013148940669123155165720457321938161568572884210001624013098144012000200010002950929410294662938829373

Test 2: throughput

Count: 8

Code:

  st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 24 | 37 | 3a | 3f | 46 | 49 | 4c | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3202068007162000000009003373080035161600254053748010016748216000080100160000160000480499223078313011518002480048800450332400100200160000160120200400000320000800458005151802011009910010080000800001001600000320016024200201516000223200051090121713138004280000160000160000801008004680046802138005080046
320204800456430000001200037950800301616460254075698015916374416000080100160118160000480499239787112929458002680045803756653274001002001600001600002004000003200008004480045118020110099100100800008000010016006104062016000210216000244000051090111713128005180000160000160000801008004680046800468004680046
320204800496220000001200063870800341616002540592580100166188160000801001600001600004804992079461130364780024800458004503274001002001600001600002004000003200008004580045118020110099100100800008000010016000003200160000109221600022320005109081713138004780000160000160000801008022280046800468005180046
320204800456200000000301518608002916160025403639801001646101600608010016000016000048049922334591300539800258004580045033140010020016000016000020040000032000080044800452180201100991001008000080000100160000032001600020021600022320005109091712128004280000160000160000801008039280046800538004680046
3202048004662100000003006101080035161600254036808010016530016000080100160000160000480499220771612994178002580045800450331400100200160000160000200400000320000800458004511802011009910010080000800001001600000320016000210516000203800051090121712138004280000160000160000801008004680046800518004680050
32020480045621000000030043540800301616002540573580100164408160000801001600001600004804992079498129498780023800498004903264001002001600001600002004000003209608004580045118020110099100100800008000010016000003262016006202216012223202051093151714148004280000160000160000801008004680046800468004680046
3202048004562100000003005346080029161600254048638015916501816000080100160000160000480499215276412987528002380045800450327400100200160000160000200400000320000800458004611802011009910010080000800001001600000400016000200121600022400005109015178148004280000160000160000801008004580055800468004680046
3202048005462000000003004857080030161600254052138010016622116000080100160000160000480499215395012926068002480045800450327400100200160000160000200400000320000800458004411802011009910010080000800001001600000400016000010216000224400051090121715148004280000160000160000801008004780046807108303980212
32020480212624010112132267005394180198161609248405501802181662661600608015916035416010848085122120651303729803328037880212248912740067020016012016012020040030032048080226802133180201100991001008000080000100161085740100321600021092016006224020051220162513138019880118160000160000801008021580380802158008980215
32020480214622000021333005511080030161600254042248010016523816000080100160000160000480499222660913039178002480045800450327400100200160000160000200400000320000800458004411802011009910010080000800001001600000400016000200016000004000051090131713118005180000160000160000801008004780046800468004680046

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 37 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
320026801946201000000016047092800351616225406279800101661431600008001016000016000048004935992881298275080036800618006103314000102016000016000020406900321920800588006811800211090101080000800001016001212380116001400151600021438120005019321731148004680000160000160000800108005180062800518009480058
320024800496201000100015062422802091616025404543800101651761600008001016000016000048004935193791293239080034803048043303414000102016000016000020400000320000800518005111800211090101080000800001016001212360016001410171600021438120005019331732168004980000160000160000800108005080050800628005280051
3200248005062110001001215032212800351616025404799800101671601600008001016000016000048004923999181301001080037800498004903324000102016000016012020400000320000800618006011800211090101080000800001016001213370116001400171600021438121005019331730328013580000160000160000800108022180051800638005080063
320024800496211010100018054942801341616125405015800101674271600008006916000016000048004923988021291672080025800518022003774000102016000016000020400000320000800628006111800211090101080000800001016001212383016001401171600021438120005019141725338004880000160000160000800108005080063800508005180232
3200248005062010001000150568028004416160254094048001016425916000080010160000160000480049314090812945150800258005280051031334000102016000016000020400000320000800588006111800211090101080000800001016001213380016001400171600021438121005019341736318004780000160000160000800108005180062800528005080059
320024800616201010000015029662800351616025404008800101640391600008001016000016000048004936794061293183080025800628006103434000102016000016000020400000320000800498004911800211090101080000800001016001312383016001400151600021438120005019331736268021580000160000160000800108006380051800948006280051
3200248005062110000000150703528003516160254059418001016593416000080010160000160108480049247990912891110800378004980049913334000102016000016000020400000320000800508005011800211090101080000800001016001212380016001401151600021438121005019401734158004780000160000160000800108005180051800608005280050
32002480049620100001015180416828004316160254073458001016939616000080010160000160000480049231993913006540800268005180052031334000102016000016000020400000320000800588006111800211090101080000800001016007413380016001400151600021438120005019322535148005980000160000160000800108006280051800508005980053
32002480050621101010014419045292800341616125405675800101687331600008001016000016000048004923199421302417080037800508005003394000102016000016000020400000320000800508004911800211090101080000800001016001213380016001401171600021438120005019331735308004780000160000160000800108005080062800528005180061
320024800616211000000016054752800461616025406447800101636711600008001016000016000048004925598621298494080036800518005003334000102016000016000020400000320000802188005011800211090101080000800001016001313380116001402141600021436121005019141714348021380000160000160000800108005180062800958005080052