Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, post-index, 4 regs, 2S)

Test 1: uops

Code:

  st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 5.000

Integer unit issues: 1.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 2.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
640062949623621111100002100472129192021834250001000200020001000200020005000218081600016218712913929504710500020002000500040002933329341116100110001000200338020020022000060001319497626976317803820638334338131636402892710001628313260143672000200010002938829589294992954529552
640042966123800011001139688000459829336021851750001000200020021000200020005005218281600016219142923329574153050032000200050054008294882945331610011000100020000422001024952000060001310292556990317103420503335438061931322878410011638913220144302000200010002982329698296172963129596
6400429734238000100022264177000458929410221837350101000200220021001200420025000219771600010219102934329793310500020002000500040002948929558116100110001000200006020000002000060001325995526945312603520301338238012041372879510011652213503147542000200010002931729250293092935129644
6400429355236100000000000004641292332018193500010002000200010002000200050002180216000721934292232943431050002000200050054000293802939711610011000100020000602000000200000001201320893976882310103520426330038131336332872110001613113264146162000200010002948029351295462940229500
64004297122350000000000000046542925800184515028100020022002100220022004501521804160329221532967029888752715050202020125035403229798298501416100110001000201320020180354772019202001279989196763309503720985329238135735412941410001633413354148202000200010002938929353293662931329305
64004292872270101101000300046532904700182065000100020002000100020002000500021798160001321801289572919249199500020002000500040002949029235116100110001000200330120023122000282301310994746909308203420596309638142832342891110061619113199142852000200010002956729721295172962329692
64004297062301102002639366691004604296220018694508010132024203010122027203450352223516000122226529836301941074305070203120305060403029967302842316100110001000202826020281083922014202001286688956817303104020538319638094639392913310071654113183146732000200010002995730156299982987529850
64004298282320102200661596795000438630435021864550751011201820281008202220245060225581624012222053016030368823735070202920215045402029875303741916100110001000202226320091031272014202301310893056870315703720785329238104533362931410071656313161143822000200010002996030046300323017729974
640043012924101011010001780014738295040018323500510002000200010012000200050002180416016221976291962939631050002000200050004000293592924011610011000100020032622002012200024200132079587696931244422049032873821831342878910001625813011143092000200010002941929400294582934129410
64004293682270131091000300046542920820181785000100020002000100020002000500521800160001521941290682938771050002000200250004000293982931511610011000100020022412002012222000352101332993536993312433420427329438161637362881510001621812985142442000200010002936229441293492953929396

Test 2: throughput

Count: 8

Code:

  st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 37 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bb | bc | l1d cache miss st nonspec (c0) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3202068005062000001003062410800300160254088278010016549416000080100160000160000480499215557413003491800230800498004903314001002001600001601202004000003200008005080045118020110099010010080000800001001600000000160000002160002000005109117118004380000160000160000801008004580046800458021280045
32020480043620000000030500308003016160254050818010016496016000080100160000160000480499239884912950580800250800508004603324001002001600001600002004000003200008004580045118020110099010010080000800001001600000326201600020081600020232005109117118004280000160000160000801008004580054800468004680050
3202048004962000000003086450802061616025404562801001652681600008010016000016000048049922283521296284080024080045800450326400100200160000160000200400000320000800508004511802011009901001008000080000100160000032001600020011160002000005109117118004280000160000160000801008004680051800468004680051
320204802126200000002130664808002916011248406344801001650071600008010016000016000048049923197121302517080025080045800450326400100200160000160000200400000320000800458004511802011009901001008000080000100160000032001600021001600020032005109117118004280000160000160000801008004680046800518004680046
3202048004562000000003086520800301616025404720801591650221600008010016000016000048049923990931298888080025080045800500327400100200160000160000200400000320000802128004411802011009901001008000080000100160000032001600020081600020232005109317118019780059160000160000801008004680046806398056180046
32020480045620000010333059170801971616025403868801001656811600008015916000016000048049922323331295611080025080212800450331400100200160000160000200400000320000800458004511802011009901001008000080000100160000032001600020001600020032005109117118004180000160000160000801008004680046800518004680046
32020480045620000000122055110800301616025405196801001656171600008010016000016000048049922283001302019080024080045800450327400100200160000160000200400000320000800448004511802011009901001008000080000100160000032001600000014160002020005109117118004280000160000160000801008005180046800468005180046
32020480045620000000000490708003516160254034168010016456816000080100160000160000480499223131812932780800240800458004503284001002001600001600002004000003200008004580213118020110099010010080000800001001600000000160002100160002020005109117118004280000160000160000801008004680046800518004680047
3202048004462000000012304769080030161602540649780100166657160000801001600001600004804992398801129969508002408004980050032740010020016000016000020040000032000080044800451180201100990100100800008000010016000000001600020051600020232005109117118004380000160000160000801008005080213800468004680046
3202048004562100000003051940800291616025404475801001643751600008010016000016000048085122305131298576080025080045800460332400100200160000160000200400000320000800458004511802011009901001008000080000100160000032001600020021600020232005109117118020480000160000160000801008005180046800468005080046

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 24 | 37 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3200268005162100000030531208003016160254061588001016511216000080010160000160000480049222509112909360800258004580045037240001020160000160000204000003200008004580045118002110910108000080000101600000400016000200216000224005019013171478011780000160000160000800108004680046800468004680046
320024800456200000003027450800291600254041558001016365116000080010160000160000480049191982813023290800248004580045032740001020160000160000204000003200008004580045218002110910108000080000101600000400016000200216000224005019061713148004280000160000160000800108004680046800478004780214
320024800456200000003054070800301616025404603800101643141600008001016000016000048004922325011298599080024800458004407274000102016000016000020400000320000800458004511800211091010800008000010160000040001600620021600002400501905251358004280000160000160000800108004680046800498004780046
320024800456200000013230408008003016169325405767800101644581600008001016000016000048004921521811295291080024800458004503274000102016000016000020400300320000800448004411800211091010800008000010160000042001600610021600022400501905175138019880000160000160000800108004680046800478021480046
32002480045621000000303795080030161602540579580010165551160000800101601181600004800492237947130138208002480045800458432740001020160000160000204000003200008004580045118002110910108000080000101600000400016000200216000224005033014176128004280000160000160000800108004680046800468004680046
32002480045620000100302906080030161602540595580010167117160000800101600001600004800492078211130015608002480054800450327400010201601201600002040000032000080044800451180021109101080000800001016000004600160002002160002240050190141714138004280000160000160000800108021380046800558004680046
3200248004562100000030508908003016160254033658001016485816006080010160000160000480049223408812942790800298012380215033640001020160000160000204000003200008004480046118002110910108000080000101600000400016000200216000224005019013177138004280000160000160000800108004680046800468004680046
32002480045621000000305169080030161602540537480010165003160000800101600001600004800492209669129887308002480045800450828400010201600001600002040000032000080045800451180021109101080000800001016000004000160002002160002240050190131711138004280000160000160000800108004980046800468004780046
3200248004662100000030458908003016160254052158001016417616000080010160000160000480049207938213047800800248004580045032740001020160000160000204000003200008004580045118002110910108000080000101600000400016000200516000224005019016176148004280000160000160000800108005580045800468005580046
320024800456200000003038570800291616025404839800691654381600008001016000016000048004922269981293428080024800458004503274000102016000016000020400000320000802138004511800211091010800008000010160000000016000200216000224005019013171358004280000160000160000800108004680045800468004680046