Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, post-index, 3 regs, 4H)

Test 1: uops

Code:

  st1 { v0.4h, v1.4h, v2.4h }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 4.000

Integer unit issues: 1.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
63006291042331040040003004658287033017926400010001000200010001000200050001591180006092166128681290973104000200010005000200029042288981161001100010002006262200014825200004021231334889526797303404420917333138211145412882710011616913414143752000100010002952729439294122951229584
6300429464238002102130000471329347001840540001000100020001000100020005000159038000200217482927329588172740082004100150052000295762950831610011000100020090622000046052000160201317893776986314513920966347438162249492895510021632413392145672000100010002959929549296772961429646
6300429681237003004000910474329427001873040001000100020001000100020005000159018008500217132900929417310400020001001500020002953829422116100110001000200000020002092000000001318093736934320914220983325438121539372867310001648713098143602000100010002924229398294002934329593
63004294262380040020001510468129216121838040001000100020001000100020025000159028000300217492907829468310400020001000500020002939729401116100110001000200004020000032000060401293392376836306805021053320538153750422935610071637413348145182000100010002973830206299803014130160
630042998324200200211614224415850470729375011848240001000100020001000100020005000159058000000217062936629711328400020001000500020002931129362116100110001000200004020001032000000001319792016933313214520706329738201347382873210001600613129146342000100010002943129352294252940829440
630042927622800210400001046362913101183824000100110002000100010002000500015900800040021717290662932431040002000100050002000293382949911610011000100020000402002000200004000131169442698831271432083632163823943432873310001609113046144302000100010002928529289293002939329286
6300429348228001002000010467929058011836140001000100020001000100120005000159028000000217202914929373310400020001000500020002934829291116100110001000200404020000202000000001321891946983317804620647333538211247482874110001606813267143032000100010002935629437293942938329406
6300429286228003003000010468629172001834940001000100020001000100120005000159128000200217412906329372310400020001000500020002932229284116100110001000200006020000002000000001327893706893319514520717329038161042482876610001658913769150062000100010002950729314292772957429343
6300429494235004002116025501761044492893420184744031100610002018100910072012507016553800812082202728803286713120540362000100050052000286992857611610011000100020002632011104752001160001306491946813304204720387329438061944442840210001577612810141032000100010002892929110290462911829052
630042906222410300400002048822844133176904000100010002000100010002000500015909800070021712284092859031040002000100050002000288182864211610011000100020000602001004200006000135839736697531512421985231583816837382813610001502112255136322000100010002858228775287712866128552

Test 2: throughput

Count: 8

Code:

  st1 { v0.4h, v1.4h, v2.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h }, [x6], x8
  st1 { v0.4h, v1.4h, v2.4h }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 24 | 37 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cd | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
240206800586210000000300260800341616253253201348010080036160000801008000016000018302863679541640154080018800428004249956350000320100200160000800002004000001600008004280042118020110099100100800008000010016000003400160002109421600022000005110116013800478000016000080000801008004180044802118005180043
2402048004062000000009002308002500025320136801008004116000080100800001600001830286367954164009408001880042800424996235000132010020016000080000200400000160000800438004211802011009910010080000800001001600000340016000200216000223400005110116011800398000016000080000801008004480044800518004380043
2402048004262000000012900190800271616025320122801008002216000080100800591600001830286367954164012008001880043800435330335000032010020016000080000200400000160000800408004311802011009910010080000800001001600000340016000200216000223400005110116011800398005816000080000801008005080043800438004480044
24020480043621000000030011080028161602532014780100800191600008010080000160000135069336798296423310800188004280042499563500013201002001601228000020040000016000080042800401180201100991001008000080000100160000034001600020081600022000005110116011800478000016000080000801008005180044800448005180043
240204800426210000000900190800271616025320152801008004816000080100800001600001830286367954164066308002880042800424995635000132010020016000080000200400000160000800428004211802011009910010080000800001001600000340016000210216000223400005110116011800408000016000080000801008004180043800438004480044
240204800436210000001323007020800351616325320125801008001816000080100800001600001370706367954164013808002580043800434995635000032010020016012280000200400000160000800428004211802011009910010080000800001001600000340016000200216000223400005110116311800468000016000080000801008004480213800438004380044
2402048004062100000003005808002816002532011180100800211600008010080000160000182626036795416400750800188005080049499563500013201002001600008000020040000016000080050800401180201100991001008000080000100160000000016000420516000023400005110116011800468000016000080000801008005180044800448005080043
24020480042620000000000124080028161602532014180100800041600008010080000160000183028636795416400760800258004280042499563500013201002001600008000020040000016000080042800421180201100991001008000080000100160000034001600020094216000223400005110116011800408000016000080000801008004380043800518004180044
2402048004062100000012601210800331616025320111801008081516000080100800001600001370692368643864014908001880042800494995635000032010020016000080000200400000160000800498005011802011009910010080000800001001600000340016000200816000223400005110116011800478000016000080000801008004180044800448005180043
240204800406200000000901665080028161602532012180100805381600008010080000160000183028636795416400880800188004980049499563500073201002001601228000020040000016000080049800491180201100991001008000080000100160000034001600021051600022000005110116031800468000016000080000801008004480050802118004380044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 37 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | cd | cf | d2 | d5 | map dispatch bubble (d6) | d9 | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2400268004262100000031007080027161602532001280010800031600008001080000160000182976936795416400110080018080042800434997835002332001020160141800002040000016000080043800431180021109101080000800001016000004200160002021600022000502004160034800408000016000080000800108004480044800438004380044
2400248004362000000030007080028161602532001280010800031600008001080000160000182976936795416400100080019080043800434997835002232001020160000800002040000016000080043800431180021109101080000800001016000000001600020016000204200502005160046800408000016000080000800108004380043800448004480044
24002480042621000000300000800281616025320013800108000216000080010800001600001829769367954164000700800180800438004349978350020320010201600008000020400000160000800408004011800211091010800008000010160000042001600020016000224200502006160095800408000016000080000800108004380044800448004180044
240024800436200000003000408002816160253200138001080004160000800108000016000018297693679541640008008001808004380043499783500233200102016000080000204000001600008004380043118002110910108000080000101600000420016000201716000224200502005160082800408000016000080000800108004480043800438004480041
240024800436200000012300060800251616025320012800108000216000080010800001600001829769367954164000700800180800408004349978350023320010201600008000020400000160000800428004311800211091010800008000010160000042001600621216000224200502005160032800408000016000080000800108004480044800448004380044
24002480043643000009200040800271616025320014800108000516000080010800001600001829769367954164000010800270800438004349978350023320010201600008000020400000160000800408004311800211091010800008000010160000044001600021516000424200502004160043800408000016000080000800108004480043800448004380044
24002480042643000000300010800281616025320013800108000116000080010800001600001829769367954164001200800180800438004349978350023320010201600008000020400000160000800428004311800211091010800008000010160000042001600020216000224200502006160025800398000016000080000800108004380044800448004180044
24002480042643000000300000800281616025320014800108000216000080074800001600001829769367954164000700800180800438004349978350022320010201600008000020400000160000800438004211800211091010800008000010160000000016000201116000224200502006160052800508000016000080000800108004480044800448004380043
240024800436430000012300020800281616025320014800108000216000080010800001600001829769367954164001600800180800438004249978350023320010201600008000020400000160000800438004311800211091010800008000010160000042001600021216000224200502004160047800398000016000080000800108004480044800448004480044
240024800426430000003000408002700025320011800108000616000080010800001600001829769367954164000700800180800438004049978350023320010201600008000020400000160000800438004311800211091010800008000010160000042001600020516000224200502005160023800498000016000080000800108004480044800448004380043