Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST2 (single, post-index, H)

Test 1: uops

Code:

  st2 { v0.h, v1.h }[1], [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 3.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5e | 5f | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
620062858522210180110000104743283661117582300010001000100010001000100050001090380000240216212836228571310300010001000300020002847328564116100110001000100133010020121000231101327096416924313294619830320838011141412798410001520012714135071000100010002849028530285402856528636
6200428426220011511110132100469228457011759930001000100010001000100010005000109058000090216972805728567310300010001000300020002841828537416100110001000100210010020321000141201320093676899319594519892325938021545422808010001535712539135641000100010002850028500286082853728480
62004286692200190120001004786283921017633300010001000100010001000100050001089880000190216842832728425310300010001000300020002855528462116100110001000100110110011011000141301339697117014326424419947321438191245472814810001553312578133111000100010002856228595285582867228654
62004285922210170141001104780284531117596300010001000100010001000100050001090780000170216812829128557328300010001000300020002855528503116100110001000100220110010221000201101352598717009322753920041318738111041512798410001531512636136021000100010002857928685286242860028496
6200428629222011301310122004794285181117589300010001000100010001000100050001091180000160217102828228506310300010001000300020002865628458116100110001000100214210020011001141001329795406901322554220075322738132246462818310001503812408134241000100010002860928620285592858328558
62004285882220191121003004647283831117616300010001000100010001000100050001090080000120217222832628597310300010001000300020002851728571116100110001000100133110010211000141101340396866990320174019953323738141435382817210001530312495135091000100010002864628516285382867428545
620042861022301100161001004921284941017574300010001000100010001000100050001090180000210217002836828658310300010001000300020002864928686116100110001000100214010020121000241001341897816962312744319997328238061041342814310001529312630131791000100010002857828531285432861928507
6200428576221001609000000497128419101761130001000100010001000100010005000109088000080216802834028608310300010001000300020002852128501116100110001000100003010000031000030001330194797043314863720125330138121339462823310001508112573134571000100010002864828525285002856828578
620042862722101131161003104785284670117633300010001000100010001000100050001090480000140216482840028398310300310001000300020002848628509116100110001000100003010001001000030001322096906973319974019995314838101344372814210001525612433134301000100010002853228556285342854028599
6200428478221001301400120004716284070117574300010001000100010001000100050001090280000150216902836228480310300010001000300020002850328519116100110001000100003010000031000030001339796597065328810401990731893807644392810310001513212550134001000100010002863428564285452860528495

Test 2: throughput

Count: 8

Code:

  st2 { v0.h, v1.h }[1], [x6], x8
  st2 { v0.h, v1.h }[1], [x6], x8
  st2 { v0.h, v1.h }[1], [x6], x8
  st2 { v0.h, v1.h }[1], [x6], x8
  st2 { v0.h, v1.h }[1], [x6], x8
  st2 { v0.h, v1.h }[1], [x6], x8
  st2 { v0.h, v1.h }[1], [x6], x8
  st2 { v0.h, v1.h }[1], [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 24 | 37 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch indir mispred nonspec (c6) | branch mispred nonspec (cb) | cd | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020680040643100100070013618002599225242155801008204380000801048000480000435901437588486449230800150800408004059930660066240108200800088000820024002416001680040800401180201100991001008000080000100800087250080008018800008257110115116001600800378000008000080000801008004180041800418004180041
16020480040642110000670030991800250112252402658010082423800008010480004800004359010375884864548308001508004080040599306599932401082008000880008200240024160016800408004011802011009910010080000800001008000770008000801780000707010115116001600800378000008000080000801008004180041800418004180041
160204800406431000001270051601800251011625244397801008425080000801048000480000435901037588486403940800150805248004059930659993240108200800088000820024002416001680040801621180201100991001008000080000100800087250180008007800018277010115116001600800378000008000080000801008004180041800418004180041
160204800406431101002171015718002511106252425848010082415800008010480004800004359018375884865270308001508004080040599306599932401082008000880008200240024160016800408004011802011009910010080000800001008000782500800081011800017277110115116001600800378000008000080000801008004180041800418004180041
160204800406431100000900191180025111112524214980100820518000080100800008000043589943758848646151080015080040800405992435999824010020080000800002002400001600008004080040118020110099100100800008000010080007700080008007800017257300005110011611800378000008000080000801008004180041800418004180041
1602048004064310010009001651800251111237242121801008342880000801008000080000435899437588486529990800150800408004059924359998240100200800008000020024000016000080040800401180201100991001008000080000100800087250080007008800018257100005110011611800378000008000080000801008004180041800418004180041
160204800406421000000900161180025110125242558801008245880000801008000080000435899037588486472470800150800408004059924359998240100200800008000020024000016000080040800401180201100991001008000080000100800078250180007008800017257000005110011611803908000008000080000801008053880041800418004180041
160204800406421000000900163018002511112252425728010082420800008010080000800004358994375884865275308001508004080040599243599982401002008000080000200240000160000800408004011802011009910010080000800001008000772501800080114800018257000005110011611800378000008000080000801008004180041800418004180041
1602048004064310000009002050180025119025241867801008342680000801008000080000435899437588486405790800150800408004059924359998240100200800008012820024000016000080040800401180201100991001008000080000100800077001800080011800017267000005110011611800378000008000080000801008004180041800418004180041
16020480040643100000090010118002500225241562801008257980000801008023280000435899437748926454760800150800408004059924359998240100200800008000020024000016000080040800401180201100991001008000080000100800078250080008018800018257000005110001611800378000008000080000801008004180041809058040680531

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 24 | 37 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002680160620000000000129202340080025803252456478001084809800008001080000800004358429375884864173700800158004080162599463600202400102080000800002024000016000080040800401180021109101080000800001080000022008000110480001100050200031603380037800008000080000800108004180041800418004180041
1600248004064311010100009047341800259902524257780010809228000080010800008000043584173758848656890008001580040800405994636002024001020800008000020240000160000800408004011800211091010800008000010800087300080008011084800018297150200031603380037800008000080000800108016180041800418004180041
160024800406200100000001290213518002599732524118280010809208000080126800008000043584093758848646214008001580040800405994636002024001020800008000020240000160000800408004011800211091010800008000010800000210080001003800001210050200031604380037800008000080000800108004180041800418004180041
160024800406210000000000001148080025803252437678001084694800008001080000800004358429375884865423500800158004080040599463600202400102080000800002024000016000080040800401180021109101080000800001080000021008000000106880001000050200031602380037800008000080000800108004180041800418004180041
160024800406200000000000409240800258802524070580102846918000080010800008000043584293758848640054008001580040800405994636002024001020800008000020240000160000800408004011800211091010800008000010800000210080001006800011210050200031604380037800008000080000800108004180041800418004180041
160024800406210000000006401148080025880252419578001083757800008001080000800004358429375884864174200800158004080040599463600202403502080000800002024000016000080040800401180021109101080000800001080000021008000100480001100050200032502380037800008000080000800108004180041800418004180041
16002480040620000000000092013800800258812524470680010846928000080010800008000043584293758848656916008001580040800405994636002024001020801208000020240000160000800408004011800211091010800008000010800000210080000003800001210050200041603480037800008000080000800108004180041800418004180041
160024800406200000000001240120800258872252456438001081148800608001080000800004358429375884864209110800158016380040599463600202400102080000800002024000016000080040800401180021109101080000800001080000000080001006800011210050200051604380037800008000080000800108004180041800418004180041
1600248004062000000000002019470800258802524059080010837578000080010800008000043584293758848647031008001580040800405994636002024001020800008000020240360160000800408004011800211091010800008000010800000210080001000800011210050200041603480037800008000080000800108004180041800418004180041
1600248004062100000000002056880800250002524059080010812608000080010801168000043584293758848654088008001580040800405994636002024001020800008000020240000160000800408004011800211091010800008000010800000210080001000800011210250380091604380142800928000080000800108040780163801648040780285