Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, four registers, 2S)

Test 1: uops

Code:

  ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 04 | 07 | 0a | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
64005284992135210008010492228251000231922000200020001000011624928052287363102000400020002834828647116100110001000120004200402200042413165957271733345479195553084381017767228199154661225814979200020002879228302287562829328357
64004282842169110004010502328149200231462000200020001000041625927996283103102000400020002831528256116100110001000120004200400200040413841928171273290471193593083381613756027952145381301114589200020002842328306286912822828757
640042832121280100040005040281500002318220002000200010000416281285802841931020004000200028722281691161001100010000200042000052000004136651004871063173171197543407381414737628043148481256313933200020002829628697288452839728696
640042834721681100040005049281070002318420002000200010000416277280732836431020004000200028323283151161001100010000200042004002000424138681009871973360264194033325381418747227999145721220514663200020002845828417288682846928343
64004284222169110004010501928558000231612000200020001000021627228020288513102000400020002830228793116100110001000020004200000200000413626100067144342356419361330838198797127940147651304413815200020002834628338283802878128391
64004287332128110006000511728090000232692000200020001000031625628296287843102000400020002822728247116100110001000020004200002200420413938100947165336906819377327838198736127929147311224613928200020002846928318283582840128448
6400428423212101100000005127284380222304020002000200010000116278279942838931020004000200028251282281161001100010000200042000072000222131931010272423204167192583293381423647427982145531246314733200020002835528313284232837128326
640042837421351200000105070282080002323520002000200010000116250279902875231020004000200028341282811161001100010000200042000062000404138631011072253370470193723398381915787327933157491219813634200020002835628243283382838228343
6400428762216111000060005117282420002307020002000200010000316268280642828931020004000200028250283331161001100010001200042002042000400140641012671873470159194123308381515767228030146101211114492200020002839528351283332833128229
640042833721261000040005161281140002310220002000200010000216275279782828231020004000200028380283311161001100010000200042000002000404136731025970933398269193773177381515806528158147351247513905200020002833428345288192868228759

Test 2: throughput

Count: 8

Code:

  ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3202058006760010038000800250121225160100100160000100160000500800853080015080040800403221601002003200002001600008004080040118020110099100100800008000001001600003516003203216003161324000510931733800371101021600001600001008005480041800418004180041
3202048004060000038000800252121225160100100160000100160000500800377180015080040800403221601002003200002001600008004080040118020110099100100800008000001001600003516003203216003261323500510931733800371101021600001600001008004180041800418004180041
3202048004060000038002800252121225160100100160000100160000500800377080015080040800403221601002003200002001600008004080040118020110099100100800008000001001600003516003203216003261323500510931733800371101021600001600001008004180041800418004180041
3202048004060000038100800252121225160100100160000100160000500800377180015080040800403221601002003200002001600008004080040118020110099100100800008000001001600003516003203216003261323500510931733800371101021600001600001008004180041800418004180041
3202048004059901038000800252121225160100100160000100160000500800377080015080040800403221601002003200002001600008004080040118020110099100100800008000001001600003516003203216003261323500510931733800371101021600001600001008004180041800418004180041
3202048004059900038000800252121225160100100160000100160000500800377180015080040800403221601002003200002001600008004080040118020110099100100800008000001001600003516003203216003261323500510931733800371101021600001600001008004180041800418004180041
3202048004060000038000800252121225160100100160000100160000500800377180015080040800403221601002003200002001600008004080040118020110099100100800008000001001600003516003203216003261323500510931733800371101021600001600001008004180041800418004180041
3202048004060000038000800252121225160100100160000100160000500800377180015080040800403221601002003200002001600008004080040118020110099100100800008000001001600003516003203216003261323500510931733800371101021600001600001008004180041800418004180041
320204800406000003800080025212122516010010016000010016000050080037718001538004080040322160100200320000200160000800408004011802011009910010080000800000100160000351600321016003661323500510931733800371101021600001600001008004180041800418004180041
3202048004060000038000800252121225160100100160000100160000500800374080015080040800403221601002003200002001600008004080040118020110099100100800008000001001600003516009063216003261323500513031733800371101021600001600001008004180041800418004180041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32002580041599000000300100800252121225160010101600001016000050800219080015800408004003221600102032000020160000800408004011800211091010800008000001016000002701600240024160024612427005019131787800371662160000160000108004180041800418015180041
320024800406000000003000018002521212251600101016000010160000508000000800158004080040032216001020320000201600008004080040118002110910108000080000010160000035816002400241600246124270050198171011800371662160000160000108004180041800418004180041
32002480040599000000300000800250121225160010101600001016000050800221080015800408004003221600102032000020160000800408004011800211091010800008000001016000002701600240024160032612427005019917812800371662160000160000108004180041800418004180041
3200248004060000000030010080025212122516001010160000101600005080022208001580040800400322160010203200002016000080040800401180021109101080000800000101600000270160024002416002461242700501971799800371662160000160000108004180041800418004180041
3200248004059900000030000080025201225160010101600001016000050800217080015800408004003221600102032000020160000801168004011800211091010800008000001016000002701600240024160024612427005048917108800371660160000160000108004180041800418004180041
32002480040599000000300000800252121225160010101600001016000050800222080015800408004003221600102032018020160000800408004011800211091010800008000001016000002701600820024160024612427005019717119800371662160000160000108004180041800418004180041
320024800405990000003000028002521616251600101016000010160000508009640800158004080040032216001020320000201600008004080040118002110910108000080000010160013134201600530151160038615142130501971786800370992160000160000108004180041800418004180041
3200248004060011100057010280025216162516001010160000101600005080097018001580040800400322160010203200002016000080040800401180021109101080000800000101600000270160024002416002461242700501910171122800371662160000160000108004180041800418004180041
32002480040599000000300000800252121225160010101600001016000050800219080015800408004003221600102032000020160000800408004011800211091010800008000001016000002701600240024160024612427005019917119800371662160000160000108004180041800418004180041
320024800405990000003000008002521212251600101016000010160000508002190800158004080040032216001020320000201600008004080040118002110910108000080000110160000027016002400241600246124270050191017710800371662160000160000108004180041800418004180041