Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3 (multiple structures, 4H)

Test 1: uops

Code:

  ld3 { v0.4h, v1.4h, v2.4h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 5.009

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 3.009

retire (01) | cycle (02) | 03 | 04 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 18 | 1e | 22 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6500528592213101301001010150982804802162025012301220003000200010000357930082294402841128352310500020003000200060002812828339116100110001000120043420050291520004242113411975070383454578194573158382616586527897140781233413560200030002860328502280392850628542
6500428324213411111000204938282200016327501530092000300020001000035638008228900283582847531050002000300020006000282212850911610011000100002004302002002520000242613809989572283306161192013197382218585927918146991228113697200030002850228382285082850428579
65004285982135111110002147142825310162585003300920003000200010000356431002297902849328410310500020003000200060002842728328116100110001000020023420050341420004442013765978870343330057193623210382027636727972150261245214173200030002864728564286282856328343
65004285652137111110002049132829800163245009300920003000200010000357553182296602845528609310500020003000200060002842928326116100110001000020023420030282520024262113175967871083420064193283320382318626228014151421200314216200030002854928429284422830728370
65004283952134111110004047822807200161965012301220003000200010002357460082299402851328574310500020003000200060002852728456116100110001000020033420020332220024242113455998071843341263192203177382113626028040141641243213887200030002829928575286692850928237
6500428333213211111000504943280480016156500930032000300020001000035628118229870284092838431050002000300020006000284732826111610011000100002002362005032720004262113297967770103279162194713221382318565928060146751213013871200030002854828564285492849428489
6500428281214611111000204860280742116155500930092000300020001000035724000229590283072841031050002000300020006000285142822811610011000100002002242004002720004202213407978271173306158194373346382119575527944150901241913749200030002852328474283722846828578
650042862221351111000030509328130001631850123009200030002000100003572900822982028198284523105000200030002000600028278281721161001100010000200324200203912200042421133631003872053170259191683183381914616728022153461207813527200030002861528381283392845628257
6500428321212611010000299150752830200162125009300920003000200010000357611102293102839628258310500020003000200060002833328188116100110001000020034420020022200004421133441004171653360059191323197381716606628050146211196113850200030002812128506282932842528478
650042834121241111110031496827964001630650093009200030002000100033563750022968028301283523105000200030002000600028255282211161001100010000200234200403712200002622137081006370853355267193613198381612596527870144371225213064200030002839428564283752815128543

Test 2: throughput

Count: 8

Code:

  ld3 { v0.4h, v1.4h, v2.4h }, [x6]
  ld3 { v0.4h, v1.4h, v2.4h }, [x6]
  ld3 { v0.4h, v1.4h, v2.4h }, [x6]
  ld3 { v0.4h, v1.4h, v2.4h }, [x6]
  ld3 { v0.4h, v1.4h, v2.4h }, [x6]
  ld3 { v0.4h, v1.4h, v2.4h }, [x6]
  ld3 { v0.4h, v1.4h, v2.4h }, [x6]
  ld3 { v0.4h, v1.4h, v2.4h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
40020580054599101005701080026255325400120100240068160000100240000160000500801386288025818002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800000100160015130160015025216003961524313151092172280038131311600002400001008004280042800428004280042
400204800415991100012002800262503254001741002400771600001002400001600005008000422884893180022800418004103234001002001600922400002001600004800008004180041118020110099100100800008000001001600131301600530151160039611201305109217228003813011600002400001008004280042800428004280042
400204800416001110058012800262053254001791002400741600001002400001600005008013872884892180022802498004103234001002001600002400002001600004800008004180041118020110099100100800008000001001600131201600540152160039605201215109217228003801311600002400001008004280042800428004280042
4002048004159910100130128002625532540017410024007416000010024000016000050080004228848931800228004180041032340010020016000024000020016000048000080041800411180201100991001008000080000010016001413431600530155160039615201315109217228010301311600002400001008004280042800428004280042
40020480041599110005800280026250325400120100240068160000100240000160000500801380288660218002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800001100160013150160052011216000001524313151092172280038131301600002400001008004280042800428004280042
4002048004160011000580108002625532540016810024002116000010024000016000050080137228848951800228004180041032340010020016000024000020016000048000080041800411180201100991001008000080000010016001413431600530155160039611343131510921722800380001600002400001008004280042800428004280042
400204800415991110058002800262550254001791002400711600001002400001600005008013872884892180022800418004103234001002001600002400002001600004800008004180041118020110099100100800008000001001600131343160013015116000061524313151092172280038131301600002400001008004280042800428004280042
40020480041600111005801280026255025400168100240019160000100240000160000500800042288027818002280140800410323400100200160000240000200160000480000800418004111802011009910010080000800000100160013144316001402541600396151431325109217228003801311600002400001008004280042800428004280042
40020480041599110001301280026255325400118100240071160000100240000160000500801383288024918002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800000100160015154316005112701600390152431305109217228003801311600002400001008004280042800428004280042
40020480041599101001200280026200325400118100240074160000100240000160000500800042288489318002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800000100160014144316005300521600390113012151092172280038131301600002400001008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cf | d5 | d6 | db | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
400025800695990001005700028002620120254000731024000016000010240000160000508013862880278080022080041800410032340001020160000240000201600004800008004180041118002110901010800008000011016001314016005400521600396152431310501931703280038013130160000240000108004280042800428004280042
40002480041599101000120002800262012025400073102400631600001024000016000050800853288196808002208004180041003234002352016000024000020160000480000800418004111800211090101080000800000101600000351600320032160000613235000501931702380038014100160000240000108004280042800428004280042
4000248004160000000042000280026012120254000101024006316000010240000160000508008532880000180022080041800410032340001020160000240000201600004800008004180041118002110901010800008000001016000003516003600016000061324000050193170338003801400160000240000108004280042800428004280042
400024800416000000004201028002620120254000101024000016000010240000160000508003742881968080119080041800410032340001020160000240000201600004800008004180041118002110901010800008000001016000003516000010361600366003500050192170338003801400160000240000108004280042800428004280042
40002480041599000000420000800262120025400073102400631600001024000016000050800000288000008002208004180041003234000102016000024000020160000480000800418004111800211090101080000800000101600121343160051015116003961510131050193170338003800130160000240000108004280042800428004280042
4000248004159911100013000080026205325400089102400791600001024000016000050800048288025808002208004180041073234000102016000024000020160000480000800418004111800211090101080000800000101600000351600322032160000613240000501931703380038010140160000240000108004280042800428004280042
40002480041599000000380002800261121202540001010240063160000102400001600005080037728800000800220800418004100323400010201600002400002016000048000080041800411180021109010108000080000010160000035160036003216003561320000501931703380038014100160000240000108004280042800428004280042
40002480041599000000420002800260121202540007310240063160000102400001600005080085328833651800220800418004100323400010201600002400002016000048000080041800411180021109010108000080000010160000035160036003616003600364000050193170338003800100160000240000108004280042800428004280042
4000248004160000010042000280026201202540001010240063160000102400001600005080085328819680800220800418004100323400010201600002400002016000048000080041800411180021109010108000080000010160000035160000000160032600400005019317023800380000160000240000108004280042800428004280042
40002480041599101000130002800262553254000891024007916000010240000160000508013862884997080022080041800410032340001020160000240000201600004800008004180041118002110901010800008000011016001313431600540113160039005243123050193170328003801400160000240000108004280042800428004280042