Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3R (1D)

Test 1: uops

Code:

  ld3r { v0.1d, v1.1d, v2.1d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 5.006

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 3.006

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 61 | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | df | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6500529326220015017000000100453428765002170705006300920003000200010000357514022864029048292273105000200030002000600029115291261161001100010001200000200210420004040130039400689130514412016131183818113545328365162721315414997200030002925929308292902928829193
650042939521901201201000500046382883800216974501230062000300020001000035603002295502913229248310500020003000200060002916829116116100110001000020000420020022000624012783911168463036941201513122381264544328419163621335615006200030002933429323292652927429283
6500429263219016014000005000458928808000169765006300620003000200010000357022022912029113293413105000200030002000600029170291671161001100010000200006200000020000040131449068681630561035202633072381694238228407164221328714804200030002926629262293362921929326
6500429214219016014000000000452128763000170665012301220003000200010000356942022930029097291763105000200030002000600029127291281161001100010000200004200000320004060131819180681030317382012030983817104339328419163021334814981200030002927829314292862924229327
650042930822001509010002000458828846000169895003300620003000200010000356907023007029148292603105000200030002000600029146292051161001100010000200000200000020004040129039172683030458362020730903820153329228349162371347114686200030002933529250293122924329247
6500429300219011010000000000453728761000169605006300620003000200010000357712022926029159293143105000200030002000600029158291191161001100010001200004200000020006040126939156682330648392018330623817103634328445162891343914870200030002926729326292282922929284
650042926521801601600000000045592885400016983500030092000300020001000035717302290802912929244310500020003000200060002913429124116100110001000120000620000022000424012900910268413034842201493124381663540328416164891346114741200030002923629262292732922229292
6500429310220013013000000000454028880020170885006300020003000200010000356153022943029044292433105000200030002000600029202291271161001100010001200000200000020004040129499163688430679352024530833819103737228459163771345415004200030002933329226292502919828777
6500429207219018016010004000460928870000169995006300020003000200010000358002022934029053292213105000200030002000600029081291021161001100010001200030200000020024000127499122680130617362010730953819104736228421163481354114948200030002926229246293032921429296
650042936521908014000005000458028818020169615000300020003000200010000357184022887029104292443105000200030002000600029203291921161001100010001200006200000020024040129639155686530536352015030603818104030228382163841341114993200030002932929205293112921129287

Test 2: throughput

Count: 8

Code:

  ld3r { v0.1d, v1.1d, v2.1d }, [x6]
  ld3r { v0.1d, v1.1d, v2.1d }, [x6]
  ld3r { v0.1d, v1.1d, v2.1d }, [x6]
  ld3r { v0.1d, v1.1d, v2.1d }, [x6]
  ld3r { v0.1d, v1.1d, v2.1d }, [x6]
  ld3r { v0.1d, v1.1d, v2.1d }, [x6]
  ld3r { v0.1d, v1.1d, v2.1d }, [x6]
  ld3r { v0.1d, v1.1d, v2.1d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
40020580069600000100075010028002720532540018510024008316001210024002416001650080145628850841800228004380042061340014020016001624002420016001648004880042800421180201100991001008000080000010016001812016005600152160043615143120111511601600800390131301600002400001008004380043800438004380043
40020480042600110000056010028002825532540014110024007716001210024002416001650080145628850781800228004380042071340014020016001624002420016001648004880042800431180201100991001008000080000010016001814431600570025216004461520132111511701600800390131311600002400001008004380043800438004480043
40020480043600100000058000008002605532540016810024006716000010024000016000050080137228848951800228004180041032340010020016000024000020016000048000080041800411180201100991001008000080000010016001314431600130001316003961134313100051091171180038001311600002400001008004280042800428004280042
400204800416001100001570100280026255325400119100240079160000100240000160000500801386288489218002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800000100160013124316005300052160039615243131000510911711800380131301600002400001008004280042800428004280042
40020480041599100000058010028002625532540017910024007916000010024000016000050080010228848921800228004180041032340010020016000024000020016000048000080041800411180201100991001008000080000010016001414431601120725116003961124313000051091171180038001311600002400001008004280042800428004280042
40020480041599101000057000028002605732540016610024007916000010024000016000050080138628848951800228004180041032340010020016000024000020016000048000080041800411180201100991001008000080000010016001313431600530025216000061524313200051091171180038001301600002400001008004280042800428004280042
400204800416001110000580100280026255325400118100240074160000100240000160000500801383288489918002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800000100160014124316005300152160039615243131000510911711800380131301600002400001008010880042800428004280042
4002048004160011100005801002800260553254001681002400791600001002400001600005008013822884895180022800418004103234001002001600002400002001600004800008004180041118020110099100100800008000001001600131543160052000511600006151013000051091171180038013001600002400001008004280042800428004280042
4002048004159911000005800002800262500254001741002400181600001002400001600005008013832884906080022800418004103234001002001600002400002001600004800008004180041118020110099100100800008000001001600121343160051002511600006151012100051091171180038013011600002400001008004280042800428004280042
400204800415991110000120100280026255325400174100240074160000100240000160000500801383288499618002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800000100160015134316005100013160039611343132000510911711800380131311600002400001008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | branch mispredict (cb) | cf | d5 | d6 | d9 | db | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
400025800426000101000010080026012122540001010240063160000102400001600005080085328800000800228004180041032340001020160000240000201600004800008004180041118002110910108000080000010160000351600320035160036610400050196176078800380000160000240000108004280042800428004280105
4000248004160000000000102800262012254000731024006316000010240000160000508008532882004080022800418004103234000102016000024000020160000480000800418004111800211091010800008000001016000040160000003616000000320005019517005880038014140160000240000108004280042800428004280042
400024800415990000004200008002621212254000681024006016000010240000160000508003772881975080022800418004103234000102016000024000020160000480000800418004111800211091010800008000001016000035160000103616003660040005019817005780038010140160000240000108004280042800428004280042
4000248004160000000000102800262121225400073102400631600001024000016000050800853288197508002280041800410323400010201600002400002016000048000080041800411180021109101080000800000101600000160032003216000061035005019517018880038014140160000240000108004280042800428004280042
400024800415991000004200028002621212254000731024006316000010240000160000508008532881977080022800418004103234000102016000024000020160000480000800418004111800211091010800008000001016000035160000003616003661040005019617008880038014100160000240000108004280042800428004280042
4000258004159900000042000080026001225400010102400631600001024000016000050800377288197808002280041800410323400010201600002400002016000048000080041800411180021109101080000800000101600003516003600016003261360005019817007680038014100160000240000108004280042800428004280042
40002480041600000000380000800262121225400010102400631600001024000016000050800853288197808002280041800410323400010201600002400002016000048000080041800411180021109101080000800000101600000160000003216003200323500501951700778003801400160000240000108004280042800428004280042
4000248004160000000042000080026212025400073102400001600001024000016000050800000288196808002280041800410323400010201600002400002016000048000080041800411180022109101080000800000101600000160036000160036003635005019717008580038014140160000240000108004280042800428004280042
40002480041599000000000028002621212254000681024000016000010240000160000508003782881975080022800418004103234000102016000024000020160000480000800418004111800211091010800008000001016000001600330036160032613640005019817007680038110100160000240000108004280042800428004280042
40002480041599000000001028002600122540006810240000160000102400001600005080085328833370800228004180041032340001020160000240000201600004800008004180041118002110910108000080000010160000351600000032160000603240105019717008780038014100160000240000108004280042800428004280042