Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3R (2D)

Test 1: uops

Code:

  ld3r { v0.2d, v1.2d, v2.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 5.006

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 3.006

retire (01) | cycle (02) | 03 | 07 | 08 | 09 | 0a | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
65005292482201600161105010045752883520169965006300620003000200010000356133229282914629201310500020003000200060002914629172116100110001000120000200202200040001305791166825307655820250301538085343328458164261332414892200030002929429141292782920529281
650042929821911001410000100459628795101701350063009200030002000100023566152295029135292773105000200030002000600029168291751161001100010000200062000002000000612925931468553037746201523131381212323728443164931319714769200030002924229249292622923729305
65004293092191100150006010045522876800169805000300620003000200010000356835229232904229209310500020003000200060002908929144116100110001000020006200000200040001281591166822304024620101309838176423628424164821336214973200030002915329317292082926329253
65004293022191300110007000046082884300169765009300620003000200010000356105228852912629226310500020003000200060002916129031116100110001000020004200000200000001286692186822306784020177303438236273028559160301330914981200030002930329229292792927129222
650042922221914001400070000462728843001697850063000200030002000100003570522286729037292643105000200030002000600029165291871161001100010000200062000032000400412916917868003088441201263062380814403628358163761328714869200030002920129235292832929329198
6500429258219160090000010046062900200169035000300920003000200010000356862228872916029248310500020003000200060002915429034116100110001000020006200000200040001282792456868304354820110313438134423828364164631335314808200030002925729350293392927429317
6500429273219900100007010045832881300169415009300620003000200010000356245228802915929251310500020003000200060002921529185116100110001000020004200003200240041285591546855306865120203308838137363828370164071337115040200030002922329225292172931329210
6500429292219170080006000045982882400169885006300020003000200010000356935229372918229262310500020003000200060002921229207116100110001000020004200000200140041289391076809307154820140309038117343928395163471337514804200030002927229236292822925529257
65004292562191401110006010045462884600170185009300620003000200010000356895228862905429232310500020003000200060002908529150116100110001000020006200000200040061285291756816311663720203314438088353728469164401311714824200030002922229246293212922229239
650042927221915011200050100452328831021703150093006200030002000100003571252293229112292473105000200030002000600029143291361161001100010000200042000002002400412867923968593085546198083073381815353728366163711337414962200030002930029266292452922029258

Test 2: throughput

Count: 8

Code:

  ld3r { v0.2d, v1.2d, v2.2d }, [x6]
  ld3r { v0.2d, v1.2d, v2.2d }, [x6]
  ld3r { v0.2d, v1.2d, v2.2d }, [x6]
  ld3r { v0.2d, v1.2d, v2.2d }, [x6]
  ld3r { v0.2d, v1.2d, v2.2d }, [x6]
  ld3r { v0.2d, v1.2d, v2.2d }, [x6]
  ld3r { v0.2d, v1.2d, v2.2d }, [x6]
  ld3r { v0.2d, v1.2d, v2.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
4002058006860011100005810080026255025400157100240057160000100240000160000500800853288000018002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800000100160000350160032031600006132005109117118003801401600002400001008004280042800428004280042
4002048004159900000000100800260121202540015710024006316000010024000016000050080037728819781800228004180041032340010020016000024000020016000048000080041800411180201100991001008000080000010016000000160032001600326000051091171180038010141600002400001008004280042800428004280042
4002048004159900000004200080026212120254001631002400571600001002400001600005008008532880000180022800418004103234001002001600002400002001600004800008004180041118020110099100100800008000001001600005301600321016000060040051091171180038010141600002400001008004280042800428004280042
40020480041599000000000028002621212025400157100240057160000100240000160000500800853288332918002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800000100160000530160032001600320136400510911711800380001600002400001008004280042800428004280042
400204800415990000000420008002621212025400163100240057160000100240000160000500800000288000008002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800000100160000530160318433616003260035051091171180124010101600002400001008004280042800428004280042
40020480041599000000042000800262121202540016310024000016000010024000016000050080085328819680800228004180041032340010020016000024000020016000048000080041800411180201100991001008000080000010016000053016003223616003261040051091171180038014101600002400001008004280042800428004280042
40020480041599000000042102800262012025400100100240063160000100240000160000500800376288269208002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800000100160000350160032036160000603235051091171180038110101600002400001008004280042800428004280042
40020480041599000000042002800262121202540010010024006316000010024000016000050080000028800000800228004180041032340010020016000024000020016000048000080041800411180201100991001008000080000010016000035016003203616006060324005109117118003801401600002400001008004280042800428004280042
400204800416000000000000280026212120254001631002400631600001002400001600005008008532880000080022800418069573234001002001600002400002001600004800008004180041118020110099100100800008000001001600003501600320361600326104005109117118003810101600002400001008004280042800428004280042
4002048004160000000000000800262120025400157100240000160000100240000160000500800377288000008002280041800410323400100200160000240000200160000480000800418004111802011009910010080000800001100160000350160032236160036013240051091171180104014101600002400001008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | df | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
400025800566000000000300000180026212120254000101024000016000010240000160000508002192881253180022800418004103234000102016000024000020160000480000800418004111800211091010800008000011016000002701600240024160024612427005035351755380038106160000240000108004280042800428004280042
40002480041599000000030000008002621212025400010102400451600001024000016000050800219288119918002280041800410323400010201600002400002016000048000080041800411180021109101080000800000101600000270160024002416000060027005022341744380038106160000240000108004280042800428004280042
40002480041600000000000100080026212120254000101024000016000010240000160000508002222881212180022800418004103234000102016000024000020160000480000800418004111800211091010800008000001016000002701600230024160024602427005022351766380038166160000240000108004280042800428004280042
40002480041600000000000000080026212120254000101024000016000010240000160000508002192881199180022800418004103234000102016000024000020160000480000800418004111800211091010800008000001016000002701600000001600240000005022341745380038166160000240000108004280042800428004280042
400024800416000000000300100080026012120254000561024004616000010240000160000508002192880000180022800418004103234000102016000024000020160000480000800418004111800211091010800008000001016000002701600240024160000012427005022341744380038106160000240000108004280042800428004280042
400024800416000000000300100080026212120254000561024004616000010240000160000508002192881212180022800418004103234000102016000024000020160000480000800418004111800211091010800008000001016000002701600240024160024612427005022361745380038166160000240000108004280042800428004280042
40002480041599000000060010008002620120254000551024004516000010240000160000508002182881210180022800418004103234000102016000024000020160000480000800418004111800211091010800008000001016000002701600240024160024612427005022341744380038066160000240000108004280042800428004280042
4000248004159900000000010008002621212025400056102400001600001024000016000050800219288120818002280041800410323400010201600002400002016000048000080041800411180021109101080000800000101600000270160024002416002461027005022341744380038106160000240000108004280042800428004280042
400024800416000000000300100080026212120254000561024004616000010240000160000508002192881212180022800418004103234000102016009224000020160000480000800418004111800211091010800008000011016000002701600240024160024612427005022361744380038166160000240000108004280047800428004280042
40002480041599000000030000008002601212025400056102400461600001024000016000050800000288121218002280041800410323400010201600002400002016000048000080041800411180021109101080000800000101600000270160024000160024612427005022341744380038106160000240000108004280042800428004280042