Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

DUP (general, 16B)

Test 1: uops

Code:

  dup v0.16b, w0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 07 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a6 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
2004375210360025200010001000100010001407522820035637537572310820001000100010001000375375111001100010000731161137210001000376376376376376
2004375300360225200010001000100010001407522820135637537572310820001000100010001000375375111001100010000731161137210001000376376376376376
2004375310360025200010001000100010001407522820135637537672310820001000100010001000375379111001100010000731161137210001000376376376376376
2004375300360025200010001000100010001407522820135637537572310820001000100010001000375375111001100010003731161137210001000376376376376376
2004380300360025200010001000100010001407522820135637537572310820001000100010001000379375111001100010000751161137210001000376376376376376
2004375200360025200010001000100010001407522820135637537572310820001000100010001000375375111001100010000731161137210001000376376376376376
2004375300360025200010001000100010001407522820135637537572310820001000100010001000375375111001100010000731161137710001000376376376376376
2004375200360025200010001000100010001407522820135637537572310820001000100010001000375378111001100010000751161237210001000376376376377376
2004375210360025200010001000100010001407522820135637537572310820001000100010001000375375111001100010000731161137210001000376376376376376
2004375300360025200010001000100010001407522820135637537572310820001000100010001000375375111001100010000731161137210001000376376376376376

Test 2: Latency 1->2 roundtrip

Code:

  dup v0.16b, w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
302041200368990120017109457254010010100200001000010020000100005005735672136720531120013120032120032115525311624330100200100002000020010000200001200321200321120201100991001010010000100010000000100131013242211957410000100001000010100120033120033120033120033120033
3020412003289912120020109456254010010100200001000010020000100005005735672136720531120013120032120033115525311624030100200100002000020010000200001200331200321120201100991001010010000100010006520000131012162211957410000100001000010100120033120033120033120033120034
302041200328990120017109456254010010100200001000010020000100005005735672136720530120013120032120032115525311624030100200100002000020010000200001200321200321120201100991001010010000100010000000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990120017109456254010010100200001000010020000100005005735672136720531120013120032120032115525311624030100200100002000020010000200001200321200321120201100991001010010000100010000000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990120017109456254010010100200001000010020000100005005735672136720531120013120040120032115525311624030100200100002000020010000200001200321200322120201100991001010010000100010000000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990120017109456254010010100200001000010020000100005005735672136720531120013120032120032115525311624030100200100002000020010000200001200321200321120201100991001010010000100010000000000131012162211957410000100001000010100120033120033120033120033120033
3020412003289901200171094562540100101002000010000100200001000050057356721367205311200131200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001000100001000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990120017109456254010010100200001000010020000100005005735672136720531120013120032120032115525311624030100200100002000020010000200001200321200321120201100991001010010000100010000100000131012162211957410000100001000010100120033120033120033120033120033
30204120032899705120017109456254010010100200001000010020000100005005735672136720531120013120032120032115525311624030100200100652000020010000200001200751200331120201100991001010010000100010000000200131012162211957410000100001000010100120033120033120033120033120033
302041200328990120017109456254010010100200001000010020000100005005735672136720531120013120032120032115525311624030100200100002012120010000200001200321200321120201100991001010010000100010000000000131012162211957410000100001000010100120033120033120033120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0051

retire (01) | cycle (02) | 03 | 08 | 0b | 19 | 1e | 1f | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
30024120032899000001200171094562540010100102000010000102000010000505739080136934521120013012003212003211554831162623033620100002000020100002000012003212003211200211091010010100001010000000012701161111957410000100001000010010120033120033120033120033120033
3002412003289900000120017109491254001010010200051000010200001000050573644013670029012001331200651200341155483116262300102010062200002010000200001200731200791120021109101001010000101000000135012701161111957410000100001000010010120033120033120033120033120033
3002412005189900000120017109456254001910010200001000010200001000050573567213670029012001301200321200671155503116262300102010000200002010000200001200321200321120021109101001010000101000000189012701161111957410000100001000010010120033120033120033120033120033
300241200688990000012001710945625400101001020000100001020000100005057356721367002911200130120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000072012701161111957410000100001000010010120058120033120033120033120033
3002412006089900000120017109517254001010010200001000010200001000050573567213670029012001301200321200321155483116262300102010000200002010000200001200321200321120021109101001010000101000000114012701161111957710000100001000010010120033120033120033120033120033
30024120033899000001200171094562540010100102000010000102045610000505735720136700290120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000009012701161111957410000100001000010010120033120033120033120033120033
3002412007689900000120053109456254001010010200001000010200001000050573567213670029012001301200321200321155483116262300102010000200002010000200001200321200331120021109101001010000101000000186012701161111957410000100001000010010120033120033120033120033120033
30024120078899000001200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000000012701161111957410000100001000010010120033120033120033120033120033
300241200398990000012001710945625400101001020000100001020000100005057356721367002901200130120032120032115548311626230010201000020000201000020000120033120032112002110910100101000010100000099012701161011957410000100001000010010120033120033120033120033120036
300241200328990000012001710945625400101001020000100001020000100005057356721367002901200130120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000018012701161111957410000100001000010010120033120033120033120033120033

Test 3: throughput

Count: 8

Code:

  dup v0.16b, w8
  dup v1.16b, w8
  dup v2.16b, w8
  dup v3.16b, w8
  dup v4.16b, w8
  dup v5.16b, w8
  dup v6.16b, w8
  dup v7.16b, w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3351

retire (01) | cycle (02) | 03 | 07 | 0a | 1e | 1f | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602042671120011002266970251601001008000080000100800208001550011695571883252026693267082670866366665716013620080024800242008002080020268832676811802011009910010080000100800001015321115119816382698780000800001002670926709267092670926709
1602042680020011132882268580251601001008000080000100800208037550011689511887370026689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100800000012901115119316832670580000800001002670926709267092670926709
1602042671520011002266930251601001008000080000100800208001650011689511884163026689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100800000017401115119616662670580000800001002670926709267092670926709
16020426708200110022669302516010010080000800001008002080015500116895118973260266892670826708663266657160135200800208002020080020800202670826708118020110099100100800001008000000001115119816382670580000800001002670926709267092670926709
160204267082001100226693025160100100800008000010080020800155001168951188888902668926708267086632666571601352008002080020200800208002026708267081180201100991001008000010080000007801115119816882670580000800001002670926709267092670926709
16020426708200110022669302516010010080000800001008002080015500116895118929320266892670826708663466661160135200800208002020080020800202670826708118020110099100100800001008000000301115119816882670580000800001002670926709267092670926709
16020426708200110022669302516010010080000800001008002080015500116895118863100266892670826708663266657160135200802168002020080020800202670826708118020110099100100800001008000000001115119316882670580000800001002670926709267092670926709
16020426708200110022669302516010010080000800001008020880196500116895118923400266892670826708663266657160135200800208002020080020800202670826708118020110099100100800001008000000001115119816882670580000800001002686426719267182670926709
16020426875200110108226693025160100100800008000010080020800155001168956188416302668926708267086632666571601352008002080020200800208002026711267081180201100991001008000010080000006301115119816982670580000800001002670926709267092670926709
160204268142011100226693025160100100800008000010080020800165001170128188416302668926708267086632666571601352008002080020200800208002026708267081180201100991001008000010080000006301115119616882670580000800001002670926709267092670926709

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3339

retire (01) | cycle (02) | 03 | 0e | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a5 | a6 | a8 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024267081992226693251600101080000800001080000800005011688801884032026689267082670866533668816001020800008000020800008000026708267081180021109101080000100800001090502011611267058000080000102670926709267092670926782
160024267082000026693251600101080000800001080000800005011688801884032026689267082670866533668816001020800008000020800008000026708267081180021109101080000100800001300502011611267058000080000102670926709267092670926780
1600242670820000266932516001010800008000010803708000050118383218840321266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000001200502011611267058000080000102670926709267092670926809
1600242671020000266932516001010800008000010800008000050116886818830140266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000001230502011611267058000080000102670926709267112671226738
160024267082000026693251600101080000800001080000800005011688801884032126689267082670866533669016001020800008000020800008000026708267081180021109101080000100800001200502011611267058000080000102670926709267092670926755
1600242670820000266932516001010800008000010800008000050116888018840321266892670826708665336688160010208000080000208000080000267082670811800211091010800001033800001030502011611267058000080000102670926709267092670926774
160024267102000026693251600101080000800001080000800005011688801884032026689267082670866533668816001020800008000020800008000026708267081180021109101080000100800001930502011611267058000080000102670926709267092670926812
1600242671619900266932516001010800008000010805558000050116888018840321266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000001290502011611267058000080000102670926709267092670926795
1600242670820000266932516001010800008000010800008000050116888018840321266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000061110502011611267058000080000102670926709267092670926787
160024267102000026693251600101080000800001080000800005011703611884032126689267082670866533668816001020800008000020800008000026708267081180021109101080000100800002530502011611267058000080000102670926709267092670926798