Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

DUP (general, 16B)

Test 1: uops

Code:

  dup v0.16b, w0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ld unit uop (a6) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
2004375210360025200010001000100010001407522820035637537572310820001000100010001000375375111001100010000731161137210001000376376376376376
2004375300360225200010001000100010001407522820135637537572310820001000100010001000375375111001100010000731161137210001000376376376376376
2004375310360025200010001000100010001407522820135637537672310820001000100010001000375379111001100010000731161137210001000376376376376376
2004375300360025200010001000100010001407522820135637537572310820001000100010001000375375111001100010003731161137210001000376376376376376
2004380300360025200010001000100010001407522820135637537572310820001000100010001000379375111001100010000751161137210001000376376376376376
2004375200360025200010001000100010001407522820135637537572310820001000100010001000375375111001100010000731161137210001000376376376376376
2004375300360025200010001000100010001407522820135637537572310820001000100010001000375375111001100010000731161137710001000376376376376376
2004375200360025200010001000100010001407522820135637537572310820001000100010001000375378111001100010000751161237210001000376376376377376
2004375210360025200010001000100010001407522820135637537572310820001000100010001000375375111001100010000731161137210001000376376376376376
2004375300360025200010001000100010001407522820135637537572310820001000100010001000375375111001100010000731161137210001000376376376376376

Test 2: Latency 1->2 roundtrip

Code:

  dup v0.16b, w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302041200368990120017109457254010010100200001000010020000100005005735672136720531120013120032120032115525311624330100200100002000020010000200001200321200321120201100991001010010000100010000000100131013242211957410000100001000010100120033120033120033120033120033
3020412003289912120020109456254010010100200001000010020000100005005735672136720531120013120032120033115525311624030100200100002000020010000200001200331200321120201100991001010010000100010006520000131012162211957410000100001000010100120033120033120033120033120034
302041200328990120017109456254010010100200001000010020000100005005735672136720530120013120032120032115525311624030100200100002000020010000200001200321200321120201100991001010010000100010000000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990120017109456254010010100200001000010020000100005005735672136720531120013120032120032115525311624030100200100002000020010000200001200321200321120201100991001010010000100010000000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990120017109456254010010100200001000010020000100005005735672136720531120013120040120032115525311624030100200100002000020010000200001200321200322120201100991001010010000100010000000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990120017109456254010010100200001000010020000100005005735672136720531120013120032120032115525311624030100200100002000020010000200001200321200321120201100991001010010000100010000000000131012162211957410000100001000010100120033120033120033120033120033
3020412003289901200171094562540100101002000010000100200001000050057356721367205311200131200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001000100001000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990120017109456254010010100200001000010020000100005005735672136720531120013120032120032115525311624030100200100002000020010000200001200321200321120201100991001010010000100010000100000131012162211957410000100001000010100120033120033120033120033120033
30204120032899705120017109456254010010100200001000010020000100005005735672136720531120013120032120032115525311624030100200100652000020010000200001200751200331120201100991001010010000100010000000200131012162211957410000100001000010100120033120033120033120033120033
302041200328990120017109456254010010100200001000010020000100005005735672136720531120013120032120032115525311624030100200100002012120010000200001200321200321120201100991001010010000100010000000000131012162211957410000100001000010100120033120033120033120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0051

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024120032899000001200171094562540010100102000010000102000010000505739080136934521120013012003212003211554831162623033620100002000020100002000012003212003211200211091010010100001010000000012701161111957410000100001000010010120033120033120033120033120033
3002412003289900000120017109491254001010010200051000010200001000050573644013670029012001331200651200341155483116262300102010062200002010000200001200731200791120021109101001010000101000000135012701161111957410000100001000010010120033120033120033120033120033
3002412005189900000120017109456254001910010200001000010200001000050573567213670029012001301200321200671155503116262300102010000200002010000200001200321200321120021109101001010000101000000189012701161111957410000100001000010010120033120033120033120033120033
300241200688990000012001710945625400101001020000100001020000100005057356721367002911200130120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000072012701161111957410000100001000010010120058120033120033120033120033
3002412006089900000120017109517254001010010200001000010200001000050573567213670029012001301200321200321155483116262300102010000200002010000200001200321200321120021109101001010000101000000114012701161111957710000100001000010010120033120033120033120033120033
30024120033899000001200171094562540010100102000010000102045610000505735720136700290120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000009012701161111957410000100001000010010120033120033120033120033120033
3002412007689900000120053109456254001010010200001000010200001000050573567213670029012001301200321200321155483116262300102010000200002010000200001200321200331120021109101001010000101000000186012701161111957410000100001000010010120033120033120033120033120033
30024120078899000001200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000000012701161111957410000100001000010010120033120033120033120033120033
300241200398990000012001710945625400101001020000100001020000100005057356721367002901200130120032120032115548311626230010201000020000201000020000120033120032112002110910100101000010100000099012701161011957410000100001000010010120033120033120033120033120036
300241200328990000012001710945625400101001020000100001020000100005057356721367002901200130120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000018012701161111957410000100001000010010120033120033120033120033120033

Test 3: throughput

Count: 8

Code:

  dup v0.16b, w8
  dup v1.16b, w8
  dup v2.16b, w8
  dup v3.16b, w8
  dup v4.16b, w8
  dup v5.16b, w8
  dup v6.16b, w8
  dup v7.16b, w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3351

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 1f | 3a | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602042671120011002266970251601001008000080000100800208001550011695571883252026693267082670866366665716013620080024800242008002080020268832676811802011009910010080000100800001015321115119816382698780000800001002670926709267092670926709
1602042680020011132882268580251601001008000080000100800208037550011689511887370026689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100800000012901115119316832670580000800001002670926709267092670926709
1602042671520011002266930251601001008000080000100800208001650011689511884163026689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100800000017401115119616662670580000800001002670926709267092670926709
16020426708200110022669302516010010080000800001008002080015500116895118973260266892670826708663266657160135200800208002020080020800202670826708118020110099100100800001008000000001115119816382670580000800001002670926709267092670926709
160204267082001100226693025160100100800008000010080020800155001168951188888902668926708267086632666571601352008002080020200800208002026708267081180201100991001008000010080000007801115119816882670580000800001002670926709267092670926709
16020426708200110022669302516010010080000800001008002080015500116895118929320266892670826708663466661160135200800208002020080020800202670826708118020110099100100800001008000000301115119816882670580000800001002670926709267092670926709
16020426708200110022669302516010010080000800001008002080015500116895118863100266892670826708663266657160135200802168002020080020800202670826708118020110099100100800001008000000001115119316882670580000800001002670926709267092670926709
16020426708200110022669302516010010080000800001008020880196500116895118923400266892670826708663266657160135200800208002020080020800202670826708118020110099100100800001008000000001115119816882670580000800001002686426719267182670926709
16020426875200110108226693025160100100800008000010080020800155001168956188416302668926708267086632666571601352008002080020200800208002026711267081180201100991001008000010080000006301115119816982670580000800001002670926709267092670926709
160204268142011100226693025160100100800008000010080020800165001170128188416302668926708267086632666571601352008002080020200800208002026708267081180201100991001008000010080000006301115119616882670580000800001002670926709267092670926709

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3339

retire uop (01) | cycle (02) | 03 | 0e | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024267081992226693251600101080000800001080000800005011688801884032026689267082670866533668816001020800008000020800008000026708267081180021109101080000100800001090502011611267058000080000102670926709267092670926782
160024267082000026693251600101080000800001080000800005011688801884032026689267082670866533668816001020800008000020800008000026708267081180021109101080000100800001300502011611267058000080000102670926709267092670926780
1600242670820000266932516001010800008000010803708000050118383218840321266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000001200502011611267058000080000102670926709267092670926809
1600242671020000266932516001010800008000010800008000050116886818830140266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000001230502011611267058000080000102670926709267112671226738
160024267082000026693251600101080000800001080000800005011688801884032126689267082670866533669016001020800008000020800008000026708267081180021109101080000100800001200502011611267058000080000102670926709267092670926755
1600242670820000266932516001010800008000010800008000050116888018840321266892670826708665336688160010208000080000208000080000267082670811800211091010800001033800001030502011611267058000080000102670926709267092670926774
160024267102000026693251600101080000800001080000800005011688801884032026689267082670866533668816001020800008000020800008000026708267081180021109101080000100800001930502011611267058000080000102670926709267092670926812
1600242671619900266932516001010800008000010805558000050116888018840321266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000001290502011611267058000080000102670926709267092670926795
1600242670820000266932516001010800008000010800008000050116888018840321266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000061110502011611267058000080000102670926709267092670926787
160024267102000026693251600101080000800001080000800005011703611884032126689267082670866533668816001020800008000020800008000026708267081180021109101080000100800002530502011611267058000080000102670926709267092670926798