Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

DUP (general, 2D)

Test 1: uops

Code:

  dup v0.2d, x0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4f | 51 | schedule uop (52) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
200437530360025200010001000100010001407522820356375375753108200010001000100010003753751110011000100000731161137210001000376376376378376
200437530360025200010001000100010001407522820356375375723108200010001000100010003753751110011000100000731161137210001000376376376376376
200437620360025200010001000100010001407522820356375375723108200010001000100010003753751110011000100006731161137210001000376376376376376
200437530360025200010001000100010001407522820356377375723108200010001000100010003753751110011000100000731161137210001000376376376376376
200437530360025200010001000100010001407522820356375375723108200010001000100010003753751110011000100000731161137210001000376377376376376
200437520360025200010001000100010001407522820356375375723108200010001000100010003753751110011000100000731161137210001000376376376376376
200437530360025200010001000100010001407522820356375375723108200010001000100010003753751110011000100000731161137210001000379376376376376
200437530360025200010001000100010001407522820356378391723108200010001000100010003753751110011000100000731161137210001000376376376376376
2004375315360025200010001000100010001407522820356375375723108200010001000100010003753751110011000100000731161137210001000376376376376376
200437530360025200010001000100010001407522820356375375723108200010001000100010003753751110011000100000731161137210001000376376376376376

Test 2: Latency 1->2 roundtrip

Code:

  dup v0.2d, x0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302041200348990000012001710945625401001010020000100001002000010000500573567213678201112001312003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000130132812242211957410000100001000010100120033120033120033120033120034
3020412003289900012012001710945625401001010020000100001002000010000500573567213672053112001412003212003211552831162403010020010000200002001000020000120032120032112020110099100101001000010010000401131012162211957410000100001000010100120033120033120033120033120033
302041200328990000012001710947925401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213678206112001312007312003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000131012162211957410000100001000010100120033120033120033120033120033
302041200328991000012001710945625401261010020000100001002000010000500573567213679094112001312003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000131012163211957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000131012162211957410000100001000010100120033120038120033120033120033
3020412003289900000120017109456254010010100200001000010020000100005005735672136788551120013120032120032115525311624030100200100002000020010000200001200321200321120201100991001010010000100100000360131212162211957710000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001022000010000500573567213683257112001312003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000131012162211957410000100001000010100120033120033120076120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0032

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss data (0b) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002412003289900912001710945625400101001020000100001020000100005057356721367002910120013012003212003211554831162643001020100002000020100002000012003212003211200211091010010100001010000000127002161111957410000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002900120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000000129601161111957410000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002900120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000000127001161111957410000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100021020000100005057356721367002900120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000000127003161111957410000100001000010010120033120033120033120033120033
30024120032899000120017109456254001010010200001000010200001000050573567213670029001200130120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000150127001161111957410000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002900120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000000127201161111957410000100001000010010120034120033120033120033120033
3002412003289900312001710945625400101001020000100001020000100005057356721367002915120013012003212009011554831162623001220100002000020100002000012003212003211200211091010010100001010000000127051161111957410000100001000010010120033120033120033120074120077
3002412003289900012001710945638400101001020000100001020000100005057356721367002905120013012003212006711554831162623001020100002000020100002000012003212003211200211091010010100001010000000127053163211957410002100001000010010120033120033120033120033120071
3002412006890010012001710945625400101001020000100021020000100005057381201367082000120013012003212003211554931162623018020100002000020100002000012003212003211200211091010010100001010000000127051161111957410000100001000010010120033120033120033120033120033
3002412003289900012001710949725400101001020000100001020000100006057356811367009015120013012003212003211554731162623017920100002000020100002000012003212010811200211091010010100001010000000127051161111957410000100001000010010120033120033120033120033120033

Test 3: throughput

Count: 8

Code:

  dup v0.2d, x8
  dup v1.2d, x8
  dup v2.2d, x8
  dup v3.2d, x8
  dup v4.2d, x8
  dup v5.2d, x8
  dup v6.2d, x8
  dup v7.2d, x8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3338

retire uop (01) | cycle (02) | 03 | 09 | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602042671120000266935251601001008000080000100800208001550011689511884163126689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100080000001115117017002670580000800001002672426709267092670926709
1602042670820000266940251601001008000080000100800238001850011683521882981026698267132670966309664516014120080023800232008002380023267092670911802011009910010080000100080000102225127124112670680000800001002672026717267102671026710
1602042670920100266940251601001008000080000100800238001850011683521883460126690267092670966219664416014120080023800232008002380023267092670911802011009910010080000100080000002225127124112670680000800001002671826714267182671026710
1602042670920000266940251601001008000080000100800238001850011683521883460126690267092670966219664516014120080023800232008002380023267092670911802011009910010080000100080000002225128124112670680000800001002672726710267102671026710
1602042670920000266940251601001008000080000100800238001850011683521883460126690267092670966219664516014120080023800232008002380023267092670911802011009910010080000100080000002225128124112670680000800001002671026714267152672326710
1602042670920000266940251601001008000080000100800238001850011683521883460026690267092670966219664516014120080023800232008002380023267092670911802011009910010080000100080000002225128124112670680000800001002671026710267102671026710
16020426709200002669402516010010080000800001008002380018500116835218834601266902670926709662110664516014120080023800232008002380023267092670911802011009910010080000100080000002225127124112670680000800001002682826711267182671226710
1602042670920000266940251601001008000080000100800238001850011683521883460026690267092670966219664516014120080023800232008002380023267092670911802011009910010080000100080000002225127124112670680000800001002671826710267152671026710
1602042670919900266940251601001008000080000100800238001850011683521883460026690267092670966219664516014120080023800232008002380023267092670911802011009910010080000100080000002225127124112670680000800001002671026710267102671026710
1602042670920000266945251601001008000080000100800238001850011683521883460026690267092670966219664516014120080023800232008002380023267092670911802011009910010080000100080000002225127124112670680000800001002671026710267112671026710

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3338

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss instruction (0a) | 1e | 3a | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002426714200101122266970251600101080000800001080000800005011688801884032126689267082670866530366881600102080000800002080000800002670826708118002110910108000010080000005022171601315267058000080000102671726709267092670926709
1600242670820010102266930251600101080000800001080000800005011688801884032126689267082670866530366881600102080000800002080000800002670826708118002110910108000010080000005022301601913267058000080000102671026709267092670926709
1600242681320010102266930251600101080000800001080000800005011688801884032126689267082670866530366881600102080000800002080000800002670826708118002110910108000010080000105022151601613267058000080000102671426717267092671826709
1600242670820010202266930251600101080000800001080000800005011688801884032126689267082670866530366881600102080000800002080000800002670826708118002110910108000010080000005022141601512267058000080000102670926709267092670926709
1600242670820010102266930251600101080000800001080000800005011688801884032126689267082670866530366881600102080000800002080000800002670826708118002110910108000010080000005022151601215267058000080000102670926709267112670926709
1600242670820010102266930251600101080000800001080000801785011688801884032126689267082670866530366881600102080000800002080000800002670826708118002110910108000010080000005022131601612267058000080000102670926709267092670926709
1600242670819910102266930251600101080000800001080000800005011688801884032126689267082670866530366881600102080000800002080000800002670826708118002110910108000010080000005022161601415267058000080000102670926709267092670926709
1600242670820010102266930251600101080000800001080000800005011688801884032126689267082670866530366881600102080000800002080000800002670826708118002110910108000010080000005022121601311267058000080000102683226711267172670926709
1600242670820010102266930251600101080000800001080000800005011688801884032126689267082670866530366881600102080000800002080000800002670826708118002110910108000010080000005022121601512267058000080000102670926709267092670926709
1600242670820010102266930251600101080000800001080000800005011688801884032126689267082670866530366881600102080000800002080000800002670826708118002110910108000010080000005022121601513267058000080000102670926709267092670926709