Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

DUP (general, 2S)

Test 1: uops

Code:

  dup v0.2s, w0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 3a | 3f | 4f | 51 | schedule uop (52) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
2004375311023600252000100010001000100014075228200357375375723108200010001000100010003753751110011000100000755165537210001000376376376376376
2004375311023600252000100010001000100014075228201356375375723108200010001000100010003754931110011000100000755165537210001000376383381376381
2004375311023600252000100010001000100014075228201358375375723108200010001000100010003753751110011000100000755165537210001000376381376379376
2004375311023600252000100010001000100014075228200356375375723108200010001000100010003753751110011000100000755165537210001000376379376379376
2004375311023600252000100010001000100014075228201356375375723108200010001000100010003753751110011000100000755165537210001000376381376379376
2004375311023600252000103310001000100014075228201356375375723108200010001000100010003753751110011000100000755165537210001000376376376376376
2004375311023600252000100010001000100014075228200363375375723108200010001000100010003753751110011000100003755165537210001000376376378376376
2004375211023600252000100010001000100014075230220356375375723108200010001000100010003753751110011000100000755165537210001000376381376376376
2004375311023600252000100010001000100014075228201357375375723108200010001000100010003753751110011000100000755165537210001000376376376376376
2004375311023600252000100010001000100014075228200365375375723108200010001000100010003753751110011000100000755165537210001000376376376376376

Test 2: Latency 1->2 roundtrip

Code:

  dup v0.2s, w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020412003489900000001200171094562540100101002000010000100200001000050057361761367205311200131200321200321155253116298301002001000020000200100002000012003212003211202011009910010100100001000010000000000131013162311957410000100001000010100120033120033120033120033120033
3020412003289900000001200171094562540100101002000010000100200001000050057356721367216411200131200321200321155253116302301002001000020000200100002000012003212003221202011009910010100100001000010000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900000001200171094562540100101002000010000100200001000050057356721367205311200131200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001000010000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900000001200181094562540100101002000010000100200001000050057356721367205311200131200321200321155253116294301002001000020000200100002000012003212003211202011009910010100100001000010000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900000001200171094562540100101002000010000100200001000050057356721367205311200131200321200321155253116249301002021000020000200100002000012003212003221202011009910010100100001000010000010600131013253511957410000100001000010100120033120033120130120036120034
3020412003289900000001200171094562540100101002000010000100200001000050057356721367205311200131200321200321155253116312301002001000020000200100002000012003212003211202011009910010100100001000010000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900000001200171094562540100101002000010000100200001000050057356721367205311200131200791200351155253116304301002001000020000200100002000012003212003211202011009910010100100001000010000000000131014163311957410000100001000010100120033120033120033120033120033
3020412003289900000001200171094562540100101002000010000100200001000050057356721367205311200131200691200321155253116240301002001000020000200100002000012003212003211202011009910010100100001000010000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900000001200171094562540100101002000010000100200001000050057356721367238611200181200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001000010000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900000001200171094562540100101002000010000100200001000050057356721367205311200131200321200321155253116312301252001000020000200100002000012003212003211202011009910010100100001000010000000000131013163311957410000100001000010100120035120298120197120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0032

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024120032899001051200171094562540010100102000010000102000010000505735672136700290120016012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000000012701161111957410000100001000010010120033120033120033120033120033
30024120032899008611200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000000012702161111963710000100001000010010120033120033120033120033120033
30024120032899007861200181094562540010100102000010000102000010000505735672136700291120013012003212003211554831163163001020100002000020100002000012003212003211200211091010010100001010000000012701161211957410000100001000010010120033120033120033120033120033
30024120032899008851200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000100012701161111957410000100001000010010120033120033120033120033120033
30024120032899008821200171094562540010100102000010000102000010000505735672136700290120013012003212003211554891162693001020100002000020100002000012003212003211200211091010010100001010000000012701161111957410000100001000010010120033120033120033120033120033
30024120032899008221200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100002000012011412003811200211091010010100001010000000012701161111957410000100001000010010120033120033120033120033120038
30024120032899003901200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000000012701161111957410000100001000010010120033120041120033120033120033
30024120032900005551200171094562540010100102000010000102000010000505735672136700291120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000000012701161111957410000100001000010010120033120033120033120033120033
30024120032899009541200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100002000012003212003211200211091010010100001010000000012701161111957410000100001000010010120033120033120033120033120033
30024120032899008311200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831164633001020100002000020100002000012003212003211200211091010010100001010000000012701161111957410000100001000010010120033120033120033120033120033

Test 3: throughput

Count: 8

Code:

  dup v0.2s, w8
  dup v1.2s, w8
  dup v2.2s, w8
  dup v3.2s, w8
  dup v4.2s, w8
  dup v5.2s, w8
  dup v6.2s, w8
  dup v7.2s, w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3339

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204267102000172266932516010010080000800001008002080015500116895118841632682226724270946642666731601352008002080020200800208002026708267081180201100991001008000010080000001115117116112670580000800001002671626709267092670926709
16020426708200110266932516010010080000800001008002080015500116895118841632681126717267086632666571601352008002080020200800208002026708267081180201100991001008000010080000001115117116112670580000800001002670926709267092670926709
16020426708200110266932516010010080000800001008002080015500116895118841632678626719267176648666571601352008002080020200800208002026708267081180201100991001008000010080000001115117116112670580000800001002670926709267092670926709
160204267082001115266932516010010080000800001008002080015500116895118841632680526708267356652666571601352008002080020200800208002026708267081180201100991001008000010080000001115117116112670580000800001002670926709267092670926709
16020426708200110266932516010010080000800001008002080015500116895118841632668926708268716721666571601352008002080020200800208002026708267081180201100991001008000010080000001115117116112670580000800001002670926709267092670926709
16020426708200110266932516010010080000800001008002080015500116895118841632682727352267086632666571601352008002080020200800208002026708267081180201100991001008000010080000001115117116112670580000800001002670926709267092670926709
16020426708200110266932516010010080000800001008002080015500116895118841632681826717267136632666571601352008002080020200800208002026708267081180201100991001008000010080000001115117116112670580000800001002670926709267092670926709
16020426708206110266932516010010080000800001008002080015500116895118841632680826721267146632666571601352008002080020200800208002026708267081180201100991001008000010080000001115117116112670580000800001002670926709267092670926709
16020426708200110266932516010010080000800001008002080015500116895118841632680526716267186634666571601352008002080020200800208002026708267111180201100991001008000010080000001115117116112670580000800001002670926709267092670926709
16020426708200110266932516010010080000800001008002080015500116895118841632675726714267176647666571601352008002080020200800208002026708267081180201100991001008000010080000001115117116112670580000800001002670926709267092670926709

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3338

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | a5 | ld unit uop (a6) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002426713200026693025160010108000080000108000080000501168880188403212675826800267106653036688160010208000080000208000080000267082670811800211091010800001008000000502031611267058000080000102670926709267092670926709
16002426708200026693025160010108000080000108000080000501168880188403212675826710267126657036695160010208000080000208000080000267082670811800211091010800001008000000502011612267058000080000102670926709267092670926709
16002426708200026693025160010108000080000108000080000501168880188403202677826782267086658036688160010208000080000208000080000267082670811800211091010800001008000000502011611267058000080000102670926709267092670926709
16002426708200026693025160010108000080000108000080000501168880188403202684726717267086653036688160010208000080000208000080000267082670811800211091010800001008000000502011611267058000080000102670926709267092670926709
16002426708200026693025160010108000080000108000080000501168880188403202687726716267086653036688160010208000080000208000080000267082670811800211091010800001008000000502011611267058000080000102670926709267092670926709
160024267082000266930251600101080000800001080000800005011688801884032026757267142671766670366881600102080000800002080000800002670826708118002110910108000010080000121502011611267058000080000102670926709267092670926709
160024267082000266930251600101080000800001080000800005011688801884032026689267082689366598936691160010208000080000208000080000267082670811800211091010800001008000000502011611267058000080000102670926709267092670926709
160024267082001226693025160010108000080000108000080000501168880188856512675426708267226672036688160010208000080000208000080000267082670811800211091010800001008000000502011611267058000080000102670926709267092670926709
16002426708200026693025160010108000080000108000080000501168880188403202675126715267176666036688160010208000080000208000080000267082670811800211091010800001008000000502011611267058000080000102670926709267092670926709
16002426708200026693025160010108000080000108000080000501168880188403212676226715267156663036688160010208000080000208000080000267082670811800211091010800001008000000502011611267058000080000102670926709267092670926709