Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

DUP (general, 4S)

Test 1: uops

Code:

  dup v0.4s, w0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4f | 51 | schedule uop (52) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
2004375303610252000100010001000100014075228201356375375723108200010001000100010003753751110011000100000731161137210001000376376376376376
2004376303600252000100010001000100014075228981356375375723108200010001192100010003753751110011000100000731161137210001000376376376376376
2004375303600252000100010001000100014075228200356379375723108200010001000100010003753751110011000100000731161137210001000376376376376376
2004375303600252000100010001000100014075228200356375375723108200010001000100010003753751110011000100000731161137210001000376376376376376
2004375203600252000100010001000100014075228200361375375723108200010001000100010003753751110011000100013731161137210001000376376376376376
2004375303600252000100010001000100014075228201356375380723108200010001000100010003783751110011000100000731161137210001000376377376376376
2004375203600252000100010001000100014075228200356375375723108200010001000100010003753751110011000100010731161137210001000379376376376376
2004375203600252000100010001000100014075228200356375380723108200010001000100010003753751110011000100003731161137210001000376376376376376
2004375203600252000100010001000100014075228201356378376723108200010001000100010003753751110011000100003731161137210001000376376376376376
2004375203600252000100010001000100014075228201356375375723112200010001000100010003753751110011000100000731161137210001000376376376376376

Test 2: Latency 1->2 roundtrip

Code:

  dup v0.4s, w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 19 | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204120032899000120017109456254010010100200001000010020000100505005735672136720531120013012003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000001131012162211957410000100001000010100120101120038120033120033120033
30204120032899000120017109456254010010100200001000010020000100005005735672136720530120013012003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000003000131012162211957410000100001000010100120093120036120033120033120033
3020412003289900012001710945625401001010020000100001002000010000500573567213672053012001301200321200321155253116240301002001000020000200100002013212006012003411202011009910010100100001001000000177000131012162211964310000100001000010100120033120033120033120046120033
3020412007289900675120017109456254010010100200001000010020000100005005735672136720530120013312003212003211552531162403010020010000200002001006520000120032120032112020110099100101001000010010000000000131012162211957410000100001000010100120033120033120034120033120033
30204120033899000120017109456254010010100200001000010020000100005005735672136720530120013012003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000000131012163211957410000100001000010100120033120033120033120033120033
30204120032899000120017109456254010010100200001000010020000100005005735672136720530120013012003212003211552531162403010020010000200002001006720000120032120032112020110099100101001000010010000000000131012162211957410000100001000010100120066120036120033120033120033
30204120032899000120017109456254010010100200001000010020000100005005735672136720530120013012003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000000131013162211957410000100001000010100120033120033120033120033120033
30204120032899000120017109456254010010100200001000010020000100005005735672136720530120017012003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000120017109456254010010100200001000010020000100005005735672136720530120013012003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000000131012162211957710000100001000010100120033120033120033120033120033
30204120032899000120017109456254010010100200001000010020000100005005735672136720531120013012003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010010000000000131012162211957410000100001000010100120033120033120033120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0032

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024120032899012001710945625400101001020000100001020000100005057356721367002900120013120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000001013091161111958910000100001000010010120033120121120118120033120287
300241202529001212002010945625400101001020000100001020000100005057357681367002900120013120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000000012701341111980610000100001000010010120210120033120033120033120034
300241200358991212002010945625400101001020000100001020000100005057356721367025700120013120032120032115548311626230010201000020000201000020000120034120040112002110910100101000010100004200012701161111965610000100001000010010120033120033120033120033120033
30024120032899012001710945625400181001020000100001020000100005057356721367002900120013120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000000012701161111962010000100001000010010120041120034120033120033120033
30024120032899012001710945625400101001420004100001020000100005057356721367002900120013120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000000012701161111963110000100001000010010120033120033120033120033120033
30024120032899012001710945625400101001020000100001020000100005057356721367002900120013120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000000012701161111959610000100001000010010120033120033120033120033120033
30024120032899012001710945625400101001020000100001020000100005057356721367002900120013120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000000012701161111964310000100001000010010120033120033120033120033120033
3002412003289912120017109456254001010010200001000010200001000050573567213670029001200131200321200321155483116262300102010000200002010000200001200321200321120021109101001010000101000100180012701161111963710000100001000010010120033120033120033120033120033
30024120032899012001710945625400101001020000100001020000100005057356721367002900120013120032120032115548311626230010201000020000201000020000120032120032112002110910100101000010100000000012701161111964310000100001000010010120033120033120033120033120033
30024120032899204120017109456254001010010200001000010200001005050573567213670029001200131200321200321155483116262300102010000200002010000200001200321200321120021109101001010000101000000240012701161111957910000100001000010010120033120033120033120033120033

Test 3: throughput

Count: 8

Code:

  dup v0.4s, w8
  dup v1.4s, w8
  dup v2.4s, w8
  dup v3.4s, w8
  dup v4.4s, w8
  dup v5.4s, w8
  dup v6.4s, w8
  dup v7.4s, w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3338

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | a5 | ld unit uop (a6) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020426710200110266931225160100100800008000010080020800155001168951188416312668926708267086632666571601352008002080020200800208002026708267081180201100991001008000010008000097311151172161126840080000800001002670926709267092670926709
16020426708200110266930251601001008000080000100800208001550011689511884163126689267082670866326665716013520080020800202008002080020267102670811802011009910010080000100080000011151171161126711080000800001002670926709267092670926709
16020426708200110266930251601001008000080000100800208001550011689511884163126689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100080000011151171161126713080000800001002670926709267092670926709
16020426708200110266930251601001008000080000100800208001550011689511884163126689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100080000011151171161126820080000800001002670926709267092670926709
16020426708200110266931251601001008000080000100800208001550011689511884163126689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100080000011151171161126711080000800001002670926709267092670926709
16020426708200110266930251601001008000080000100800208001550011689511884163126689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100080000011151171162126778080000800001002670926709267092670926709
16020426708200110266930251601001008000080000100800208001550011689511884163126689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100080000011151171161126705080000800001002670926709267092670926709
160204267082001121266960251603601008000080000100800208001550011689511884163126689267082670866326665716013520080020800202008002080020267112670811802011009910010080000100080000011151171161126817080000800001002670926709267092671326709
16020426708200110266930251601001008000080000100800208001550011689511884163126689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100080000011151171161126713080000800001002670926709267092670926709
16020426708200110266930251601001008000080000100800208001550011689511884163126689267082670866326665716013520080020800202008002080020267082670811802011009910010080000100080000011151171161126821080000800001002670926709267092670926709

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3338

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | a9 | branch cond mispred nonspec (c5) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024267091990026697025160010108000080000108000080000501168880188403201266890267082670866533668816001020800008000020800008000026708267081180021109101080000100080000005020000416004422680408000080000102670926709267092670926709
160024267082000126693025160010108000080000108000080000501168880188403201266890267082670866533668816001020800008000020800008000026708267081180021109101080000100080133005020000916004342685108000080000102670926709267092670926709
160024267082000026693025160010108000080000108000080000501168880188403201266890267082670866533668816001020800008000020800008000026708267081180021109101080000100080000005024000416004422672608000080000102670926709267092670926709
160024267082000026693025160010108000080000108000080000501168880188403211266890267082670866533668816001020800008000020800008021426760267321180021109101080000100080000005020000416004742670808000080000102670926712267092670926709
160024267082000026693025160010108000080000108000080000501168880188403201266890267082670866533668816001020800008000020800008000026708267081180021109101080000100080000005020000416004422671108000080000102670926819269272671226709
160024267082009026697025160010108000080000108000080000501168880188403201266890267082670866533668816001020800008000020800008000026708267081180021109101080000100080000005020000216005422671508000080000102670926709267092670926709
160024267082000026693025160010108000080000108000080000501168880188403201266890267082670866533668816001020800008000020800008000026708267081180021109101080000100080000005020000416004442671008000080000102670926709267092670926709
160024267082000026693025160010108000080000108000080000501168880188403201266890267082670866533668816001020800008000020800008000026708267081180021109101080000100080000005020000516004242671308000080000102670926709267092670926709
160024267082000026693025160010108000080000108000080000501168880188403211266890267082670866533668816001020800008000020800008000026708267081180021109101080000100080000005020000216004242671108000080000102670926709267092670926709
160024267082000026693025160010108000080000108000080000501168880188403201266890267082670866533668816001020800008000020800008000026708267081180021109101080000100080000005020000416003242683708000080000102670926709267092670926709