Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

INS (general, H)

Test 1: uops

Code:

  ins v0.h[2], w1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401602025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401702025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401602025169025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401602025169025200010001000100010006347265147020212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401602025169025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401602025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100010731161120371000100020412041204120412041
2004204016020251690252000100010001152100063422651471202120402040145031773200010001000100020002040204011100110001000039731161120371000100020412041204120412041

Test 2: Latency 1->1

Code:

  ins v0.h[2], w1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0040

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202042004015510000020025196892520100100100001000010010000100005005374628481321200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000182100001000007101161120037010000100001002004120041200412004120041
202042004015500001320200251968925201001001000010000100100001000050053518284813212002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000000007101162220037010000100001002004120041200412004120041
2020420040155000000200251968925201001001000010000100100001000050053518284813212002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000000007101161120037010000100001002004120041200412004120041
2020420040155000000200251968925201001001000010000100100001000050053522284813212002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000000007101161120037010000100001002004120041200412004120041
2020420040155000000200251968925201001001000010000100100001000050053518284813212002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000003007101161120037010000100001002004120041200412004120165
20204200401550000002002519689252010010010000100001001000010000500535182848132120021020040200401717531749820100200100001000020010000200002004020040111020110099100100100001000001000002300007101161120037010000100001002004120041200412004120041
20204200401550000002002519689252010010010000100001001000010000500535182848132120021020040200401717531749920100200100001000020010000200002004020040111020110099100100100001000001000004703007101161120037010000100001002004120041200412004120041
2020420040156000000200251968925201001001000010000100100001000050053518284813212002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000000007101161120037010000100001002004120041200412004120041
2020420040155000000200251968925201001001000010000100100001000050053518284956612002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000000007101161120037010000100001002004120041200412004120041
2020420040155000000200251968925201001001000010000100100001000050053518284813212002102004020040171753174982010020010000100002001000020000200402004021102011009910010010000100000100000000007101161120037010000100001002004120041200412004120041

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0040

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002420040155000200251969025200101010000100001010000100005053585284814712002120040201021718431752020010201000010000201000020000200402004011100211091010100001010000000640316222003701000010000102004120041200412004120041
2002420040155000200251969025200101010000100001010000100005053604284814702002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040155000200251969025200101010000100001010000100005053597284814712002120043200401719731752020010201000010000201000020000200402004011100211091010100001010000100640216222003701000010000102004120041200412004120041
20024200401560012200251969025200101010000100001010000100005053601284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040162000200251969025200101010000100001010000100005053589284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040155010200251969025200101010000100001010000100005053585284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040156000200251969025200101010000100001010000100005053590284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
20024200401550024200251969025200101010000100001010000100005053593284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040155000200251969025200101010000100001010000100005053612284814702002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040155000200251969025200101010000100001010000100005053585284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041

Test 3: Latency 1->2 roundtrip

Code:

  ins v0.h[2], w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302041200328990000885352012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162453010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410007100001000010100120033120033120033120033120041
3020412003289900008820012001810945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410000100001000010100120033120033120033120033120046
3020412003289900002850012001710945625401001010020000100001022000010000500573567213679178012001312003312003311552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410000100001000010100120036120040120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012160211957410000100001000010100120033120033120033120033120033
3020412003289900002250012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162311957410000100001000010100120033120033120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000010000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162423010020010000200002001000030000120032120032112020110099100101001000010001000000000131003162211957410000100001000010100120033120033120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410000100001000010100120033120033120033120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0032

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | 18 | 19 | 1e | 1f | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | bb | c2 | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002412003290000000601200171094562540010100102000010000102000010000505735672136700290120013120032120075115550311626230010201000020000201000030000120032120032112002110910100100100001001000000404900127021601211957610056100001000010010122220122732122597122221122640
30024122341919001332742362464120017109456254001010010200001000018244101173561583010413847798012261312347812329111688516411763234499201000020000201000030000120032120032212002110910113340100001001000332000127011601111957410000100001000010010120033120033120033120033120033
3002412003289900000156241200171094562540010100102000010000102000010000505735672136700290120013120059120057115548311626230010201000020000201000030184120032120032112002110910100100100001001000010000127011601111957410000100001000010010120033120033120069120084120033
3002412003289900000001200171094562540010100102000010000102000010000505735672136700290120013120097120385115551311626230010201000020000201000030000120032120032112002110910100100100001001000010000127011601211957410000100001000010010120033120033120033120033120033
3002412003289900000001200171094562540010100102000010000102000010000505735672136700290120013120127120033115548311626230010201000020000201000030000120032120032112002110910100100100001001000000000127011601111957410000100001000010010120033120033120033120033120033
3002412003289900000001200181094562540010100122000010002102000010000505735672136763400121433122396121597116336311626230010201000020123201000030551120205120200261200211091010010010000100100450411949500127021603612110310044100001000010010120034120034120033120033122310
30024122405914310000012001711037562240216100642010810060142315411294665802140138222110120013120032120112115552311626230345201141922067221129233109121949121812271200211091010010010000102100401411145000141513302311957410000100001000010010120033120034120033120033122399
3002412267495400010001200171094562540010100102000410000102011910000505735672136740000120013120041120092115548311626230010201000020000201000030000120032120032112002110910100100100001001000000300127011601111957410000100001000010010120033120033120033120033120033
3002412003289900000001200171094562540010100102000010000102000010000505735672136700291120099120032120085115658311626230010201000020000201000030000120032120032112002110910100100100001001000000000127011601111957410000100001000010010120033120033120033120033120033
3002412003289900000001200171094562540010100102000010000102000010000505735672136700290120013120032120069115551311626230010201000020000201000030000120032120032112002110910100100100001001000000000127011601111957410000100001000010010120033120033120033120033120033

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ins v0.h[2], w8
  movi v1.16b, 0
  ins v1.h[2], w8
  movi v2.16b, 0
  ins v2.h[2], w8
  movi v3.16b, 0
  ins v3.h[2], w8
  movi v4.16b, 0
  ins v4.h[2], w8
  movi v5.16b, 0
  ins v5.h[2], w8
  movi v6.16b, 0
  ins v6.h[2], w8
  movi v7.16b, 0
  ins v7.h[2], w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3759

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2402043009723400300540261601421258000580013125800198001965041648098148703004830069300696141601632008001980019200800191600383006930069111602011009910010016000010008000200111101181163006625800001600001003007030070300703006930069
2402043006923300300530271601431258000580013125800198001965041648098148703004830068300696141601632008001980019200800191600383006830068111602011009910010016000010008000200111101180163006625800001600001003006930069300693007030069
2402043006923300300540271601431258000580013125800198001965041648098320603004830068300697141601632008001980019200800191600383006830068111602011009910010016000010008000200111101190163006525800001600001003006930069300693006930069
2402043006823300300530271601431258000580013125800198001965041648098148703004830219302067141601632008001980019200800191600383006830069111602011009910010016000010008000200111101190163006525800001600001003006930069300693006930069
2402043006923300300530271601431258000580013125800198001965041648098148703004830069300687141601632008001980019200800201600383006930069111602011009910010016000010008000200111101180163006525800001600001003007030069300693007030070
2402043006823300300530271601431258000580013125800198001965041648098148713004830068300686141601632008001980019200800191600383006830068111602011009910010016000010008000200111101190163006625800001600001003007030070300693007030070
2402043006923300300540271601431258000580013125800198001965041648098148703004830068300687141601632008001980019200800191600383006830068111602011009910010016000010008000200111101180163006525800001600001003006930069300693007030069
2402043006923300300540261601431258000580013125800198001965041648098148703004830068300687141601632008001980019200800191600383006930068111602011009910010016000010008000220111101190163006625800001600001003006930070300693007030069
2402043006923300300530271601431258000580013125800198001965041648098148703004830205300697141601632008001980019200800191600383006830068111602011009910010016000010008000200111101180163006625800001600001003006930070300703007030070
2402043006823300300540261601431258000680014125800198001965041721198148703004830069300687141601632008001980019200800191600383006930068111602011009910010016000010008000213111101190163006625800001600001003007030069300693007030069

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3756

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | c9 | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
240024300462250100000030030026160013138000080000138000080000684016069609011110300263004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002316612052422181330042330080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609011110300263004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002113511817211181830042315080000160000103004630046304243004630046
2400243004522500000000300300621600131380000800001380000800006840160696090111103002630045300450330316001320800008000020800001600003004530045111600211091010160000100800000000001002113511817211181730042315080000160000103004630046300463013330046
240024300452250000000030030026160013138000080000138000080000684016069609011110300263004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002113511817211151930042315080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609011110300263004530045032416001320800008000020800001600003004530168111600211091010160000100800000000001002113611617211171830042330080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609011110300883004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002113511717211121930042315080000160000103004630046300463004630046
24002430045225000000003003053126160013138000080000138000080000684016069609011110300263004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002316721717422181930042330080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609010110300263004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002316721717422161130042330080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609010110300263004430045032416001320800008000020800001600003004530045111600211091010160000100800000000001002316721617422181730042330080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609010110300293004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002113611517211171630042315080000160000103004630046300463004630046