Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

INS (general, H)

Test 1: uops

Code:

  ins v0.h[2], w1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401602025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401702025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401602025169025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401602025169025200010001000100010006347265147020212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401602025169025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401602025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100010731161120371000100020412041204120412041
2004204016020251690252000100010001152100063422651471202120402040145031773200010001000100020002040204011100110001000039731161120371000100020412041204120412041

Test 2: Latency 1->1

Code:

  ins v0.h[2], w1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0040

retire (01) | cycle (02) | 03 | 04 | 08 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a5 | a6 | a7 | a8 | a9 | ac | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
202042004015510000020025196892520100100100001000010010000100005005374628481321200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000182100001000007101161120037010000100001002004120041200412004120041
202042004015500001320200251968925201001001000010000100100001000050053518284813212002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000000007101162220037010000100001002004120041200412004120041
2020420040155000000200251968925201001001000010000100100001000050053518284813212002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000000007101161120037010000100001002004120041200412004120041
2020420040155000000200251968925201001001000010000100100001000050053522284813212002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000000007101161120037010000100001002004120041200412004120041
2020420040155000000200251968925201001001000010000100100001000050053518284813212002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000003007101161120037010000100001002004120041200412004120165
20204200401550000002002519689252010010010000100001001000010000500535182848132120021020040200401717531749820100200100001000020010000200002004020040111020110099100100100001000001000002300007101161120037010000100001002004120041200412004120041
20204200401550000002002519689252010010010000100001001000010000500535182848132120021020040200401717531749920100200100001000020010000200002004020040111020110099100100100001000001000004703007101161120037010000100001002004120041200412004120041
2020420040156000000200251968925201001001000010000100100001000050053518284813212002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000000007101161120037010000100001002004120041200412004120041
2020420040155000000200251968925201001001000010000100100001000050053518284956612002102004020040171753174982010020010000100002001000020000200402004011102011009910010010000100000100000000007101161120037010000100001002004120041200412004120041
2020420040155000000200251968925201001001000010000100100001000050053518284813212002102004020040171753174982010020010000100002001000020000200402004021102011009910010010000100000100000000007101161120037010000100001002004120041200412004120041

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0040

retire (01) | cycle (02) | 03 | 09 | 0b | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2002420040155000200251969025200101010000100001010000100005053585284814712002120040201021718431752020010201000010000201000020000200402004011100211091010100001010000000640316222003701000010000102004120041200412004120041
2002420040155000200251969025200101010000100001010000100005053604284814702002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040155000200251969025200101010000100001010000100005053597284814712002120043200401719731752020010201000010000201000020000200402004011100211091010100001010000100640216222003701000010000102004120041200412004120041
20024200401560012200251969025200101010000100001010000100005053601284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040162000200251969025200101010000100001010000100005053589284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040155010200251969025200101010000100001010000100005053585284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040156000200251969025200101010000100001010000100005053590284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
20024200401550024200251969025200101010000100001010000100005053593284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040155000200251969025200101010000100001010000100005053612284814702002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041
2002420040155000200251969025200101010000100001010000100005053585284814712002120040200401719731752020010201000010000201000020000200402004011100211091010100001010000000640216222003701000010000102004120041200412004120041

Test 3: Latency 1->2 roundtrip

Code:

  ins v0.h[2], w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | a9 | ac | c2 | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
302041200328990000885352012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162453010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410007100001000010100120033120033120033120033120041
3020412003289900008820012001810945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410000100001000010100120033120033120033120033120046
3020412003289900002850012001710945625401001010020000100001022000010000500573567213679178012001312003312003311552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410000100001000010100120036120040120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012160211957410000100001000010100120033120033120033120033120033
3020412003289900002250012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162311957410000100001000010100120033120033120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000010000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162423010020010000200002001000030000120032120032112020110099100101001000010001000000000131003162211957410000100001000010100120033120033120033120033120033
30204120032899000000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000000131012162211957410000100001000010100120033120033120033120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 18 | 19 | 1e | 1f | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd store (99) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | a9 | ac | bb | c2 | cf | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3002412003290000000601200171094562540010100102000010000102000010000505735672136700290120013120032120075115550311626230010201000020000201000030000120032120032112002110910100100100001001000000404900127021601211957610056100001000010010122220122732122597122221122640
30024122341919001332742362464120017109456254001010010200001000018244101173561583010413847798012261312347812329111688516411763234499201000020000201000030000120032120032212002110910113340100001001000332000127011601111957410000100001000010010120033120033120033120033120033
3002412003289900000156241200171094562540010100102000010000102000010000505735672136700290120013120059120057115548311626230010201000020000201000030184120032120032112002110910100100100001001000010000127011601111957410000100001000010010120033120033120069120084120033
3002412003289900000001200171094562540010100102000010000102000010000505735672136700290120013120097120385115551311626230010201000020000201000030000120032120032112002110910100100100001001000010000127011601211957410000100001000010010120033120033120033120033120033
3002412003289900000001200171094562540010100102000010000102000010000505735672136700290120013120127120033115548311626230010201000020000201000030000120032120032112002110910100100100001001000000000127011601111957410000100001000010010120033120033120033120033120033
3002412003289900000001200181094562540010100122000010002102000010000505735672136763400121433122396121597116336311626230010201000020123201000030551120205120200261200211091010010010000100100450411949500127021603612110310044100001000010010120034120034120033120033122310
30024122405914310000012001711037562240216100642010810060142315411294665802140138222110120013120032120112115552311626230345201141922067221129233109121949121812271200211091010010010000102100401411145000141513302311957410000100001000010010120033120034120033120033122399
3002412267495400010001200171094562540010100102000410000102011910000505735672136740000120013120041120092115548311626230010201000020000201000030000120032120032112002110910100100100001001000000300127011601111957410000100001000010010120033120033120033120033120033
3002412003289900000001200171094562540010100102000010000102000010000505735672136700291120099120032120085115658311626230010201000020000201000030000120032120032112002110910100100100001001000000000127011601111957410000100001000010010120033120033120033120033120033
3002412003289900000001200171094562540010100102000010000102000010000505735672136700290120013120032120069115551311626230010201000020000201000030000120032120032112002110910100100100001001000000000127011601111957410000100001000010010120033120033120033120033120033

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ins v0.h[2], w8
  movi v1.16b, 0
  ins v1.h[2], w8
  movi v2.16b, 0
  ins v2.h[2], w8
  movi v3.16b, 0
  ins v3.h[2], w8
  movi v4.16b, 0
  ins v4.h[2], w8
  movi v5.16b, 0
  ins v5.h[2], w8
  movi v6.16b, 0
  ins v6.h[2], w8
  movi v7.16b, 0
  ins v7.h[2], w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3759

retire (01) | cycle (02) | 03 | 08 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a5 | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2402043009723400300540261601421258000580013125800198001965041648098148703004830069300696141601632008001980019200800191600383006930069111602011009910010016000010008000200111101181163006625800001600001003007030070300703006930069
2402043006923300300530271601431258000580013125800198001965041648098148703004830068300696141601632008001980019200800191600383006830068111602011009910010016000010008000200111101180163006625800001600001003006930069300693007030069
2402043006923300300540271601431258000580013125800198001965041648098320603004830068300697141601632008001980019200800191600383006830068111602011009910010016000010008000200111101190163006525800001600001003006930069300693006930069
2402043006823300300530271601431258000580013125800198001965041648098148703004830219302067141601632008001980019200800191600383006830069111602011009910010016000010008000200111101190163006525800001600001003006930069300693006930069
2402043006923300300530271601431258000580013125800198001965041648098148703004830069300687141601632008001980019200800201600383006930069111602011009910010016000010008000200111101180163006525800001600001003007030069300693007030070
2402043006823300300530271601431258000580013125800198001965041648098148713004830068300686141601632008001980019200800191600383006830068111602011009910010016000010008000200111101190163006625800001600001003007030070300693007030070
2402043006923300300540271601431258000580013125800198001965041648098148703004830068300687141601632008001980019200800191600383006830068111602011009910010016000010008000200111101180163006525800001600001003006930069300693007030069
2402043006923300300540261601431258000580013125800198001965041648098148703004830068300687141601632008001980019200800191600383006930068111602011009910010016000010008000220111101190163006625800001600001003006930070300693007030069
2402043006923300300530271601431258000580013125800198001965041648098148703004830205300697141601632008001980019200800191600383006830068111602011009910010016000010008000200111101180163006625800001600001003006930070300703007030070
2402043006823300300540261601431258000680014125800198001965041721198148703004830069300687141601632008001980019200800191600383006930068111602011009910010016000010008000213111101190163006625800001600001003007030069300693007030069

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3756

retire (01) | cycle (02) | 03 | 07 | 0a | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a5 | a6 | a8 | a9 | ac | c2 | c9 | branch mispredict (cb) | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
240024300462250100000030030026160013138000080000138000080000684016069609011110300263004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002316612052422181330042330080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609011110300263004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002113511817211181830042315080000160000103004630046304243004630046
2400243004522500000000300300621600131380000800001380000800006840160696090111103002630045300450330316001320800008000020800001600003004530045111600211091010160000100800000000001002113511817211181730042315080000160000103004630046300463013330046
240024300452250000000030030026160013138000080000138000080000684016069609011110300263004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002113511817211151930042315080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609011110300263004530045032416001320800008000020800001600003004530168111600211091010160000100800000000001002113611617211171830042330080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609011110300883004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002113511717211121930042315080000160000103004630046300463004630046
24002430045225000000003003053126160013138000080000138000080000684016069609011110300263004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002316721717422181930042330080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609010110300263004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002316721717422161130042330080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609010110300263004430045032416001320800008000020800001600003004530045111600211091010160000100800000000001002316721617422181730042330080000160000103004630046300463004630046
240024300452250000000030030026160013138000080000138000080000684016069609010110300293004530045032416001320800008000020800001600003004530045111600211091010160000100800000000001002113611517211171630042315080000160000103004630046300463004630046