Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

INS (general, D)

Test 1: uops

Code:

  ins v0.d[1], x1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 4f | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
2004204016002025169002520001000100010001000634626514720212040204014503177320001000100010002000204020401110011000100000732161120371000100020412041204120412041
20042040160020251690054200010001000100010006342265147202120402040145031773200010001000100020002040204011100110001000100731161120371000100020412041204120412041
2004204016002025169002520001000100010001000654926514720212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
2004204016002025169002520001000100010001000634226514720212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
20042040161202025169002520001000100010001000634726514720212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
2004204015002025169002520001000100010001000634226514720212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
2004204016002025168802520001000100010001000634226514720212040204014503177320001000100010002000204020401110011000100090731161120371000100020412041204120412041
2004204016002025169002520001000100010001000634226514720212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
2004204016002025169002520001000100010001000634226514720212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
20042040160020251686025200010001000100010006342265147202120402040145031773200010001000100020002040204011100110001000220731161120371000100020412041204120412041

Test 2: Latency 1->1

Code:

  ins v0.d[1], x1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0040

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 50 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
20204200401550200251968902520100100100001000010010000100005005351828481321020021200402004017182617494201002001000410004200100042000820040200401110201100991001001000010010000001117160016002003710000100001002004120041200412004120041
20204200401560200251968902520100100100001000010010000100005005351828481320020021200402004017182617494201002001000410004200100042000820040200401110201100991001001000010010000101117160016002003710000100001002004120041200412004120041
20204200401550200251968902520100100100001000010010000100005005351828481320020021200402004017182617494201002001000410004200100042000820040200401110201100991001001000010010000001117160016002003710000100001002004120041200412004120041
20204200401550200251968902520100100100001000010010000100005005351828481320020021200402004017182617494201002001000410004200100042000820040200401110201100991001001000010010000001117163016002003710000100001002004120041200412004120041
20204200401550200251968902520100100100001000010010000100005005351828481321020060200402004017182317498201002001000010000200100002000020040200401110201100991001001000010010000000007103116112003710000100001002004120041200412004120041
20204200401610200251968902520100100100001000010010000100005005351828481320020021200402004017175317498201002001000010000200100002000020040200401110201100991001001000010010000000007100116112003710000100001002004120041200412004120041
20204200401550200251968902520100100100001000010010000100005005351828481320020021200402004017175317498201002001000010000200100002000020040200401110201100991001001000010010000000007100116112003710000100001002004120041200412004120041
20204200401550200251968902520100100100001000010010000100005005351828481320020021200402004017175317498201002001000010000200100002000020040200401110201100991001001000010010000000007100116112003710000100001002004120041200412004120041
20204200401550200251968902520100100100001000010010000100005005351828481320020021200402004017175317498201002001000010000200100002000020040200401110201100991001001000010010000000007100116112003710000100001002004120041200412004120041
20204200401560201491968902520100100100001000010010000100005005351828481321020021200402004017175317498201002001000010000200100002000020040200401110201100991001001000010010000000007100116112003710000100001002004120041200412004120041

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0040

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a5 | a6 | a7 | a8 | a9 | ac | c2 | cf | d5 | d6 | dc | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2002420040155000000002002501969002520010101000010000101000010000505358528481470200210200402004017197317520200102010000100002010000200002004020040111002110910101000010000100000000064021602220037001000010000102004120041200412004120041
2002420040155000000002002501969002520010101000010000101000010000505358528481470200210200402004017197317520200102010000100002010000200002004020040111002110910101000010000100000000064021602220037001000010000102004120041200412004120041
2002420040156000000002002501969002520010101000010000101000010000505358528481470200210200402004017197317520200102010000100002010000200002004020040111002110910101000010000100000000064021602220037001000010000102004120041200412004120041
2002420040155000000002002501969002520010101000010000101000010000505358528481471200210200402004017197317520200102010000100002010000200002004020040111002110910101000010020100000000064021602220037001000010000102004120041200412004120041
2002420040155000000002002501969002520010101000010000101000010000505358528481470200210200402004017197317520200102010170100002010000200002004020040111002110910101000010000100000000064021602220037001000010000102004120041200412004120041
2002420040155000000002002501969002520010101000010000101000010000505358528481470200210200402004017197317520200102010000100002010000200002004020040111002110910101000010000100000000064021602220037001000010000102004120041200412004120041
2002420040156000000002002501969002520010101000010000101000010000505358528481470200210200402004017197317520200102010000100002010000200002004020040111002110910101000010000100000000064021602220037001000010000102004120041200412004120041
2002420040156000000002002501969002520010101000010000101000010000505358528481470200210200402004017197317520200102010000100002010000200002004020040111002110910101000010000100000000064021602220037001000010000102004120041200412004120041
2002420040155000000002002501969002520010101000010000101000010000505358528481470200210200402004017197317520200102010000100002010000200002004020040111002110910101000010000100000000064021602220037001000010000102004120041200412004120041
2002420040156000000002002501969002520010101000010000101000010000505358528481471200210200402004017197317520200102010000100002010000200002004020040111002110910101000010000100000000064021602220037001000010000102004120041200412004120041

Test 3: Latency 1->2 roundtrip

Code:

  ins v0.d[1], x0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 0b | 18 | 19 | 1e | 1f | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a7 | a8 | a9 | ac | c2 | c5 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
302041200328990000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010010000000900131012162211957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020004100001002000010000500573586413672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010010000000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010010000000000131012162311957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010010000000000131012162211964110000100001000010100120033120033120033120033120033
302041200328990000012007110945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030191120036120032112020110099100101001000010010000000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403010020010067200002001000030000120032120037112020110099100101001000010010000000000131012162211957410000100001000010100120033120033120033120036120033
302051200698990000012003210945625401001010020000100001002000010000500573567213672053012001312003212003211552531162403027220010000200002001000030000120032120032112020110099100101001000010010000000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010010000000000131012163211957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213672053012001312003212003211552831162483010020010000200002001000030000120032120032112020110099100101001000010010000000000131012162211957410000100001000010100120033120033120033120033120125
302041200329000000012001710945925401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010010000000000131012162211957410000100001000010100120033120034120033120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3002412003289900000012001710945625400101001020000100001020000100005057356721367002911200130120032120032115548311626630010201000020000201000030000120032120032112002110910100101000010100000000012702161111964610000100001000010010120033120033120033120033120033
3002412003289900000012001710945625400101001020000100001020000100005057356721367002911200130120032120032115548311628530010201000020000201000030000120032120032112002110910100101000010100001000012701161111957410000100001000010010120033120033120033120033120033
30024120032899000000120017109456254001010010200001000010200001000050573567213670029112001301200321200321155483116265300102010066200002010000300001200321200321120021109101001010000101000056000012701161111957410000100001000010010120033120033120033120033120033
3002412003289900000012001710946525400101001020000100001020000100005057356721367002911200130120032120032115548311626230010201000020000201000030000120032120032112002110910100101000010100000000012701161111957410000100001000010010120033120033120033120033120033
3002412003989900006690120017109458254001010010200001000010200001000050573567213671051112001301200321200321155483116262300102010000200002010000300001200321200321120021109101001010000101000013000012701161111957410000100001000010010120033120033120033120033120033
3002412003289900000012001710945625400101001020000100001020000100005057356721367002911200130120032120032115602311626230182201000020000201000030000120032120032112002110910100101000010100000001012702161211957410000100001000010010120033120033120033120033120033
30024120032899000055501200171094542540010100102000010000102000010000505735672136700291120013012003212003211554831162623001020100002000020100003000012003212003211200211091010010100001010000014400012703165111957410000100001000010010120033120033120033120033120033
3002412003289900000012001710945625400101001020000100001020000100005057356721367002911200130120032120032115548311626230010201000020000201000030000120032120032112002110910100101000010100000000012701161111957410000100001000010010120033120033120033120033120033
300241200328990000001200171094562540010100102000010000102000010000505735672136700291120013012003212003211554831162623001020100002000020100003000012003212003211200211091010010100001010000022800012701161111957410000100001000010010120033120033120033120033120033
3002412009693300000012001710945625400101001020000100001020000100005057356721367002911200130120032120032115548311626230010201000020000201000030000120032120032112002110910100101000010100000000012701161111957410000100001000010010120033120071120033120033120034

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ins v0.d[1], x8
  movi v1.16b, 0
  ins v1.d[1], x8
  movi v2.16b, 0
  ins v2.d[1], x8
  movi v3.16b, 0
  ins v3.d[1], x8
  movi v4.16b, 0
  ins v4.d[1], x8
  movi v5.16b, 0
  ins v5.d[1], x8
  movi v6.16b, 0
  ins v6.d[1], x8
  movi v7.16b, 0
  ins v7.d[1], x8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3759

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a5 | a6 | a8 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | icache miss (d3) | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
240204300972340000000030053026160143125800068001312580019800196504164809814870300480300693006907141601632008001980019200800191600383006830068111602011009910010016000010000080002000111101190016003006625800001600001003007030070300703006930069
240204300682410000000030054026160143125800058001312580019800196504164809814870300480300683006907141601632008001980019200800191600383006830069111602011009910010016000010000080002000111101180216003006525800001600001003007030070300703019730070
240204300692331000000030054027160143125800058001312580019800196504164809814870300480300693006807141601632008001980019200800191600383006930068111602011009910010016000010000080002000111101190016003006625800001600001003007030069300693007030069
240204300692330000000030053026160143125800058001312580019800196504164809814870300480300693006807141601632008001980019200800191600383006830069111602011009910010016000010000080002000111101190016003006625800001600001003006930069300693006930070
240204300692330000000030053027160143125800058001312580019800196504164809814870300480300683006906141601632008001980019200800191600383006830069111602011009910010016000010000080002000111101190016003006525800001600001003007030069300693007030070
240204300682330000000030053027160143125800058001312580019800196504164809814870300480300683006906141601632008001980019200800191600383006930068111602011009910010016000010000080002000111101180016003006625800001600001003006930069300703007030069
240204300692410000000030054026160143125800058001312580019800196504164809814871300480300683006806141601632008001980019200800191600383006830068111602011009910010016000010000080002000111101180116003006625800001600001003007030070300703007030070
240204300682330000000030054027160143125800058001312580019800196504164809814871300480300693006907141601632008001980019200800191600383006830069111602011009910010016000010000080002000111101180016003006525800001600001003007030070300703007030069
240204300682330000000030054027160143125800058001312580019800196504164809814871300480300693006806141601632008001980019200800191600383006830069111602011009910010016000010000080002000111101191016003006625800001600001003006930069300693006930070
240204300692330000060030053026160143125800058001312580019800196504164809814871300480300693006906141601632008001980019200800191600383006830069111602011009910010016000010000080002000111101180016003006525800001600001003006930069300703007030070

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3757

retire (01) | cycle (02) | 03 | 04 | 08 | 19 | 1e | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a5 | a6 | a8 | ac | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2400243007022500000300400261600101080000800001080000800005040363896415501103003330052300520324160010208000080000208000016000030087300551116002110910101600001008000000100221331059223222929300534180000160000103005630056300563005630056
2400243005522500000300400261600101080000800001080000800005040421996574401103003630055300550324160010208000080000208000016000030055300551116002110910101600001088000000100221361053223223028300524180000160000103005630056300563005630056
2400243005522500001300400261600101080000800001080000800005040421996574401103003630055300550324160010208000080000208000016000030056300551116002110910101600001008000000100241692046223221327300524180000160000103005630056300563005630056
2400243005522500001300400261600101080000800001080000800005040421996574401103003630055300550324160010208000080000208000016000030055300551116002110910101600001008000000100241692050223222928300524280000160000103005630056300563005630056
2400243005523200001300420261600101080113800001080000800005040421996574401103003630055300550324160010208000080000208000016000030055300551116002110910101600001008000000100241692051223222626300524180000160000103005330053300533005330053
2400243005222500000300400261600101080000800001080000800005040421996574401103003630055300550324160010208000080000208000016000030055300551116002110910101600001008000000100261692046223222522300524180000160000103005630056300563005630056
2400243005522500000300400261600101080000800001080000800005040421996574401103003630055300550324160010208000080000208000016000030055300551116002110910101600001008000000100241692048223222616300524180000160000103005630056300563005630053
2400243005222500000300400261600101080000800001080000800005040421996574401103003630055300550324160010208000080000208000016000030055300551116002110910101600001008000000100261692043223222728300524180000160000103005630056300563005630056
2400243005522500001300400261600101080000800001080000800005040421996574401103003630055300550324160010208000080000208000016000030055300551116002110910101600001008000000100261692052223222525300524180000160000103005630056300563005630056
24002430055225000013004002616001110800138000010800008000050406014970261111030033300523005203241600102080000800002080000160000300553006311160021109101016000010080000001002616102043223222615300524180000160000103005830056300563005630056