Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

INS (general, B)

Test 1: uops

Code:

  ins v0.b[2], w1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100010732161120371000100020412041204120412041
200420401602025169025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006373265147020602040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
2004204016020251690252000100010001000100063422651471202120402040145031773200010001000100020002040204011100110001000021731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025168625200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100040731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041

Test 2: Latency 1->1

Code:

  ins v0.b[2], w1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0040

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a5 | a6 | a7 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
202042004016100001800200251968903720100100100001000010010000100005165351828481320200210200402004017175317498201002001000010170200100002000020040200401110201100991001001000010020010000010000007101161120086010000100001002004120041200412004120041
20204200401610000000200251968902520100100100001000010210000100005005351828481320200210200402004017175317498201002001000010000200100062000020040200401110201100991001001000010000010000000000007100161120037010000100001002004120041200412004120041
20204200401610000000200251967102520100100100001000010010000100005005351828481320200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000000300007390161120037010000100001002004120041200412004120041
2020420090161000029700200251968902520100100100001000010010000100005225351828481320200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000000601007101161120037010000100001002004120041200412004120041
2020420040161000012000200251968902520100100100121000010010000101585005683128481320200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000080000017100161120037010000100001002004120041200412004120041
202042004016000000002002519689025201001001001210000100100001000050053518284813202002132004020090171752617498201002001000010166200100002000020090200401110201100991001001000010000010000000000007101161120037010000100001002004120041200412004120041
202042004016100003000200251968902520100100100001000010010000100005005351828481320200210200402004017175317498201002001000010000200101712000020040200401110201100991001001000010000010000000000007101161120037010000100001002004120041200412004120228
20204200401610000000200251968902520100100100001000010010000100005005351828481321200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000000000007101161120037010000100001002004120041200412004120041
20204200401560000000200251968902520100100100001000010010000100005005351828481321200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000000000007101161120037010000100001002004120041200412004120041
20204200401550000000200251968902520100100100001000010010000100005005351828481321200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000000000007101161120037010000100001002004120041200412004120041

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0040

retire (01) | cycle (02) | 03 | 07 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
200242004015000002002519690252001010100001000010100001000050535852848147020021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147020021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200771000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010410000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147020021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147020021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041

Test 3: Latency 1->2 roundtrip

Code:

  ins v0.b[2], w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 08 | 09 | 1e | 1f | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a7 | a8 | a9 | ac | c2 | c5 | c9 | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3020412003289900104701200171094562540100101042000010000100200001000050057356721367205301200130120032120032115532711623630100200100032000520010003300081200321200322120201100991001010010000100010000000001011131701162211958310000100001000010100120033120033120033120033120033
302041200328990079201200171094562540100101002000010000100200001000050057356721367205301200130120032120032115532611626730100200100032000520010003300081200321200322120201100991001010010000100010000000001011131801161111958310000100001000010100120033120033120033120033120033
30204120032930009601761200171094562540100101002000010000100200001000050057356721367205301200130120032120032115532611623630100200100032000520010003300081200321200321120201100991001010010000100010000000001011131801172211958310000100001000010100120033120033120033120033120033
302041200329310057901200171094562540100101002000010000100200001000050057356721367205301200130120032120032115532711623530100200100032000520010003300081200321200321120201100991001010010000100010000000001011131801161211958310000100001000010100120033120033120033120033120033
302041200329300090901200171094562540100101002000010000100200001000050057356721367205301200130120032120032115532711623930100200100032000520010003300081200321200321120201100991001010010000100010000000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900123001200171094562540100101002000010000100200001000050057356721367205301200130120032120032115525311624030100200100002000020010000300001200321200321120201100991001010010000100010000000000010131013163311957410000100001000010100120033120033120033120033120033
302041200328990072601200171094562540100101002000010000100200001000050057356721367205301200130120032120032115525311624530100200100002000020010000300001200321200321120201100991001010010000100010000000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900117901200171094572540100101002000010000100200001000050057356721367205301200130120032120032115525311624030100200100002000020010000300001200321200321120201100991001010010000100010000000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900001200171094562540100101002000010000100200001000050057356721367205301200130120032120032115525311624030100200100002000020010000300001200321200321120201100991001010010000100010000000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900001200171094562540100101002000010000100200001000050057381901367216901200130120032120032115525311624030100200100002000020010000300001200321200321120201100991001010010000100010000000000000131013163311957410000100001000010100120033120033120033120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 18 | 1e | 1f | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | ac | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3002412003289900012001710945725400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000010127001161111958610000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000129501161111960610000100001000010010120033120033120033120033120033
30024120032106400012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000127001161111960210000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200381155483116262300102010000200002010000300001200321201161120021109101001010000100100000000127002161111964610000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000127001161111962310000100001000010010120033120033120076120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000127001161111960610000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000001131502161111963910000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000127001161111985410000100001000010010120033120033120033120033120131
3002412003289900012001710945525400101001020000100001020000100505057359601367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000127001161111962410000100001000010010120033120033120033120033120033
3002412003289906012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200331200321120021109101001010000100100000000127001160111960810000100001000010010120034120033120033120033120033

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ins v0.b[2], w8
  movi v1.16b, 0
  ins v1.b[2], w8
  movi v2.16b, 0
  ins v2.b[2], w8
  movi v3.16b, 0
  ins v3.b[2], w8
  movi v4.16b, 0
  ins v4.b[2], w8
  movi v5.16b, 0
  ins v5.b[2], w8
  movi v6.16b, 0
  ins v6.b[2], w8
  movi v7.16b, 0
  ins v7.b[2], w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3759

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
240204300942420012030054027160143125800058001312580019800196504164809814871300480300683006900614160163200800198001920080019160038301323006811160201100991001001600001000800020011110119316113006525800001600001003006930069300693006930070
24020430069241000030053027160143125800058001312580019800196504164809814870300480300683006900714160163200800198001920080019160038300693015611160201100991001001600001000800020011110119216113006625800001600001003019730069300693006930069
24020430069242000030053026160143125800058001312580019800196504164809814871300480300683006900714160163200800198001920080019160038300683012811160201100991001001600001000800020311110119316113006625800001600001003006930070300693006930070
24020430069241000030053027160143125800058001312580019800196504164809814871300480300683006900714160163200800198001920080019160038300683006811160201100991001001600001000800020011210119116113006525800001600001003006930069300703007030070
24020430069233000030053027160143125800058001312580019800196504164809814871300490300693006900614160163200800198001920080019160038300993006821160201100991001001600001000800020311110119116113006625800001600001003006930069300693006930069
24020430068233000030053027160143125800058001312580019800196504164809814871300480300683006800714160163200800198001920080019160038300683006911160201100991001001600001000800020011110119116113016125800001600001003007030069300703007030069
24020430068241000030053027160143125800058001412580019801586504174929834371300480300683006900714160163200800248002320080024160046300813008111160201100991001001600001000800020011110119116113006525800001600001003006930070300703007030070
240204300692415015030053027160143125800058001312580019800196504164809814871300480300693006800614160163200800198001920080019160038300683006811160201100991001001600001000800020011110119116113006625800001600001003007030070300703007030069
24020430069241000030054026160144125800058001312580019800196504158859814870300480300683006800714160163200800198001920080019160038300683006811160201100991001001600001000800020011110119116113006625800001600001003006930069300693007030070
24020430068242000030054027160143125800058001312580019800196504164809814871300480300693006800714160163200800198001920080019160038300693006911160201100991001001600001000800020312210129116113006525800001600001003007030069300693007030069

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3756

retire (01) | cycle (02) | 03 | 07 | 08 | 18 | 19 | 1e | 1f | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a5 | a6 | a8 | a9 | ac | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
240024300632250000003003002616001212800008000012800008000062401157960991111030026300453004503241600122080000800002080000160000300453004511160021109101016000010008000000001002213219192112330042220080000160000103004630046300463004630046
24002430045225000000310562563741626091380519811701181238812226244467210215171110309353115531044238179606162523208122581354208042216246031170311651011600211091010160000100258116610640301002213517222114330042220080000160000103004630046300463004630787
24002430510225015481035230030160315161573108039080650128082780410684515061022183111030485304133057435812221616001220800008081220800001600003004530666311600211091010160000100980000020010278135191462124531501120080000160000103191031541320283138431047
240024300452330100003003002616001212800008000012800008000062401157960991011030026300453004503241600122080000800002080000160000300453007211160021109101016000010008000010001002416629192113330042220080000160000103004630046300463004630046
2400243004522500000030030026160012128000080000128000080000624077609697950110300263004530136032416001220800008000020800001600003004530058111600211091010160000100080000000010024135110222122330042220080000160000103004630046300463004630046
240024300452250000003003002616001212800008000012800008000062407760969795011030026300453004503241600122080000800002080000160000300453005411160021109101016000010008000000001002413628224223230042240080000160000103004630046300463004630046
240024300452250000003003002616001212800008000012800008000062407760969795011030026300453004503241600122080000800002080000160000300453004511160021109101016000010008000000001002213516194223330042240080000160000103004630046300463004630046
240024300452250000003003002616001212800008000012800008000062407760969795011030026300453004503241600122080000800002080000160000300453004511160021109101016000010008000000001002213517194222330042240080000160000103004630046300463004630046
240024300452250000003003002616001212800008000012800008000062407760960991111030026300453004503241600122080000800002080000160000300473004511160021109101016000010008000000001002213516192113230042220080000160000103004630046300463004630046
240024300452330000003003002616001212800008000012800008000062401157960991111030026300453004503241600122080000800002080000160000300453009111160021109101016000010008000000001002213517222112330042220080000160000103004630046300463004630132