Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

INS (general, S)

Test 1: uops

Code:

  ins v0.s[2], w1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 4f | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a0 | a1 | a5 | a6 | a7 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
200420401500000020251690025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000000100000000732161120371000100020412041204120412041
20042040160000120202516900252000100010001000100063422651471202120402040145031773200010001000100020002040204011100110000001000000390731161120371000100020412041204120412041
200420401600000020251690025200010001000100010006346265147120212040204014503177320001000100010002000204020401110011000000100000000731161120371000100020412041204120412041
2004204016000012020251690025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000000100000000731161120371000100020412041204120412041
200420401600000020251690025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000000100000000731161120371000100020412041204120412041
200420401600000020251690025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000000100000000731161120371000100020412041204120412041
2004204016000000202516900252000100010001000100063422651470202120402040145031773200010001000100020002040204011100110000001000000210731161120371000100020412041204220412041
2004204015000000202516900252000100010001000100063462651470202120402040145031773200010001000100020002040204011100110000001000000360731161120371000100020412041204120412041
200420401600000020251690025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000000100000000731161120371000100020412041204120412041
200420401700000020251690025200010001000100010006377265147020212040204014503177320001000100010002000204020401110011000000100000090731161120371000100020412041204120412041

Test 2: Latency 1->1

Code:

  ins v0.s[2], w1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0040

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 1e | 1f | 3f | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
20204200401550000020025196800552010010010000100001001000010000500535342848132200212004020040171753174982010020010000100002001000020000200402004011102011009910010010000100100000217102161120037010000100001002004120041200412004120041
2020420040155000002002519689025201001001000010000100100001000050053534284813220021200402004017175317498201002001000010000200100002000020040200401110201100991001001000010010000007101161120037010000100001002004120041200412004120041
20204200401550000020025196890252010010010000100001001000010000500535182848132200212004020040171753174982010020010000100002001000020000200402004011102011009910010010000100100000217100161120037010000100001002004120041200412004120041
20204200401550000020025196890252010010010000100001001000010000500535792848132200212004020040171753174982010020010000100002001000020000200402004011102011009910010010000100100000187101161120037010000100001002004120041200412004120041
2020420040155000002002519689025201001001000010000100100001000050053518284813220021200402004017175317498201002001000010000200100002000020040200401110201100991001001000010010000007101161120037010000100001002004120041200412004120041
20204200401551000020025196890252010010010000100001001000010000500535182848132200212004020040171753174982010020010000100002001000020000200402004011102011009910010010000100100000187101161120037010000100001002004120041200412004120041
20204200401550000020025196890252010010010000100001001000010000500535182848132200212004020040171753174982010020010000100002001000020000200402004011102011009910010010000100100000127101161120037010000100001002004120041200412004120041
20204200401550000020025196890252010010010000100001001000010000500535222848132200212004020040171753174982010020010000100002001000020000200402004011102011009910010010000100100000247101161120037010000100001002004120041200412004120041
20204200401550000020025196890252010010010000100001001000010000500535792848132200212004020040171753174982010020010000100002001000020000200402004011102011009910010010000100100000127101161120037010000100001002004120041200412004120041
20204200401550000020025196890252010010010000100001001000010000500535222848132200212004020040171753174982010020010000100002001000020000200402004011102011009910010010000100100000187101161120037010000100001002004120041200412004120041

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0040

retire (01) | cycle (02) | 03 | 08 | 18 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2002420040155000200251969025200101010000100001010000100005053754284814700200212004020040171973175202001020100001000020100002000020040200401110021109101010000101000001200640316222003701000010000102004120041200412004120041
200242004015600020025196902520010101000010000101000010000505359628481470020021200402004017198317520200102010000100002010000200002004020040111002110910101000010100000300640216222003701000010000102004120041200412004120041
2002420040155000200251969025200101010000100001010000100005053598284814700200212004020040171973175202001020100001000020100002000020040200401110021109101010000101000001500640216222003701000010000102004120041200412004120041
200242004015500020025196902520010101000010000101000010000505358528481470020021200402004017197317520200102010000100002010000200002004020040111002110910101000010100000010640216222003701000010000102004120041200412004120041
20024200401550012200251968284200101010000100001010000100005053586284814700200212004020040171973175202001020100001000020100002000020040200401110021109101010000101000001200640216222003701000010000102004120041200412004120041
200242004015500020025196902520010101000010000101015010000505359428481470020021200402004017197317520200102010000100002010000200002004020040111002110910101000010100000900640217222003701000010000102004120041200412004120041
2002420040155000200251969025200101010000100001010000100005053594284814700200212004020040171973175202001020100001000020100002000020040200401110021109101010000101000006300640216222003701000010000102004120041200412004120041
2002420040155000200251969025200101010000100001010000100005053835284814700200212004020040171973175202001020100001000020100002000020040200401110021109101010000101000001200640216222003701000010000102004120041200412004120041
200242004015500020025196902520010101000010000101000010000505358528481470020021200402004017197317520200102010000100002010000200002004020040111002110910101000010100002900640216222003701000010000102004120041200412004120041
2002420040155202120025196902520010101000010000101000010000505358528481470020021200402004017197317520200102010000100002010000200002004020040111002110910101000010100000000640216222003701000010000102004120041200412004120041

Test 3: Latency 1->2 roundtrip

Code:

  ins v0.s[2], w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 08 | 0f | 19 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | ac | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
30204120032899000012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000012001710945625401001010020000100001002000010000500573567213672053112002612003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000012001710945625401001010020000100021002000010000500573749613672053112001412003212003811553031162403010020010000200002001000030000120032120032112020110099100101001000010001000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000012002310945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000131012162211957410000100001000010100120033120033120033120071120089
30204120032899000012001710945625401091010020000100001002000010000500573586413672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000131012162211957410000100001000010100120033120033120033120033120033
30204120032899000012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000000131012162211957410000100001000010100120033120033120033120033120118
30204120111899000012001710945625401001010020000100001002000010000500573567213672053112001312003212003211552531162403010020010000200002001000030000120032120032112020110099100101001000010001000013131012162211957410000100001000010100120033120033120033120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 0b | 1e | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | a9 | ac | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
300241200379310001200171094562540010100102000010000102000010000505735672136700291120013012003212003211554831162623001020100002000020100003000012003212003211200211091010010100001001000000000012703163411957410000100001000010010120033120033120033120033120033
300241200328990001200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100003000012003212003211200211091010010100001001000000000012703163311957410000100001000010010120033120033120033120033120033
300241200328990001200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100003000012003212003211200211091010010100001001000000000012703163311957410000100001000010010120033120033120033120033120033
300241200328990001200171094562540010100102000010000102000010000505735672136700291120013012003212003311554831162623001020100002000020100003000012003212003211200211091010010100001001000010000012704163311957410000100001000010010120033120033120033120033120033
300241200328990001200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100003019612003212003211200211091010010100001001000000000012703163311957410000100001000010010120066120033120074120033120037
3002512006610910001200171094564040010100102000010000102000010000505735672136700291120013012003212003211554831162623001020100002000020100003000012003212003211200211091010010100001001000000000012704163311957410000100001000010010120033120033120033120033120033
300241200328990601200171094562540010100102000010000102000010000505735672136700291120013312003212003211554831162653001020100002000020100003000012004712003211200211091010010100001001000000000012703163311957410000100001000010010120033120033120033120033120033
300241200328990001200171094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100003000012003212003211200211091010010100001001000000000012703164311957410000100001000010010120033120033120033120033120033
300241200328991021200571094562540010100102000010000102000010000505735672136700290120013012003212003211554831162623001020100002000020100003000012003212003211200211091010010100001001000000000012703163311957410000100001000010010120033120033120033120033120033
300241200328990001200171094562540010100102000010000102000010000505735672136700291120013012003212003211554831162623001020100002000020100003000012003212003211200211091010010100001001000000000012703163311957410000100001000010010120033120033120033120033120033

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ins v0.s[2], w8
  movi v1.16b, 0
  ins v1.s[2], w8
  movi v2.16b, 0
  ins v2.s[2], w8
  movi v3.16b, 0
  ins v3.s[2], w8
  movi v4.16b, 0
  ins v4.s[2], w8
  movi v5.16b, 0
  ins v5.s[2], w8
  movi v6.16b, 0
  ins v6.s[2], w8
  movi v7.16b, 0
  ins v7.s[2], w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3759

retire (01) | cycle (02) | 03 | 19 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a5 | a6 | a8 | ac | dtlb miss (c1) | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2402043009524212793006622281601531268001080017126803058002465642318698842200302513008130212913160173200800248002320080024160046300813008111160201100991001001600001000080007300222101281230113007926800001600001003008330082300833008230083
240204300822330030067728160153126800108001712680023800246564162699812150030061300813008110131601732008002480023200800241600463008230081111602011009910010016000010000800070210222101291230113007826800001600001003008230083300823008330082
24020430081233003006732281601531268001080017126800238002465641626998121501300613008130081913160173200800248002320080024160046300823008111160201100991001001600001000080007000222101551230113007926800001600001003008230082300823008230082
2402043008123300300661271601531268001080017126800238002465641626998121500300613008130081913160173200800248002320080024160046300813008111160201100991001001600001000080007200222101291230113007926800001600001003033330083300833008230453
240204300822330123006602716015312680010800171268002380024656417037981698003006130081300811013160173200800248002320080024160046300813008111160201100991001001600001000080007000222101281230113007926800001600001003008430082300823010530083
24020430081233003006702616015312680010800171268002380024656416269981215013006130081300811013160173200800248002320080024160046300813008111160201100991001001600001000080002000111101190160003006525800001600001003007030070300693007030070
2402043006923300300540261601431258000580013125800198001965041648098148700300483006930068614160163200800198001920080019160038300693006811160201100991001001600001000080002000111101180160003006625800001600001003006930069300893006930070
24020430069233003005497261601421258012980012125800198001965041648098148700300483006930069614160163200800198001920080019160038300693006811160201100991001001600001000080002100111101180160003006625800001600001003006930069300803007030069
2402043006923300300540271601431258000580013125800198001965041648098148700300483006930069714160163200800198001920080019160038300683006811160201100991001001600001000080002100111101192160003006625800001600001003007030070300883006930070
2402043006823300300540261601431258000580013125800198001965041648098148700300483006830069714160163200800198001920080019160038300693006811160201100991001001600001000080002000111101180160003006525800001600001003007030103300703006930069

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3756

retire (01) | cycle (02) | 03 | 04 | 1e | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | ac | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
240024300462250003003002616001313800008000013800008000068402642963137013002630045300450324160013208000080000208000016000030045300451116002110910101600001008000000100246521251732227233004233180000160000103004630046302103004630046
240024300452251013003002616105213800008000013800008000068405696963201013002630045300450324160013208000080000208000016000030045300451116002110910101600001008000080100266521221732229253004233180000160000103004630046301673004630046
2400243004522510130030026160013138000080000138000080000684016069609010130026300453004503241600132080000800002080000160000300453004511160021109101016000010080000353100256520231732228213004233180000160000103004530046301653004630046
240024300452251003003002616001313800008000013800008000068401606960901013002630045300450349160013208000080000208000016000030045300451116002110910101600001008000020100266521301732231293004233180000160000103004630046301533004830046
2400243004522514803003002616001212800008000012800008000062402075961779113002630045300450324160012208000080000208000016000030045300451116002110910101600001008000000100276521301732222303004524180000160000103004930183300493005030046
2400243004523800030030922616001212800008000012800008000062402075961779013002930048300480324160012208000080000208000016000030048300481116002110910101600001008000050100276521301732224263004524180000160000103004930164300493004930049
240024300482251013003306116027213800008000012800008000062405220961779013002930048300480324160012208000080000208000016000030048300481116002110910101600001008000023100276521191732229243004524180000160000103004930151300473004930049
240024300482251003003302616001212800008000012800008000062402075961779013002930048300480324160012208000080000208000016000030048300481116002110910101600001008000010100266220231732228243004524180000160000103004930142300493004930049
240024300482251013003302616001212800008000012800008000062402075961779013002930048300480324160012208000080000208000016000030048300481116002110910101600001008000010100266220221732229243004524180000160000103004930114300463004930049
240024300482241013003302616001212800008000012800008000062402075961779013002930048300480324160012208000080000208000016000030048300481116002110910101600001008000010100266220291732230303004524180000160000103004930116300493005230184