Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

SQDMULL2 (vector, 4S)

Test 1: uops

Code:

  sqdmull2 v0.4s, v0.8h, v1.8h
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)03081e3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)a8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
1004303723006125482510001000100039831303018303730372415328951000100020003037303711100110000073216112630100030383038303830383038
1004303723006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037220010325482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230014925482510001000100039967003018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723006125482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037231020625482510001000100039831303018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723006125482510001000100039831303018303730372415328951000100020003037303711100110000073116122630100030383038303830383038

Test 2: Latency 1->2

Code:

  sqdmull2 v0.4s, v0.8h, v1.8h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)03080b1e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a7a8a9acc2cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1020430037225000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002002000030037300371110201100991001001000010000004507101161129634100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101171129634100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
102043003722500010861295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038
1020430037225000082295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000000307101161129634100001003003830038300383003830038
10204300372250000536295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000000007101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)0318191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024300372250087061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
10024300372250069061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
1002430037225000061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000010306402162229630010000103003830038300383003830038
10024300372250015061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000306402162229668010000103003830038300383008430038
10024300372260018061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000306402162229630010000103003830038300383003830038
10024300372240015061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
100243003722400270103295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
10024300372250042082295482510010101000010100005042773130300543008530037282873287671001020100002020000300373003711100211091010100001000000006613242229884010000103003830038300383003830038
10024300372250090653429548841001010100081010000504280027030270300373003728287328767100102010000202000030037300371110021109101010000100001601392806404892229630410000103049730085300383003830038
100243003722500420052032946719910097151007218113415642773130300183022630225283063289171120620100002020000300373003711100211091010100001000000068333882429955210000103027430228301803003830513

Test 3: Latency 1->3

Code:

  sqdmull2 v0.4s, v1.8h, v0.8h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)030e1e1f3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002002000030037300371110201100991001001000010000000007102162229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002002000030037300371110201100991001001000010000000007102162229634100001003003830038300383003830038
10204300372250000612953925101001001000010010000500427731303005430037300372826532874510100200100002002000030037300371110201100991001001000010000000007102162229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002002000030037300371110201100991001001000010000000047102162229634100001003003830038300383003830038
102043003722500005362954825101001001000010010000500427731303001830037300372826532874510100200100002002000030037300371110201100991001001000010000000007102162229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002002000030037300371110201100991001001000010000000007102162229634100001003003830038300383003830038
10204300372240000612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000000007102162229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002002000030037300371110201100991001001000010000000007102162229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002002000030037300371110201100991001001000010000000027102162229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002002000030037300371110201100991001001000010000000007102162229634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)0308090b18191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8acc2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
100243003722500000008229548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000046402162229630010000103003830038300383003830038
1002430037225000002706129548251001010100001010000504277313030018300373003728287328767100102010000202034430037300371110021109101010000100000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  sqdmull2 v0.4s, v8.8h, v9.8h
  sqdmull2 v1.4s, v8.8h, v9.8h
  sqdmull2 v2.4s, v8.8h, v9.8h
  sqdmull2 v3.4s, v8.8h, v9.8h
  sqdmull2 v4.4s, v8.8h, v9.8h
  sqdmull2 v5.4s, v8.8h, v9.8h
  sqdmull2 v6.4s, v8.8h, v9.8h
  sqdmull2 v7.4s, v8.8h, v9.8h
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2505

retire (01)cycle (02)033f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)6061696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9facc2branch mispredict (cb)cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
802042004915641258010010080000100800005006400001520020200392003999733999780100200800002001600002003920039118020110099100100800001001560051102161120036800001002004020040200402004020040
8020420101193412580100100800001008000050064000000200202003920039997339997801532008000020016000020039200391180201100991001008000010000051101161120036800001002004020040200402004020040
80204200391504125801001008000010080000500640000002002020039200399973399978010020080000200160000200392003911802011009910010080000100150051101161120036800001002004020040200402004020040
8020420039150412580100100800001008000050064000010200202003920039997339997801002008000020016000020039200391180201100991001008000010000051101161120036800001002004020040200402004020040
8020420039150412580100100800001008000050064000000200202003920039997339997801002008000020016000020039200391180201100991001008000010000051101161120036800001002004020040200402004020040
8020420039150412580100100800001008000050064000010200202003920039997339997801002008000020016000020039200391180201100991001008000010004051101161120036800001002004020040200402009120040
8020420039150412580100100800001008000050064000010200202003920039997339997801002008000020016000020039200391180201100991001008000010000051101161120036800001002004020040200402004020040
802042003915041258010010080000100800005006400001020020200392003999733999780100200800002001600002003920039118020110099100100800001001320051101161120036800001002004020040200402004020040
8020420039150412580100100800001008000050064000000200202003920039997339997801002008000020016000020039200391180201100991001008000010000151101161120036800001002004020040200402004020040
8020420039150412580100100800001008000050064000000200202003920039997339997801002008000020016000020039200391180201100991001008000010000051101161120036800001002004020040200402004020040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2505

retire (01)cycle (02)03181e3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)6061696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9faccfd0d2d5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
800242004815106402580010108000010800005064000000200202003920039999631001980010208000020160000200392003911800211091010800001005020403161120036080000102004020040200402004020040
8002420039150018402580010108000010800005064000004200202003920039999631001980010208000020160000200392003911800211091010800001005020001161120036080000102004020040200402004020040
8002420039150039402580010108000010800005064000000200202003920039999631001980010208000020160000200392003911800211091010800001005091401161120036180000102004020040200402004020040
80024200391501138406880209108000010800006064000000200292003920039999631001980010208000020160000200392003911800211091010800001005020401161120036080000102004020040200402004020040
8002420039150018402580010108000010800005064000000200202003920039999631001980010208000020160000200392003911800211091010800001005020401161120036080000102009120040200402004020040
800242003915000612580010108000010800005064000004200202003920039999631001980010208000020160000200392003911800211091010800001005020001161120036080000102004020040200402004020040
800242003915009402580010108000010800005064000000200202003920039999631001980010208000020160000200392003911800211091010800001005020001161120036080000102004020040200402004020040
80024200391500245572580010108000010800005064000010200202003920039999631001980010208000020160000200392003911800211091010800001005020001161120036080000102004020040200402004020040
8002420039150027402580010108000010800005064000004200202003920039999631001980010208000020160000200392003911800211091010800001005020001161120036080000102004020040200402004020040
8002420039150018402580010108000010800005064000004200202003920039999631001980010208000020160000200392003911800211091010800001005020001161120036080000102004020040200402004020040