Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FDIV (scalar, D)

Test 1: uops

Code:

  fdiv d0, d0, d1
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)03181e3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)a0a1a6a8acc2c3cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
10041003775006186492510001000100035198011001810037100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038
10041003775006186492510001000100035198001001810037100379396398951000100020001003710037111001100000050007321622957910001003810038100381003810038
10041003775006186492510001000100035198011001810037100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038
100410037750036786492510001000100035198011001810037100379396398951000100020001003710037111001100000200007321622957910001003810038100381003810038
10041003776006186492510001000100035198011001810037100379396398951000100020001008410037111001100000000007321622957910001003810038100381003810038
10041003775006186492510001000100035198001001810037100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038
10041003775006186492510001000100035198011001810037100379396398951000100020001003710037111001100000020007321622957910001003810038100381003810038
10041003775006186492510001000100035198011001810037100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038
10041003775006186492510001000100035198011001810037100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038
10041003775006186492510001000100035198001001810084100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038

Test 2: Latency 1->2

Code:

  fdiv d0, d0, d1
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0037

retire (01)cycle (02)03081e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa6a8a9accfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10204100037749000148896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
1020410003774900061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
1020410003774900061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
1020410003774901800726896492510100100100001001000050035829801100018100037100037982463987451010020010000202200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
102041000377500267061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000372110201100991001001000010000007101161199579010000100100038100038100038100038100038
102041000377490267061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100276
1020410003774902640726896492510100100100001001000050035829800100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
102041000377500438061896492510100100100001001000050035829800100018100037100037982463987451010020010000200200001000841000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
102041000377490306061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
102041000377490315061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0037

retire (01)cycle (02)030408091e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accdcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
100241000377501000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
10024100037750000064148964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001020064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
100241000377500000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
1002410003774900006189649251001010100001010000503582980100018100037100037982682598767100102010000202000010003710003711100211091010100001030064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000164021622995791000010100038100038100038100038100038

Test 3: Latency 1->3

Code:

  fdiv d0, d1, d0
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0037

retire (01)cycle (02)030b1e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a6a7a8accfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
102041000377490006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379824639884110100200100002002000010007610003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000037101161199579010000100100038100038100038100038100038
1020410003775000025189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102021009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
1020410003774900072689649251010010010000100100005003582980010001801000371000379824679874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379833039874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377500006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0037

retire (01)cycle (02)03090a0b18191e1f3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa6a8a9accdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
100241000377490000000082896492510010101000010100005035829801100018010003710003798268398803100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035833010100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100251000377490000000082896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
1002410003774900000000272896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000662216229957901000010100038100038100038100038100038
10024100037749000000001114896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035829800100018010003710008398268398767100102010000202000010003710003711100211091010100001000030640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035829801100018010003710003798268398767100102010000202000010003710003711100211091010100001001260640216229957901000010100038100038100038100038100038

Test 4: throughput

Count: 8

Code:

  fdiv d0, d8, d9
  fdiv d1, d8, d9
  fdiv d2, d8, d9
  fdiv d3, d8, d9
  fdiv d4, d8, d9
  fdiv d5, d8, d9
  fdiv d6, d8, d9
  fdiv d7, d8, d9
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire (01)cycle (02)03090b1e1f3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8acc2branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
8020480046599000057258010010080000100800005006400008020780046800466996437000480100200800002001600008027380046118020110099100100800001000300051106164580037800001008004780047800478010480047
80204800465990000722258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051105165380037800001008004780047800478004780047
80204800466000000992580100100800001008000050064000080027800468004669964370004801002008000020016000080046800461180201100991001008000010001800051107165680037800001008004780047800478004780047
8020480046599000057258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051104164380037800001008004780047800478004780047
8020480046600000057258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051105163580037800001008004780047800478004780047
8020480046600000057258010010080000100800005506400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051105164480037800001008004780047800478004780047
8020480046599000057258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051104165480037800001008004780047800478004780047
80204800466000012057258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000300051105165580037800001008004780047800478004780047
80204800465990000342258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051105165380037800001008004780047800478004780047
8020480046600000078258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051104164480036800001008004780047800478004780047

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire (01)cycle (02)03090b1e3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a8acc5cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
800248004659900057258001010800001080000506400001800278004680046699860370026800102080000201600008004680046118002110910108000010000050201116468003680000108004780047800478004780047
800248004660000088125800101080000108000050640000080027800468004669986037002680010208000020160000800468004611800211091010800001000005020516358003680000108004780047800478004780047
800248004660000372225800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020616668003680000108004780047800478004780047
8002480046600000174425800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020516538003680000108004780047801018004780047
80024800465990005725800101080000108000050640000080027800468004669986037002680010208000020160000800468004611800211091010800001000005020516358003680000108004780047800478004780047
80024800466000005725800101080000108000050640000180027800468004669986037002680144208000020160000800468004611800211091010800001000005020316658003680000108004780047800478004780047
80024800465990005725800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020616758003680000108004780047800478004780047
80024800466000005725800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020616458003680000108004780047800478004780047
80024800466000005725800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020616538003680000108004780047800478004780047
80024800466000005725800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020516358003680000108004780047800478004780047