Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FDIV (scalar, D)

Test 1: uops

Code:

  fdiv d0, d0, d1
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10041003775006186492510001000100035198011001810037100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038
10041003775006186492510001000100035198001001810037100379396398951000100020001003710037111001100000050007321622957910001003810038100381003810038
10041003775006186492510001000100035198011001810037100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038
100410037750036786492510001000100035198011001810037100379396398951000100020001003710037111001100000200007321622957910001003810038100381003810038
10041003776006186492510001000100035198011001810037100379396398951000100020001008410037111001100000000007321622957910001003810038100381003810038
10041003775006186492510001000100035198001001810037100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038
10041003775006186492510001000100035198011001810037100379396398951000100020001003710037111001100000020007321622957910001003810038100381003810038
10041003775006186492510001000100035198011001810037100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038
10041003775006186492510001000100035198011001810037100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038
10041003775006186492510001000100035198001001810084100379396398951000100020001003710037111001100000000007321622957910001003810038100381003810038

Test 2: Latency 1->2

Code:

  fdiv d0, d0, d1
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204100037749000148896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
1020410003774900061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
1020410003774900061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
1020410003774901800726896492510100100100001001000050035829801100018100037100037982463987451010020010000202200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
102041000377500267061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000372110201100991001001000010000007101161199579010000100100038100038100038100038100038
102041000377490267061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100276
1020410003774902640726896492510100100100001001000050035829800100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
102041000377500438061896492510100100100001001000050035829800100018100037100037982463987451010020010000200200001000841000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
102041000377490306061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038
102041000377490315061896492510100100100001001000050035829801100018100037100037982463987451010020010000200200001000371000371110201100991001001000010000007101161199579010000100100038100038100038100038100038

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0037

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 09 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241000377501000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
10024100037750000064148964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001020064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
100241000377500000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
1002410003774900006189649251001010100001010000503582980100018100037100037982682598767100102010000202000010003710003711100211091010100001030064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000064021622995791000010100038100038100038100038100038
100241000377490000618964925100101010000101000050358298010001810003710003798268398767100102010000202000010003710003711100211091010100001000164021622995791000010100038100038100038100038100038

Test 3: Latency 1->3

Code:

  fdiv d0, d1, d0
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041000377490006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379824639884110100200100002002000010007610003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000037101161199579010000100100038100038100038100038100038
1020410003775000025189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102021009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
1020410003774900072689649251010010010000100100005003582980010001801000371000379824679874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379833039874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377500006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038
102041000377490006189649251010010010000100100005003582980010001801000371000379824639874510100200100002002000010003710003711102011009910010010000100000007101161199579010000100100038100038100038100038100038

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0037

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241000377490000000082896492510010101000010100005035829801100018010003710003798268398803100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035833010100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100251000377490000000082896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
1002410003774900000000272896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000662216229957901000010100038100038100038100038100038
10024100037749000000001114896492510010101000010100005035829800100018010003710003798268398767100102010000202000010003710003711100211091010100001000000640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035829800100018010003710008398268398767100102010000202000010003710003711100211091010100001000030640216229957901000010100038100038100038100038100038
100241000377490000000061896492510010101000010100005035829801100018010003710003798268398767100102010000202000010003710003711100211091010100001001260640216229957901000010100038100038100038100038100038

Test 4: throughput

Count: 8

Code:

  fdiv d0, d8, d9
  fdiv d1, d8, d9
  fdiv d2, d8, d9
  fdiv d3, d8, d9
  fdiv d4, d8, d9
  fdiv d5, d8, d9
  fdiv d6, d8, d9
  fdiv d7, d8, d9
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020480046599000057258010010080000100800005006400008020780046800466996437000480100200800002001600008027380046118020110099100100800001000300051106164580037800001008004780047800478010480047
80204800465990000722258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051105165380037800001008004780047800478004780047
80204800466000000992580100100800001008000050064000080027800468004669964370004801002008000020016000080046800461180201100991001008000010001800051107165680037800001008004780047800478004780047
8020480046599000057258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051104164380037800001008004780047800478004780047
8020480046600000057258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051105163580037800001008004780047800478004780047
8020480046600000057258010010080000100800005506400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051105164480037800001008004780047800478004780047
8020480046599000057258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051104165480037800001008004780047800478004780047
80204800466000012057258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000300051105165580037800001008004780047800478004780047
80204800465990000342258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051105165380037800001008004780047800478004780047
8020480046600000078258010010080000100800005006400008002780046800466996437000480100200800002001600008004680046118020110099100100800001000000051104164480036800001008004780047800478004780047

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800248004659900057258001010800001080000506400001800278004680046699860370026800102080000201600008004680046118002110910108000010000050201116468003680000108004780047800478004780047
800248004660000088125800101080000108000050640000080027800468004669986037002680010208000020160000800468004611800211091010800001000005020516358003680000108004780047800478004780047
800248004660000372225800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020616668003680000108004780047800478004780047
8002480046600000174425800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020516538003680000108004780047801018004780047
80024800465990005725800101080000108000050640000080027800468004669986037002680010208000020160000800468004611800211091010800001000005020516358003680000108004780047800478004780047
80024800466000005725800101080000108000050640000180027800468004669986037002680144208000020160000800468004611800211091010800001000005020316658003680000108004780047800478004780047
80024800465990005725800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020616758003680000108004780047800478004780047
80024800466000005725800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020616458003680000108004780047800478004780047
80024800466000005725800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020616538003680000108004780047800478004780047
80024800466000005725800101080000108000050640000180027800468004669986037002680010208000020160000800468004611800211091010800001000005020516358003680000108004780047800478004780047