Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FDIV (vector, 4S)

Test 1: uops

Code:

  fdiv v0.4s, v0.4s, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004803760061672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803761061672125100010001000281300801880378037749437895100010002000813180371110011000073116117677100080388038803880388038
1004803760061672125100010001000281300801880378037749477895100010002000803780371110011000073116117677100080388038803880388038
10048037600251672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
10048037602161672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803760061672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803760061672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803760061672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803760682672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803760061672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038

Test 2: Latency 1->2 (result in operand 1 feeds source operand 2 of the next instruction)

Code:

  fdiv v0.4s, v0.4s, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020480037599000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071001611796770100001008003880038800388003880038
1020480037600000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037600000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037599000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037599000006169669251010010010000100100005002864300800180800378003778362037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037600000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037599000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010001071011611796770100001008003880038800388003880038
1020480037600000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037599000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388007280038
1020480037600000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071001611796770100001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002480037599006169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640316227967710000108003880038800388003880038
100248003759903396169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037599006169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037599006169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037600063145769721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037600006169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759902256169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
10024800375990546169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037599006169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640216127967710000108003880038800388003880038
1002480037599006169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100100640216227967710000108003880038800388003880038

Test 3: Latency 1->3 (result in operand 1 feeds source operand 3 of the next instruction)

Code:

  fdiv v0.4s, v1.4s, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102048003759900000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071031612796770100001008003880038800388003880038
102048003759900000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003760000040008469721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100001071021622796770100001008003880038800388003880038
102048003759900000906169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003760000000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003760000000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100001371021622796770100001008003880038800388003880038
102048003759900000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003760000040006169721251010010010000100100395002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003759900000008269721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003759900000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002480037600084697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880081800388003880038
10024800375990111697212510010101000010100005028643001800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
1002480037599061697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
100248003760062461697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100221091010100001000640216227967710000108003880038800388003880038
1002480037599061697212510010101000010100005028643001800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
1002480037600061697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
1002480037600061697212510010101000010100005028643001800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
1002480037599061697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880085800388003880038
10024800746003968697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
1002480037599061697212510010101000010100005028643001800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038

Test 4: throughput

Count: 8

Code:

  fdiv v0.4s, v8.4s, v9.4s
  fdiv v1.4s, v8.4s, v9.4s
  fdiv v2.4s, v8.4s, v9.4s
  fdiv v3.4s, v8.4s, v9.4s
  fdiv v4.4s, v8.4s, v9.4s
  fdiv v5.4s, v8.4s, v9.4s
  fdiv v6.4s, v8.4s, v9.4s
  fdiv v7.4s, v8.4s, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020480044600000055258010010080000100800005006400001800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511021611800360800001008004580045800458004580045
80204800445991000720258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008009980045800458004580045
8020480044599000055258010010080000100800005006400000800650800448004469964037000280100200800002001600008004480044118020110099100100800001000600511011711800360800001008004580045800458004580045
8020480044600000055258010010080000100800005006400001800250800448004469964037000280153200800002001600008004480044118020110099100100800001003000511011711800360800001008004580045800458004580045
802048004460000002200258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001001600511011611800360800001008004580045800458004580099
8020480044599000055258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008004580045800458004580045
8020480044599000055258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008004580045800458004580045
8020480098599000055258010010080000100800005006400000800250800448004470000037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008004580045800458004580045
8020480044600000076258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008004580045800458004580045
8020480044600000055258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008004580045800458004580045

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024800445990552580010108000010800005064000080025800448004469986370024800102080000201600008004480044118002110910108000010005020131621088003580000108004580045800458004580045
8002480044600055258001010800001080000506400008006880044800446998637002480010208000020160000800448004411800211091010800001000502013160998003580000108004580045800458004580045
800248025960005525800101080000108000050640000800258004480044699863700248001020800002016000080044800441180021109101080000100050201116010118003580000108004580045800458004580045
80024800446000552580010108000010800005064000080025800448004469986370024800102080000201600008004480044118002110910108000010005020101608118003580000108004580045800458004580045
80024800445990552580010108000010800005064000080025800448004469986370024800412080000201600008004480044118002110910108000010205020121608138003580000108009980045800458004580097
80024800446009552580010108000010800005064000080025800448004469986370024800102080000201600008004480044118002110910108000010005020111607128003580000108004580045800458004580045
80024800446009720258001010800001080000506400008002580044800446998627700248001020800002016000080044800441180021109101080000100050207160998003580000108004580045800458004580045
8002480044599396025800101080000108000050640000800258004480044699863700248001020800002016000080044800441180021109101080000100050201116011108003580000108004580045800458004580045
800248004459905525800101080000108000050640000800258004480044699863700248001020800002016000080044800441180021109101080000100050201116011128003580000108004580045800458004580045
800248004459905525800391080000108000050640000800258004480044699863700248001020800002016000080044800441180021109101080000100050201216011118003580000108004580045800458004580045