Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FDIV (vector, 8H)

Test 1: uops

Code:

  fdiv v0.8h, v0.8h, v1.8h
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10047037530765757251000100010002459577018703770376543368951000100020007037703711100110000073116116726100070387038703870387038
100470375306157572510001000100024595770187037703765433689510001000200070377037111001100001273116116726100070387038703870387038
10047037530615757251000100010002459577018703770376543368951000100020007037703711100110000673116116726100070387038703870387038
10047037530615757251000100010002459577018703770376543368951000100020007037703711100110000073116116726100070387038703870387038
10047037533615757251000100010002459577018703770376543368951000100020007037703711100110000073116116726100070387038703870387038
10047037530615757251000100010002459577018703770376543368951000100020007037703711100110000373116116726100070387038703870387038
100470375206157572510001000100024595770187037703765433689510001000200070377037111001100009673116116726100070387038703870387038
10047037520615757251000100010002459577018703770376543368951000100020007037703711100110000073116116726100070387038703870387038
10047037530615757251004100010002462587018703770376543368951000100020007037703711100110005073116116726100070387038703870387038
10047037530615757251000100010002459577018703770376543368951000100020007037703711100110000373116116726100070387038703870387038

Test 2: Latency 1->2

Code:

  fdiv v0.8h, v0.8h, v1.8h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020470037524033023259757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
102047003752400061597572510100100100001001000050025049571700187003770037683931268745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
102047003752500072659757251010010010000100100005002504957070018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375240006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375250006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375250006159739251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
102047003752500072659757251010010010000108100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375250006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375240006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700847003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375250006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024700375244086159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640316226972610000107003870038700387003870038
1002470037525426159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
1002470037524426159757251001010100001010000502504957700187003770037684153687671001020100002020000700377008411100211091010100001000000100640216226972610000107003870038700387003870038
10024700375253034659757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
10024700375255376159757441001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
1002470037524366159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
1002470070525426159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
100247003752566159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
1002470037525306159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
10024700375245376159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038

Test 3: Latency 1->3

Code:

  fdiv v0.8h, v1.8h, v0.8h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204700375250448061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
10204700375240000156597572510100100100041001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
1020470037525000084597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
1020470037525000061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007100161169725100001007003870038700387003870038
1020470037524000061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
10204700375244000726597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
1020470037524000061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
1020470037525000061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
10204700375250033082597572510100100100001001000050025049571700187003770037684103687451010020210000202200007008470131111020110099100100100001000007101161169725100001007003870038700387003870038
1020470037524000061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002470037524061597572510010101000010100005025049571700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
1002470037524061597572510010101000010100005025049570700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
10024700375240825973925100101010000101000050250495707001870084700376841536876710010201000020200007003770037211002110910101000010018640217226972610000107003870038700387003870038
1002470037525061597572510010101000010100005025049571700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870070
1002470037525061597572510010101000010100005025049570700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
10024700375250536597392510010101000010100005025049571700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
1002470037525061597572510010101000010100005025049571700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
10024700375240172597572510010101000010100005025049570700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
1002470037525061597572510010101000010100005025049571700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
10024700375250654597572510010101000010100005025049570700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038

Test 4: throughput

Count: 8

Code:

  fdiv v0.8h, v8.8h, v9.8h
  fdiv v1.8h, v8.8h, v9.8h
  fdiv v2.8h, v8.8h, v9.8h
  fdiv v3.8h, v8.8h, v9.8h
  fdiv v4.8h, v8.8h, v9.8h
  fdiv v5.8h, v8.8h, v9.8h
  fdiv v6.8h, v8.8h, v9.8h
  fdiv v7.8h, v8.8h, v9.8h
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020480043600100870305258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051106161180035800001008004480044800448004480044
80204800436000000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448004480044
80204800436000000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161280035800001008004480044800448004480044
80204800435990000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448004480044
80204800435990000054258010010080000100800005006400008002480043800436996437000180100200800562001600008004380043118020110099100100800001000000051101161080035800001008004480044800448004480044
80204800436200000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051311161180035800001008004480044800448004480044
80204800436000000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448004480044
80204800436000000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448025780044
80204800435990100054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448004480044
80204800436000000075258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448004480044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002480043599084425800101080000108002950640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502030291628288003600080000108004480044800448004480044
8002480043599018225800101080000108000050640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502000261628288003600080000108004480044800448004480044
8002480043600019425800101080000108000050640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502000281614288003600080000108004480044800448004480044
80024800435996062625800101080000108000050640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502000281613288003600080000108004480044800448004480095
80024800436000812580010108000010800005064000018002408004380043699863700238001020800002016000080043800431180021109101080000100000050200029162727800362211980000108004480044800448004480044
8002480150599025725800101080000108000050640000180024080097800436998637002380010208000020160000800438004311800211091010800001000000502000191628288003600080000108004480044800448004480044
8002480043600022625800101080000108000050640000080024080043800436998637002380010208000020160000800438004311800211091010800001000000502000141628148003600080000108004480044800448004480044
8002480043599018425800101080000108000050640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502000291728278003600080000108004480044800448004480044
800248004360005425800101080000108000050640000080024080043800436998637002380010208000020160000800438004311800211091010800001000000502000271628298003600080000108004480044800448004480044
800248004359907725800101080000108000050640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502000271630308003600080000108004480044800448004480044