Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FDIV (vector, 8H)

Test 1: uops

Code:

  fdiv v0.8h, v0.8h, v1.8h
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10047037530765757251000100010002459577018703770376543368951000100020007037703711100110000073116116726100070387038703870387038
100470375306157572510001000100024595770187037703765433689510001000200070377037111001100001273116116726100070387038703870387038
10047037530615757251000100010002459577018703770376543368951000100020007037703711100110000673116116726100070387038703870387038
10047037530615757251000100010002459577018703770376543368951000100020007037703711100110000073116116726100070387038703870387038
10047037533615757251000100010002459577018703770376543368951000100020007037703711100110000073116116726100070387038703870387038
10047037530615757251000100010002459577018703770376543368951000100020007037703711100110000373116116726100070387038703870387038
100470375206157572510001000100024595770187037703765433689510001000200070377037111001100009673116116726100070387038703870387038
10047037520615757251000100010002459577018703770376543368951000100020007037703711100110000073116116726100070387038703870387038
10047037530615757251004100010002462587018703770376543368951000100020007037703711100110005073116116726100070387038703870387038
10047037530615757251000100010002459577018703770376543368951000100020007037703711100110000373116116726100070387038703870387038

Test 2: Latency 1->2

Code:

  fdiv v0.8h, v0.8h, v1.8h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 08 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020470037524033023259757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
102047003752400061597572510100100100001001000050025049571700187003770037683931268745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
102047003752500072659757251010010010000100100005002504957070018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375240006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375250006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375250006159739251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
102047003752500072659757251010010010000108100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375250006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375240006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700847003711102011009910010010000100007101161169725100001007003870038700387003870038
10204700375250006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100007101161169725100001007003870038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024700375244086159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640316226972610000107003870038700387003870038
1002470037525426159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
1002470037524426159757251001010100001010000502504957700187003770037684153687671001020100002020000700377008411100211091010100001000000100640216226972610000107003870038700387003870038
10024700375253034659757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
10024700375255376159757441001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
1002470037524366159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
1002470070525426159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
100247003752566159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
1002470037525306159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038
10024700375245376159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001000000000640216226972610000107003870038700387003870038

Test 3: Latency 1->3

Code:

  fdiv v0.8h, v1.8h, v0.8h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204700375250448061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
10204700375240000156597572510100100100041001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
1020470037525000084597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
1020470037525000061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007100161169725100001007003870038700387003870038
1020470037524000061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
10204700375244000726597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
1020470037524000061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
1020470037525000061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038
10204700375250033082597572510100100100001001000050025049571700187003770037684103687451010020210000202200007008470131111020110099100100100001000007101161169725100001007003870038700387003870038
1020470037524000061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001000007101161169725100001007003870038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002470037524061597572510010101000010100005025049571700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
1002470037524061597572510010101000010100005025049570700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
10024700375240825973925100101010000101000050250495707001870084700376841536876710010201000020200007003770037211002110910101000010018640217226972610000107003870038700387003870038
1002470037525061597572510010101000010100005025049571700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870070
1002470037525061597572510010101000010100005025049570700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
10024700375250536597392510010101000010100005025049571700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
1002470037525061597572510010101000010100005025049571700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
10024700375240172597572510010101000010100005025049570700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
1002470037525061597572510010101000010100005025049571700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038
10024700375250654597572510010101000010100005025049570700187003770037684153687671001020100002020000700377003711100211091010100001000640216226972610000107003870038700387003870038

Test 4: throughput

Count: 8

Code:

  fdiv v0.8h, v8.8h, v9.8h
  fdiv v1.8h, v8.8h, v9.8h
  fdiv v2.8h, v8.8h, v9.8h
  fdiv v3.8h, v8.8h, v9.8h
  fdiv v4.8h, v8.8h, v9.8h
  fdiv v5.8h, v8.8h, v9.8h
  fdiv v6.8h, v8.8h, v9.8h
  fdiv v7.8h, v8.8h, v9.8h
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 04 | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020480043600100870305258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051106161180035800001008004480044800448004480044
80204800436000000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448004480044
80204800436000000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161280035800001008004480044800448004480044
80204800435990000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448004480044
80204800435990000054258010010080000100800005006400008002480043800436996437000180100200800562001600008004380043118020110099100100800001000000051101161080035800001008004480044800448004480044
80204800436200000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051311161180035800001008004480044800448004480044
80204800436000000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448004480044
80204800436000000054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448025780044
80204800435990100054258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448004480044
80204800436000000075258010010080000100800005006400008002480043800436996437000180100200800002001600008004380043118020110099100100800001000000051101161180035800001008004480044800448004480044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | branch mispredict (cb) | cf | d0 | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002480043599084425800101080000108002950640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502030291628288003600080000108004480044800448004480044
8002480043599018225800101080000108000050640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502000261628288003600080000108004480044800448004480044
8002480043600019425800101080000108000050640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502000281614288003600080000108004480044800448004480044
80024800435996062625800101080000108000050640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502000281613288003600080000108004480044800448004480095
80024800436000812580010108000010800005064000018002408004380043699863700238001020800002016000080043800431180021109101080000100000050200029162727800362211980000108004480044800448004480044
8002480150599025725800101080000108000050640000180024080097800436998637002380010208000020160000800438004311800211091010800001000000502000191628288003600080000108004480044800448004480044
8002480043600022625800101080000108000050640000080024080043800436998637002380010208000020160000800438004311800211091010800001000000502000141628148003600080000108004480044800448004480044
8002480043599018425800101080000108000050640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502000291728278003600080000108004480044800448004480044
800248004360005425800101080000108000050640000080024080043800436998637002380010208000020160000800438004311800211091010800001000000502000271628298003600080000108004480044800448004480044
800248004359907725800101080000108000050640000180024080043800436998637002380010208000020160000800438004311800211091010800001000000502000271630308003600080000108004480044800448004480044