Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm: Overview | Base Instructions | SIMD and FP Instructions

FDIV (scalar, H)

Test 1: uops

Code:

  fdiv h0, h0, h1
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
1004703752006157572510001000100024595717018703770376543368951000100020007037703711100110000073216436726100070387038703870387038
1004703753006157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
1004703753006157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
1004703752006157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
10047037530061575725100010001000245957170187037703765433689510001000200070377037111001100006073316336726100070387038703870387038
1004703753006157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
10047037530246157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
10047037530126157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
1004703753006157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
1004703753006157572510001000100024595717018703770376543368951000100020007037703711100110000073216336726100070387038703870387038

Test 2: Latency 1->2 (result fed back as the first source operand)

Code:

  fdiv h0, h0, h1
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204700375240000005040615975725101001001000010010000500250495707001870037700376839336874510100200100002002000070037700371110201100991001001000010000000000071011611697250100001007003870038700387003870038
10204700375240000004020615975725101001001000010010000500250495707001870037700376839336874510100200100002002000070085700371110201100991001001000010000000062071011611697250100001007003870038701817003870086
1020470225543000000600061597572510100100100121221011750025049570700187031970273683933687451010020010000200200007003770037111020110099100100100001000000000409326137227031225100001007074770794707987074870748
102047075054811211615247813208259757251019110010000100103125002504957070198700377013268393369253101002001067320020000700377079221102011009910010010000100080114894020819516126972525100001007003870798700387003870038
102047003754300003102199504323359597320101001001000010010000500250495707001870037700376839336874510100200100002002000070037700371110201100991001001000010000000000071011611697250100001007003870038700387003870038
10204700375250000002910615975744101001001000010010000605250495707001870037700376839336874510100200100002002011470037700371110201100991001001000010000300000071011621697250100001007003870038700387003870038
10204700375250000005010615974725101001001000010010000500250495707001870037700376839336874510100200100002002021870085700371110201100991001001000010000000000071011611697250100001007003870038700387003870038
102047003752400000039388615975725101001001000010010000500250495707001870037700376839336874510100200100002002011270037700371110201100991001001000010000000000071011611697250100001007003870038700387003870038
102047003752400000051388825975725101001001000010010000500250495707001870037700376853236874510100200100002002000070037700371110201100991001001000010000000000071011611697250100001007003870038700387008670038
102047003752500001046502935975725101001001000010010000500250495707001870037700376839336874510100200100002002000070037700371110201100991001001000010000001000071011612697250100001007003870038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 08 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024700375240074759757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108600640216226979210000107003870038700387003870038
1002470037525006159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108000640216226972610000107003870038700387003870038
1002470037525006159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000107130640216226972610000107003870038700387003870038
1002470037525006159757251001010100001010000502504957070018700377003768434368767100102010000202000070037700371110021109101010000103700640216226972610000107003870038700387003870038
1002470037524096159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108500640216226972610000107003870038700387003870038
1002470037524006159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000107701640216226972610000107003870038700387003870038
10024700375250031359757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108400640216226972610000107003870038700387003870038
10024700375250053659757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000107600640216226972610000107003870038700387003870038
1002470037525006159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108000640216226972610000107003870038700387003870038
1002470037524006159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108600640216226972610000107003870038700387003870038

Test 3: Latency 1->3 (result fed back as the second source operand)

Code:

  fdiv h0, h1, h0
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 0b | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102047003752500061597572510100100100001001000050025049570700187003770037683933687451010020010000200200007003770037111020110099100100100001008200071011611697250100001007003870038700387003870038
102047003752400061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001008300071011611697250100001007003870038700387003870038
1020470037524000126597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770084111020110099100100100001008000071011611697250100001007003870038700387003870038
10204700375250675061597572510100100100001001000050025049570700187003770037683933687451010020010000200200007007070037111020110099100100100001008300071011611697250100001007003870038700387003870038
102047003752500061597572510100100100161001000050025049570700187003770037683933687451010020010058200200007003770037111020110099100100100001008100071011611697250100001007003870038700387003870038
102047003752400061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001007800071011611697250100001007003870038700387003870038
1020470037525024061597572510100100100001001000050025049570700187003770037683933687451010020010000200200007003770037111020110099100100100001007900071011611697250100001007003870038700387003870038
10204700375250006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100090071011611697250100001007003870038700387003870038
10204700375240006159757251010010010000100100005002504957070018700377003768393368745101002001000020020000700377003711102011009910010010000100090071011611697250100001007003870038700387003870038
10204700375240006159757251010010010000100100005002504957070018700377003768393368745101002001000020020000700377003711102011009910010010000100000071011611697250100001007007170038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 0b | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002470037524000271859757251001010100001010000502504957170018070037700376841536876710010201000020200007003770037111002110910101000010000064310161276972610000107003870038700387003870038
1002470037525000210859757251001010100001010000502504957070018070180700376841536876710010201000020200007008470037111002110910101000010000064351610106972610000107003870038700387003870038
10024700375250002588597572510010101000010100005025049570700180700377003768415368767100102010000202000070037700371110021109101010000100000643121610106972610000107003870038700857003870038
10024700375250902548597572510010101000010100005025049570700180700377003768415368767100102010000202000070037700371110021109101010000100000643111610116972610000107003870038700387003870038
10024700375250002411597572510010101000010100005025049570700180700377003768415368767100102010000202000070037700371110021109101010000100000643111610106972610000107003870038700387003870038
10024700375250002567597572510010101000010100005025049570700180700377003768451368767100102010000202000070037700371110021109101010000100000643101610106972610000107003870038700387003870038
10024700375250002230759757251001010100001010039502504957070018070037700376841536876710010201000020200007003770037111002110910101000010000064310165106972610000107003870038700387003870038
100247003752500026165975725100101010000101000050250495707001807003770037684153687671001020100002020000700377003711100211091010100001000306435161256972610000107003870038700387003870038
10024700375250002196659757251001010100001110000502504957070018070037700376841536876710010201000020200007003770037111002110910101000010000064312161276972610000107003870038700387003870038
10024700375250002261597572510010101000010100005025049570700180700377003768415368767100102010000202000070037700371110021109101010000100000643101610106972610000107003870038700387003870038

Test 4: throughput

Count: 8

Code:

  fdiv h0, h8, h9
  fdiv h1, h8, h9
  fdiv h2, h8, h9
  fdiv h3, h8, h9
  fdiv h4, h8, h9
  fdiv h5, h8, h9
  fdiv h6, h8, h9
  fdiv h7, h8, h9
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020480043601000000000549680100100800001008000050064000080024802048004369964370001801002008000020016000080043800431180201100991001008000010000000511061611800350800001008004480044800448004480044
802048004360010000012004342580100100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011611800350800001008004480044800448004480044
8020480043599000000000542580100100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011611800350800001008004480044800448004480044
8020480043600000000000542580100100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011642800350800001008004480044800448004480044
80204800435990000000008772580100100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011612800350800001008004480044800448025980044
8020480043600000000513520542580100100801121008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011611800350800001008004480044800448004480044
8020480043600000000000542580209100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011611800780800001008025680044800448004480044
8020480043600000000000542580100100800001008000050064000080192800438004369964370001801002008000020016000080043800432180201100991001008000010000000516211611800350800001008004480044800448004480044
8020480043599000000000549780100100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010020000511011611800350800001008004480044800448004480044
8020480043600010000000542580100100800001008000050064000080024800438004369964370001801002008000020016000080043800435180201100991001008000010000030511011611800350800001008004480044800448004480044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 18 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800248004360000000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180022109101080000100050202161180036080000108004480044800448004480044
800248004359900000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044
800248004359900000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044
800248004360000000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044
8002480043599000005292580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044
800248004360000000542580010108000011800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044
800248004359900000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050203161180036080000108004480044800448004480044
800248004359900000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201162280036080000108004480044800448004480044
800248004359900000542580010108000010800005064000008002480043800436998603700238001020800002016000080043800431180021109101080000100050202162280036080000108004480044800448004480044
800248004360000000542580010108000010800005064000008002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044