Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FDIV (scalar, H)

Test 1: uops

Code:

  fdiv h0, h0, h1
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004703752006157572510001000100024595717018703770376543368951000100020007037703711100110000073216436726100070387038703870387038
1004703753006157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
1004703753006157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
1004703752006157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
10047037530061575725100010001000245957170187037703765433689510001000200070377037111001100006073316336726100070387038703870387038
1004703753006157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
10047037530246157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
10047037530126157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
1004703753006157572510001000100024595717018703770376543368951000100020007037703711100110000073316336726100070387038703870387038
1004703753006157572510001000100024595717018703770376543368951000100020007037703711100110000073216336726100070387038703870387038

Test 2: Latency 1->2

Code:

  fdiv h0, h0, h1
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204700375240000005040615975725101001001000010010000500250495707001870037700376839336874510100200100002002000070037700371110201100991001001000010000000000071011611697250100001007003870038700387003870038
10204700375240000004020615975725101001001000010010000500250495707001870037700376839336874510100200100002002000070085700371110201100991001001000010000000062071011611697250100001007003870038701817003870086
1020470225543000000600061597572510100100100121221011750025049570700187031970273683933687451010020010000200200007003770037111020110099100100100001000000000409326137227031225100001007074770794707987074870748
102047075054811211615247813208259757251019110010000100103125002504957070198700377013268393369253101002001067320020000700377079221102011009910010010000100080114894020819516126972525100001007003870798700387003870038
102047003754300003102199504323359597320101001001000010010000500250495707001870037700376839336874510100200100002002000070037700371110201100991001001000010000000000071011611697250100001007003870038700387003870038
10204700375250000002910615975744101001001000010010000605250495707001870037700376839336874510100200100002002011470037700371110201100991001001000010000300000071011621697250100001007003870038700387003870038
10204700375250000005010615974725101001001000010010000500250495707001870037700376839336874510100200100002002021870085700371110201100991001001000010000000000071011611697250100001007003870038700387003870038
102047003752400000039388615975725101001001000010010000500250495707001870037700376839336874510100200100002002011270037700371110201100991001001000010000000000071011611697250100001007003870038700387003870038
102047003752400000051388825975725101001001000010010000500250495707001870037700376853236874510100200100002002000070037700371110201100991001001000010000000000071011611697250100001007003870038700387008670038
102047003752500001046502935975725101001001000010010000500250495707001870037700376839336874510100200100002002000070037700371110201100991001001000010000001000071011612697250100001007003870038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024700375240074759757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108600640216226979210000107003870038700387003870038
1002470037525006159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108000640216226972610000107003870038700387003870038
1002470037525006159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000107130640216226972610000107003870038700387003870038
1002470037525006159757251001010100001010000502504957070018700377003768434368767100102010000202000070037700371110021109101010000103700640216226972610000107003870038700387003870038
1002470037524096159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108500640216226972610000107003870038700387003870038
1002470037524006159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000107701640216226972610000107003870038700387003870038
10024700375250031359757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108400640216226972610000107003870038700387003870038
10024700375250053659757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000107600640216226972610000107003870038700387003870038
1002470037525006159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108000640216226972610000107003870038700387003870038
1002470037524006159757251001010100001010000502504957070018700377003768415368767100102010000202000070037700371110021109101010000108600640216226972610000107003870038700387003870038

Test 3: Latency 1->3

Code:

  fdiv h0, h1, h0
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102047003752500061597572510100100100001001000050025049570700187003770037683933687451010020010000200200007003770037111020110099100100100001008200071011611697250100001007003870038700387003870038
102047003752400061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001008300071011611697250100001007003870038700387003870038
1020470037524000126597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770084111020110099100100100001008000071011611697250100001007003870038700387003870038
10204700375250675061597572510100100100001001000050025049570700187003770037683933687451010020010000200200007007070037111020110099100100100001008300071011611697250100001007003870038700387003870038
102047003752500061597572510100100100161001000050025049570700187003770037683933687451010020010058200200007003770037111020110099100100100001008100071011611697250100001007003870038700387003870038
102047003752400061597572510100100100001001000050025049571700187003770037683933687451010020010000200200007003770037111020110099100100100001007800071011611697250100001007003870038700387003870038
1020470037525024061597572510100100100001001000050025049570700187003770037683933687451010020010000200200007003770037111020110099100100100001007900071011611697250100001007003870038700387003870038
10204700375250006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100090071011611697250100001007003870038700387003870038
10204700375240006159757251010010010000100100005002504957070018700377003768393368745101002001000020020000700377003711102011009910010010000100090071011611697250100001007003870038700387003870038
10204700375240006159757251010010010000100100005002504957070018700377003768393368745101002001000020020000700377003711102011009910010010000100000071011611697250100001007007170038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002470037524000271859757251001010100001010000502504957170018070037700376841536876710010201000020200007003770037111002110910101000010000064310161276972610000107003870038700387003870038
1002470037525000210859757251001010100001010000502504957070018070180700376841536876710010201000020200007008470037111002110910101000010000064351610106972610000107003870038700387003870038
10024700375250002588597572510010101000010100005025049570700180700377003768415368767100102010000202000070037700371110021109101010000100000643121610106972610000107003870038700857003870038
10024700375250902548597572510010101000010100005025049570700180700377003768415368767100102010000202000070037700371110021109101010000100000643111610116972610000107003870038700387003870038
10024700375250002411597572510010101000010100005025049570700180700377003768415368767100102010000202000070037700371110021109101010000100000643111610106972610000107003870038700387003870038
10024700375250002567597572510010101000010100005025049570700180700377003768451368767100102010000202000070037700371110021109101010000100000643101610106972610000107003870038700387003870038
10024700375250002230759757251001010100001010039502504957070018070037700376841536876710010201000020200007003770037111002110910101000010000064310165106972610000107003870038700387003870038
100247003752500026165975725100101010000101000050250495707001807003770037684153687671001020100002020000700377003711100211091010100001000306435161256972610000107003870038700387003870038
10024700375250002196659757251001010100001110000502504957070018070037700376841536876710010201000020200007003770037111002110910101000010000064312161276972610000107003870038700387003870038
10024700375250002261597572510010101000010100005025049570700180700377003768415368767100102010000202000070037700371110021109101010000100000643101610106972610000107003870038700387003870038

Test 4: throughput

Count: 8

Code:

  fdiv h0, h8, h9
  fdiv h1, h8, h9
  fdiv h2, h8, h9
  fdiv h3, h8, h9
  fdiv h4, h8, h9
  fdiv h5, h8, h9
  fdiv h6, h8, h9
  fdiv h7, h8, h9
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020480043601000000000549680100100800001008000050064000080024802048004369964370001801002008000020016000080043800431180201100991001008000010000000511061611800350800001008004480044800448004480044
802048004360010000012004342580100100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011611800350800001008004480044800448004480044
8020480043599000000000542580100100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011611800350800001008004480044800448004480044
8020480043600000000000542580100100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011642800350800001008004480044800448004480044
80204800435990000000008772580100100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011612800350800001008004480044800448025980044
8020480043600000000513520542580100100801121008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011611800350800001008004480044800448004480044
8020480043600000000000542580209100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010000000511011611800780800001008025680044800448004480044
8020480043600000000000542580100100800001008000050064000080192800438004369964370001801002008000020016000080043800432180201100991001008000010000000516211611800350800001008004480044800448004480044
8020480043599000000000549780100100800001008000050064000080024800438004369964370001801002008000020016000080043800431180201100991001008000010020000511011611800350800001008004480044800448004480044
8020480043600010000000542580100100800001008000050064000080024800438004369964370001801002008000020016000080043800435180201100991001008000010000030511011611800350800001008004480044800448004480044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 18 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800248004360000000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180022109101080000100050202161180036080000108004480044800448004480044
800248004359900000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044
800248004359900000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044
800248004360000000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044
8002480043599000005292580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044
800248004360000000542580010108000011800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044
800248004359900000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050203161180036080000108004480044800448004480044
800248004359900000542580010108000010800005064000018002480043800436998603700238001020800002016000080043800431180021109101080000100050201162280036080000108004480044800448004480044
800248004359900000542580010108000010800005064000008002480043800436998603700238001020800002016000080043800431180021109101080000100050202162280036080000108004480044800448004480044
800248004360000000542580010108000010800005064000008002480043800436998603700238001020800002016000080043800431180021109101080000100050201161180036080000108004480044800448004480044