Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FDIV (vector, 4H)

Test 1: uops

Code:

  fdiv v0.4h, v0.4h, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10047037530615757251000100010002459571701870377037654336895100010002000703770371110011000000373116116726100070387038703870387038
10047037520615757251000100010002459571701870377037654336895100010002000703770371110011000000073116116726100070387038703870387038
10047037530615757251000100010002459571701870377037654336895100010002000703770371110011000001073116116726100070387038703870387038
100470375306157572510001000100024595717018703770376543368951000100020007037703711100110000001273116116726100070387038703870387038
10047037530615757251000100010002459571701870377037654336895100010002000703770371110011000001073116116726100070387038703870387038
100470375215615757251000100010002459571701870377037654336895100010002000703770371110011000000073116116726100070387038703870387038
10047037530615757251000100010002459571701870377037654336895100010002000703770371110011000000073116116726100070387038703870387038
10047037530615757251000100010002459571701870377037654336895100010002000703770371110011000000073116116726100070387038703870387038
10047037520615757251000100010002459571701870377037654336895100010002000703770371110011000200373116116726100070387038703870387038
10047037520615757251000100010002459571701870377037654336895100010002000703770371110011000010073116116726100070387038703870387038

Test 2: Latency 1->2

Code:

  fdiv v0.4h, v0.4h, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102047003752400401061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000001071011611697250100001007003870038700387003870038
1020470037524000000615975725101001001000010010000500250495717001870037700376839303687451010020010000200200007003770037111020110099100100100001000000000710116116972514100001007003870038700387003870038
102047003752500000061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071011611697250100001007003870038700387003870038
102047003752400000061597572510100100100001001000050025049571701267003770037683930368745101462001000020020000700377003711102011009910010010000100000000071011610697250100001007003870038700387003870038
1020470037524010300286597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071011611697250100001007003870038700387003870038
102047003752500000061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071011601697250100001007003870038700387003870038
102047003752500000061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071021613697250100001007003870038700387003870038
102047003752500000061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071011611697250100001007003870038700387003870038
102047003752400000061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071011611697250100001007003870038700387003870038
1020470037524000000536597572510100100100001001000050025049571700187003770037683933368745101002001000020020000700377003711102011009910010010000100000000071011611697250100001007003870038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100247003752400000615975743100101010000101000050250495717001870037700376841536876710010201000020200007003770037511002110910101000010000000006402162269726010000107003870038700387013270038
100247003752500000615975725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
10024700375250003907265975725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
100247003752500000615975725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
100247003752400000615975725100271010000101000050250495707005470037700376841536876710010201022220200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
1002470037525000007265975725100101010000131000050250495707016270037700376841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
1002470180524000001245975725100101010000101000050250495707001870037702276841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870228
100247003752400000615975725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010400000006402162269726010000107003870038700387003870038
100247003752500066264615975725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
100247003752500000615971725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010000090006402162269726010000107003870038700387003870038

Test 3: Latency 1->3

Code:

  fdiv v0.4h, v1.4h, v0.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss instruction (0a) | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204700375240000615975725101001001000010010000500250495707001870037700376839336874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038
102047003752400210615975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038
10204700375250000615975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038
102047003752500001345975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010022200007102162269726100001007003870038700387003870038
10204700375241000615975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000000017102162269725100001007003870038700387003870038
102047003752502121087905974725101001001000010010000500250495717001870037700376844436874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038
1020470037525001201035975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000008779007102162269725100001007003870038700387003870038
10204700375240000615975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038
102047003752500006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100000021007102162269725100001007003870038700387003870038
10204700375250090615975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024700375240006159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001077000640216246972610000107003870038700387003870179
1002470037525010118559757301001010100001010000502504957700187003770037684156687671001020100572020000700377013311100211091010100001016224900640216226972610000107003870038700387003870038
1002470037525000615975725100101010000121000050250495770018700377003768415368767100102010000202000070037700371110021109101010000103719200640216226972610000107003870038700387003870038
1002470084525004510359757251001010100001010000502505258700537003770037684153687671001020100002020000700377003711100211091010100001037000640216226972610000107003870038700387003870038
100247003752400246159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001001200640216226972610000107003870038700387003870038
1002470037525000615975725100101010000101000050250495770018700377003768415368767100102010000202000070037700372110021109101010000107000640216226972610000107003870038700387003870038
10024700375240006159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001079000640216226972610000107003870038700387003870038
1002470037524000615975725100101010000101000050250495770018700377003768415368767100102010000202000070037700371110021109101010000104000640216226972610000107003870038700387003870038
1002470037524000615975725100101010000101000050250495770018700377003768415368767100102010000202000070037700371110021109101010000100300640216226972610000107003870038700387003870038
10024700375250006159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001080000640216226972610000107003870038700387003870038

Test 4: throughput

Count: 8

Code:

  fdiv v0.4h, v8.4h, v9.4h
  fdiv v1.4h, v8.4h, v9.4h
  fdiv v2.4h, v8.4h, v9.4h
  fdiv v3.4h, v8.4h, v9.4h
  fdiv v4.4h, v8.4h, v9.4h
  fdiv v5.4h, v8.4h, v9.4h
  fdiv v6.4h, v8.4h, v9.4h
  fdiv v7.4h, v8.4h, v9.4h
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802048004359900000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000087000005110431611800350800001008004480044800448004480044
802048004360000000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000045000005110411611800350800001008004480044800448004480044
802048004359900000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000077000005110411611800350800001008004480044800448004480044
802048004360000000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000076000005110411611800350800001008004480044800448004480044
802048004359900000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000090180005110411611800350800001008004480044800448004480044
802048004359900000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000077000005110411611800350800001008004480044800448004480044
802048004359900000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000087000005110411611800350800001008004480044800448004480044
8020480043599000000645258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000026000005110411611800350800001008004480044800448004480044
80204800436000000007192580100100800001008003050064000014800240800438004369964370001801002008000020016000080043800431180201100991001008000010000250000051614181118049724800001008137181668806318046780578
802048068460811111320123254438010010080056100800605006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000085000005110411611800350800001008004480044800448004480044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d2 | d5 | map dispatch bubble (d6) | da | db | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002480043599054628001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001002005020031600118003680000108004480044800448004480044
8002480043599054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600118003680000108004480044800448004480044
8002480043600054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600118003680000108004480044800448004480044
8002480043599054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600118003680000108004480044800448004480044
80024800435994273258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600128003680000108004480044800448004480044
8002480043600054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020021600128003680000108004480044800448004480044
8002480043600127612580010108000010800005064000000800248004380043699860370023800102080000201600008004380043118002110910108000010064217305020013300118003680000108004480044800448004480044
8002480043599054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600118003680000108004480044800448004480044
8002480043600054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600118003680000108004480044800968004480044
800248004359902178448001010800001080000506400000080024800438004369986337002380040208000020160000800438004311800211091010800001000005020011600118003680000108004480044800448004480044