Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FDIV (vector, 4H)

Test 1: uops

Code:

  fdiv v0.4h, v0.4h, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a6 | a7 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10047037530615757251000100010002459571701870377037654336895100010002000703770371110011000000373116116726100070387038703870387038
10047037520615757251000100010002459571701870377037654336895100010002000703770371110011000000073116116726100070387038703870387038
10047037530615757251000100010002459571701870377037654336895100010002000703770371110011000001073116116726100070387038703870387038
100470375306157572510001000100024595717018703770376543368951000100020007037703711100110000001273116116726100070387038703870387038
10047037530615757251000100010002459571701870377037654336895100010002000703770371110011000001073116116726100070387038703870387038
100470375215615757251000100010002459571701870377037654336895100010002000703770371110011000000073116116726100070387038703870387038
10047037530615757251000100010002459571701870377037654336895100010002000703770371110011000000073116116726100070387038703870387038
10047037530615757251000100010002459571701870377037654336895100010002000703770371110011000000073116116726100070387038703870387038
10047037520615757251000100010002459571701870377037654336895100010002000703770371110011000200373116116726100070387038703870387038
10047037520615757251000100010002459571701870377037654336895100010002000703770371110011000010073116116726100070387038703870387038

Test 2: Latency 1->2

Code:

  fdiv v0.4h, v0.4h, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | a9 | ac | c5 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102047003752400401061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000001071011611697250100001007003870038700387003870038
1020470037524000000615975725101001001000010010000500250495717001870037700376839303687451010020010000200200007003770037111020110099100100100001000000000710116116972514100001007003870038700387003870038
102047003752500000061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071011611697250100001007003870038700387003870038
102047003752400000061597572510100100100001001000050025049571701267003770037683930368745101462001000020020000700377003711102011009910010010000100000000071011610697250100001007003870038700387003870038
1020470037524010300286597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071011611697250100001007003870038700387003870038
102047003752500000061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071011601697250100001007003870038700387003870038
102047003752500000061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071021613697250100001007003870038700387003870038
102047003752500000061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071011611697250100001007003870038700387003870038
102047003752400000061597572510100100100001001000050025049571700187003770037683930368745101002001000020020000700377003711102011009910010010000100000000071011611697250100001007003870038700387003870038
1020470037524000000536597572510100100100001001000050025049571700187003770037683933368745101002001000020020000700377003711102011009910010010000100000000071011611697250100001007003870038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | a9 | ac | c2 | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100247003752400000615975743100101010000101000050250495717001870037700376841536876710010201000020200007003770037511002110910101000010000000006402162269726010000107003870038700387013270038
100247003752500000615975725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
10024700375250003907265975725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
100247003752500000615975725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
100247003752400000615975725100271010000101000050250495707005470037700376841536876710010201022220200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
1002470037525000007265975725100101010000131000050250495707016270037700376841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
1002470180524000001245975725100101010000101000050250495707001870037702276841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870228
100247003752400000615975725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010400000006402162269726010000107003870038700387003870038
100247003752500066264615975725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010000000006402162269726010000107003870038700387003870038
100247003752500000615971725100101010000101000050250495707001870037700376841536876710010201000020200007003770037111002110910101000010000090006402162269726010000107003870038700387003870038

Test 3: Latency 1->3

Code:

  fdiv v0.4h, v1.4h, v0.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 0a | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204700375240000615975725101001001000010010000500250495707001870037700376839336874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038
102047003752400210615975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038
10204700375250000615975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038
102047003752500001345975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010022200007102162269726100001007003870038700387003870038
10204700375241000615975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000000017102162269725100001007003870038700387003870038
102047003752502121087905974725101001001000010010000500250495717001870037700376844436874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038
1020470037525001201035975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000008779007102162269725100001007003870038700387003870038
10204700375240000615975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038
102047003752500006159757251010010010000100100005002504957170018700377003768393368745101002001000020020000700377003711102011009910010010000100000021007102162269725100001007003870038700387003870038
10204700375250090615975725101001001000010010000500250495717001870037700376839336874510100200100002002000070037700371110201100991001001000010000000007102162269725100001007003870038700387003870038

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0037

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024700375240006159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001077000640216246972610000107003870038700387003870179
1002470037525010118559757301001010100001010000502504957700187003770037684156687671001020100572020000700377013311100211091010100001016224900640216226972610000107003870038700387003870038
1002470037525000615975725100101010000121000050250495770018700377003768415368767100102010000202000070037700371110021109101010000103719200640216226972610000107003870038700387003870038
1002470084525004510359757251001010100001010000502505258700537003770037684153687671001020100002020000700377003711100211091010100001037000640216226972610000107003870038700387003870038
100247003752400246159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001001200640216226972610000107003870038700387003870038
1002470037525000615975725100101010000101000050250495770018700377003768415368767100102010000202000070037700372110021109101010000107000640216226972610000107003870038700387003870038
10024700375240006159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001079000640216226972610000107003870038700387003870038
1002470037524000615975725100101010000101000050250495770018700377003768415368767100102010000202000070037700371110021109101010000104000640216226972610000107003870038700387003870038
1002470037524000615975725100101010000101000050250495770018700377003768415368767100102010000202000070037700371110021109101010000100300640216226972610000107003870038700387003870038
10024700375250006159757251001010100001010000502504957700187003770037684153687671001020100002020000700377003711100211091010100001080000640216226972610000107003870038700387003870038

Test 4: throughput

Count: 8

Code:

  fdiv v0.4h, v8.4h, v9.4h
  fdiv v1.4h, v8.4h, v9.4h
  fdiv v2.4h, v8.4h, v9.4h
  fdiv v3.4h, v8.4h, v9.4h
  fdiv v4.4h, v8.4h, v9.4h
  fdiv v5.4h, v8.4h, v9.4h
  fdiv v6.4h, v8.4h, v9.4h
  fdiv v7.4h, v8.4h, v9.4h
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | a9 | ac | c2 | c5 | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802048004359900000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000087000005110431611800350800001008004480044800448004480044
802048004360000000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000045000005110411611800350800001008004480044800448004480044
802048004359900000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000077000005110411611800350800001008004480044800448004480044
802048004360000000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000076000005110411611800350800001008004480044800448004480044
802048004359900000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000090180005110411611800350800001008004480044800448004480044
802048004359900000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000077000005110411611800350800001008004480044800448004480044
802048004359900000054258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000087000005110411611800350800001008004480044800448004480044
8020480043599000000645258010010080000100800005006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000026000005110411611800350800001008004480044800448004480044
80204800436000000007192580100100800001008003050064000014800240800438004369964370001801002008000020016000080043800431180201100991001008000010000250000051614181118049724800001008137181668806318046780578
802048068460811111320123254438010010080056100800605006400001480024080043800436996437000180100200800002001600008004380043118020110099100100800001000085000005110411611800350800001008004480044800448004480044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | cf | d2 | d5 | d6 | da | db | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002480043599054628001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001002005020031600118003680000108004480044800448004480044
8002480043599054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600118003680000108004480044800448004480044
8002480043600054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600118003680000108004480044800448004480044
8002480043599054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600118003680000108004480044800448004480044
80024800435994273258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600128003680000108004480044800448004480044
8002480043600054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020021600128003680000108004480044800448004480044
8002480043600127612580010108000010800005064000000800248004380043699860370023800102080000201600008004380043118002110910108000010064217305020013300118003680000108004480044800448004480044
8002480043599054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600118003680000108004480044800448004480044
8002480043600054258001010800001080000506400000080024800438004369986037002380010208000020160000800438004311800211091010800001000005020011600118003680000108004480044800968004480044
800248004359902178448001010800001080000506400000080024800438004369986337002380040208000020160000800438004311800211091010800001000005020011600118003680000108004480044800448004480044