Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FDIV (vector, 4S)

Test 1: uops

Code:

  fdiv v0.4s, v0.4s, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
1004803760061672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803761061672125100010001000281300801880378037749437895100010002000813180371110011000073116117677100080388038803880388038
1004803760061672125100010001000281300801880378037749477895100010002000803780371110011000073116117677100080388038803880388038
10048037600251672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
10048037602161672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803760061672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803760061672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803760061672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803760682672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038
1004803760061672125100010001000281300801880378037749437895100010002000803780371110011000073116117677100080388038803880388038

Test 2: Latency 1->2

Code:

  fdiv v0.4s, v0.4s, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020480037599000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071001611796770100001008003880038800388003880038
1020480037600000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037600000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037599000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037599000006169669251010010010000100100005002864300800180800378003778362037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037600000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037599000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010001071011611796770100001008003880038800388003880038
1020480037600000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388003880038
1020480037599000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071011611796770100001008003880038800388007280038
1020480037600000006169721251010010010000100100005002864300800180800378003778344037874510100200100002002000080037800371110201100991001001000010000071001611796770100001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 0b | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002480037599006169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640316227967710000108003880038800388003880038
100248003759903396169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037599006169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037599006169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037600063145769721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037600006169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759902256169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
10024800375990546169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037599006169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100000640216127967710000108003880038800388003880038
1002480037599006169721251001010100001010000502864300080018800378003778366378767100102010000202000080037800371110021109101010000100100640216227967710000108003880038800388003880038

Test 3: Latency 1->3

Code:

  fdiv v0.4s, v1.4s, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0a | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a7 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102048003759900000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071031612796770100001008003880038800388003880038
102048003759900000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003760000040008469721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100001071021622796770100001008003880038800388003880038
102048003759900000906169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003760000000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003760000000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100001371021622796770100001008003880038800388003880038
102048003759900000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003760000040006169721251010010010000100100395002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003759900000008269721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038
102048003759900000006169721251010010010000100100005002864300180018800378003778344378745101002001000020020000800378003711102011009910010010000100000071021622796770100001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002480037600084697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880081800388003880038
10024800375990111697212510010101000010100005028643001800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
1002480037599061697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
100248003760062461697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100221091010100001000640216227967710000108003880038800388003880038
1002480037599061697212510010101000010100005028643001800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
1002480037600061697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
1002480037600061697212510010101000010100005028643001800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
1002480037599061697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880085800388003880038
10024800746003968697212510010101000010100005028643000800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038
1002480037599061697212510010101000010100005028643001800188003780037783663787671001020100002020000800378003711100211091010100001000640216227967710000108003880038800388003880038

Test 4: Throughput

Count: 8

Code:

  fdiv v0.4s, v8.4s, v9.4s
  fdiv v1.4s, v8.4s, v9.4s
  fdiv v2.4s, v8.4s, v9.4s
  fdiv v3.4s, v8.4s, v9.4s
  fdiv v4.4s, v8.4s, v9.4s
  fdiv v5.4s, v8.4s, v9.4s
  fdiv v6.4s, v8.4s, v9.4s
  fdiv v7.4s, v8.4s, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire (01) | cycle (02) | 03 | 07 | 09 | 19 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020480044600000055258010010080000100800005006400001800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511021611800360800001008004580045800458004580045
80204800445991000720258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008009980045800458004580045
8020480044599000055258010010080000100800005006400000800650800448004469964037000280100200800002001600008004480044118020110099100100800001000600511011711800360800001008004580045800458004580045
8020480044600000055258010010080000100800005006400001800250800448004469964037000280153200800002001600008004480044118020110099100100800001003000511011711800360800001008004580045800458004580045
802048004460000002200258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001001600511011611800360800001008004580045800458004580099
8020480044599000055258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008004580045800458004580045
8020480044599000055258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008004580045800458004580045
8020480098599000055258010010080000100800005006400000800250800448004470000037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008004580045800458004580045
8020480044600000076258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008004580045800458004580045
8020480044600000055258010010080000100800005006400000800250800448004469964037000280100200800002001600008004480044118020110099100100800001000000511011611800360800001008004580045800458004580045

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | d9 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80024800445990552580010108000010800005064000080025800448004469986370024800102080000201600008004480044118002110910108000010005020131621088003580000108004580045800458004580045
8002480044600055258001010800001080000506400008006880044800446998637002480010208000020160000800448004411800211091010800001000502013160998003580000108004580045800458004580045
800248025960005525800101080000108000050640000800258004480044699863700248001020800002016000080044800441180021109101080000100050201116010118003580000108004580045800458004580045
80024800446000552580010108000010800005064000080025800448004469986370024800102080000201600008004480044118002110910108000010005020101608118003580000108004580045800458004580045
80024800445990552580010108000010800005064000080025800448004469986370024800412080000201600008004480044118002110910108000010205020121608138003580000108009980045800458004580097
80024800446009552580010108000010800005064000080025800448004469986370024800102080000201600008004480044118002110910108000010005020111607128003580000108004580045800458004580045
80024800446009720258001010800001080000506400008002580044800446998627700248001020800002016000080044800441180021109101080000100050207160998003580000108004580045800458004580045
8002480044599396025800101080000108000050640000800258004480044699863700248001020800002016000080044800441180021109101080000100050201116011108003580000108004580045800458004580045
800248004459905525800101080000108000050640000800258004480044699863700248001020800002016000080044800441180021109101080000100050201116011128003580000108004580045800458004580045
800248004459905525800391080000108000050640000800258004480044699863700248001020800002016000080044800441180021109101080000100050201216011118003580000108004580045800458004580045