Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FDIV (vector, 2S)

Test 1: uops

Code:

  fdiv v0.2s, v0.2s, v1.2s
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
100480376008267212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038803880388038
1004803760029067212510001000100028130018018803780377494378951000100020008084803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038808680388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110002073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000111220008037803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494379211000100020008037803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038803880388038

Test 2: Latency 1->2

Code:

  fdiv v0.2s, v0.2s, v1.2s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 0b | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020480037599006169721251010010010000100100005002864300180018080037800377835177874010100200100082002001680037800371110201100991001001000010000011171801600798880100001008003880038800388003880038
1020480037600006169721251010010010000100100005002864300080018080037800377835167874110100200100082002001680037800371110201100991001001000010000011171801600796880100001008003880038800388003880038
1020480037600006169721251010010010000100100005002864300080018080037800377835177874110100200100082002001680037800371110201100991001001000010010011171801600796870100001008003880038800388003880038
1020480037600006169721251010010010000100100005002864300080018080037800377835167874110100200100082002001680037800371110201100991001001000010000011171701600796880100001008003880038800388003880038
1020480037599006169721251010010010000100100005002864300080018080037800377835177874010100200100082002001680037800371110201100991001001000010000600071011611796770100001008003880038800388003880038
1020480037600006169721251010010010000100100005002864300180018080037800377834437874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800375990061697212510100100100001001000050028643001800180800378003778344378745101002001000020020000800378003711102011009910010010000100000000710116117967714100001008003880038800388003880038
1020480037599006169721251010010010000100100005002864300180018080037800377834437874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
1020480037600006169721251010010010000124100005002864300180018080037800377834437874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
1020480037599006169721251010010010000100100005002864300180018080037800377834437878010100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002480037599006169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880069
1002480037599006169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
10024800375990061697212510012101000010100005028643001800183800378003778366378767100102010215202000080037800371110021109101010000100101236402162279677010000108003880038800388003880038
1002580037599006169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
1002480037600006169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
1002480037600906169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
100248003759926406169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
1002480037599564061697212510010101000010100005028643001800180800378003778366237876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
100248003759938106169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
10024800375999606169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010036402162279677010000108003880038800388003880038

Test 3: Latency 1->3

Code:

  fdiv v0.2s, v1.2s, v0.2s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102048003760000000012069721251010010010000100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800375990006006169721251010010010000100100005002864300080018800378003778344037874510155200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800375990000006169721251010010010000100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
102048003759900000061697212510100100100001001000050028643000800188003780037783440237874510100200100002002000080037801361110201100991001001000010000200071013311796770100001008003880038800388003880038
1020480037599131210037469721251010010010003100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000100071011611796770100001008003880038800388003880038
1020480037599040279006169721251010010010000100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000130071011611796770100001008003880038800388003880038
1020480037599000120010369721251010010010000100100005002864300080018800378003778344037874510100200100002042000080415800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800376000000006169721251010010010000100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800375990000006169721251010010010000100100005002864300080018800378003778397037874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800375990000006169721251010010010000100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100248003759906169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003760006169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003760006169721251001010100001010000502864300180018800378003778366378767100102010000202012080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759906169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759906169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037599636169708251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759906169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759906169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003760006169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759906169721251001010100001010000552864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038

Test 4: throughput

Count: 8

Code:

  fdiv v0.2s, v8.2s, v9.2s
  fdiv v1.2s, v8.2s, v9.2s
  fdiv v2.2s, v8.2s, v9.2s
  fdiv v3.2s, v8.2s, v9.2s
  fdiv v4.2s, v8.2s, v9.2s
  fdiv v5.2s, v8.2s, v9.2s
  fdiv v6.2s, v8.2s, v9.2s
  fdiv v7.2s, v8.2s, v9.2s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire (01) | cycle (02) | 03 | 08 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802048004560000552580100100800001008000050064000018002508004480044699640370002801002008000020016000080044800441180201100991001008000010000051102161180036800001008004580045800458004580045
802048004459900552580100100800001008000050064000008002508004480044699890370002801002008005620016000080044800441180201100991001008000010010051101161180036800001008004580045800458004580045
802048004460000552580100100800001008000050064000008002508004480044699640370002801002008000020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
802048004459900552580100100800001008000050064000008002508004480044699640370002801002008000020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
802048004459900552580100100800001008000050064000008002508004480044699640370002801002008004820016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
8020480044599007202580100100800001008000050064000008002508004480044699640370002801002008000020016000080044800441180201100991001008000010000151101161180036800001008004580045800458004580045
8020480044599002452580100100800001008000050064000018002508004480097699640370002801002008000020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
802048004459910552580100100800001008000050064000008019708004480044699640370002801002008004020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
802048004459900552580100100800001008000050064000008002508004480044699640370002801002008000020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
802048004460000552580100100800001008000050064000008002508004480044699640370002801002008000020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire (01) | cycle (02) | 03 | 09 | 0b | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | d9 | dd | inst fetch restart (de) | e0 | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800248004460001055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001000005020031662380035080000108004580045800458004580045
8002480044600000552580010108000010800005064000018002508004480044699860370024800102080000201600008004480044118002110910108000010570005020021642380035080000108004580045800458004580045
800248004459900055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001013005020021662380035080000108004580045800458004580045
800248004459900055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001000005020031663380035080000108004580045800458004580045
8002480044600000762580010108000010800005064000018002508004480044699860370024800102080000201600008004480044118002110910108000010000050200316433800352080000108004580045800458004580045
800248004460000055258001010800001080000506400001800250800448004469986037002480010208000020160000800448009711800211091010800001000005020031643380035080000108004580045800458004580045
800248004459900055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001000005020031643380035080000108004580045800458004580045
800248004460000055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001000005020021643380035080000108004580045800458004580045
800248004460000055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001000005020031663380035080000108004580045800458004580045
800248009759900060258001010800001080000506400001800250800448004469986337002480010208000020160000800448004411800211091010800001000005020031643380035080000108004580045800458004580045