Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FDIV (vector, 2S)

Test 1: uops

Code:

  fdiv v0.2s, v0.2s, v1.2s
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
100480376008267212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038803880388038
1004803760029067212510001000100028130018018803780377494378951000100020008084803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038808680388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110002073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000111220008037803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494379211000100020008037803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038803880388038
100480376006167212510001000100028130018018803780377494378951000100020008037803711100110000073116117677100080388038803880388038

Test 2: Latency 1->2

Code:

  fdiv v0.2s, v0.2s, v1.2s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020480037599006169721251010010010000100100005002864300180018080037800377835177874010100200100082002001680037800371110201100991001001000010000011171801600798880100001008003880038800388003880038
1020480037600006169721251010010010000100100005002864300080018080037800377835167874110100200100082002001680037800371110201100991001001000010000011171801600796880100001008003880038800388003880038
1020480037600006169721251010010010000100100005002864300080018080037800377835177874110100200100082002001680037800371110201100991001001000010010011171801600796870100001008003880038800388003880038
1020480037600006169721251010010010000100100005002864300080018080037800377835167874110100200100082002001680037800371110201100991001001000010000011171701600796880100001008003880038800388003880038
1020480037599006169721251010010010000100100005002864300080018080037800377835177874010100200100082002001680037800371110201100991001001000010000600071011611796770100001008003880038800388003880038
1020480037600006169721251010010010000100100005002864300180018080037800377834437874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800375990061697212510100100100001001000050028643001800180800378003778344378745101002001000020020000800378003711102011009910010010000100000000710116117967714100001008003880038800388003880038
1020480037599006169721251010010010000100100005002864300180018080037800377834437874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
1020480037600006169721251010010010000124100005002864300180018080037800377834437874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
1020480037599006169721251010010010000100100005002864300180018080037800377834437878010100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002480037599006169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880069
1002480037599006169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
10024800375990061697212510012101000010100005028643001800183800378003778366378767100102010215202000080037800371110021109101010000100101236402162279677010000108003880038800388003880038
1002580037599006169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
1002480037600006169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
1002480037600906169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
100248003759926406169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
1002480037599564061697212510010101000010100005028643001800180800378003778366237876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
100248003759938106169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010006402162279677010000108003880038800388003880038
10024800375999606169721251001010100001010000502864300180018080037800377836637876710010201000020200008003780037111002110910101000010036402162279677010000108003880038800388003880038

Test 3: Latency 1->3

Code:

  fdiv v0.2s, v1.2s, v0.2s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102048003760000000012069721251010010010000100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800375990006006169721251010010010000100100005002864300080018800378003778344037874510155200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800375990000006169721251010010010000100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
102048003759900000061697212510100100100001001000050028643000800188003780037783440237874510100200100002002000080037801361110201100991001001000010000200071013311796770100001008003880038800388003880038
1020480037599131210037469721251010010010003100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000100071011611796770100001008003880038800388003880038
1020480037599040279006169721251010010010000100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000130071011611796770100001008003880038800388003880038
1020480037599000120010369721251010010010000100100005002864300080018800378003778344037874510100200100002042000080415800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800376000000006169721251010010010000100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800375990000006169721251010010010000100100005002864300080018800378003778397037874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038
10204800375990000006169721251010010010000100100005002864300080018800378003778344037874510100200100002002000080037800371110201100991001001000010000000071011611796770100001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100248003759906169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003760006169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003760006169721251001010100001010000502864300180018800378003778366378767100102010000202012080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759906169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759906169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
1002480037599636169708251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759906169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759906169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003760006169721251001010100001010000502864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038
100248003759906169721251001010100001010000552864300180018800378003778366378767100102010000202000080037800371110021109101010000100000640216227967710000108003880038800388003880038

Test 4: throughput

Count: 8

Code:

  fdiv v0.2s, v8.2s, v9.2s
  fdiv v1.2s, v8.2s, v9.2s
  fdiv v2.2s, v8.2s, v9.2s
  fdiv v3.2s, v8.2s, v9.2s
  fdiv v4.2s, v8.2s, v9.2s
  fdiv v5.2s, v8.2s, v9.2s
  fdiv v6.2s, v8.2s, v9.2s
  fdiv v7.2s, v8.2s, v9.2s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802048004560000552580100100800001008000050064000018002508004480044699640370002801002008000020016000080044800441180201100991001008000010000051102161180036800001008004580045800458004580045
802048004459900552580100100800001008000050064000008002508004480044699890370002801002008005620016000080044800441180201100991001008000010010051101161180036800001008004580045800458004580045
802048004460000552580100100800001008000050064000008002508004480044699640370002801002008000020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
802048004459900552580100100800001008000050064000008002508004480044699640370002801002008000020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
802048004459900552580100100800001008000050064000008002508004480044699640370002801002008004820016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
8020480044599007202580100100800001008000050064000008002508004480044699640370002801002008000020016000080044800441180201100991001008000010000151101161180036800001008004580045800458004580045
8020480044599002452580100100800001008000050064000018002508004480097699640370002801002008000020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
802048004459910552580100100800001008000050064000008019708004480044699640370002801002008004020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
802048004459900552580100100800001008000050064000008002508004480044699640370002801002008000020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045
802048004460000552580100100800001008000050064000008002508004480044699640370002801002008000020016000080044800441180201100991001008000010000051101161180036800001008004580045800458004580045

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800248004460001055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001000005020031662380035080000108004580045800458004580045
8002480044600000552580010108000010800005064000018002508004480044699860370024800102080000201600008004480044118002110910108000010570005020021642380035080000108004580045800458004580045
800248004459900055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001013005020021662380035080000108004580045800458004580045
800248004459900055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001000005020031663380035080000108004580045800458004580045
8002480044600000762580010108000010800005064000018002508004480044699860370024800102080000201600008004480044118002110910108000010000050200316433800352080000108004580045800458004580045
800248004460000055258001010800001080000506400001800250800448004469986037002480010208000020160000800448009711800211091010800001000005020031643380035080000108004580045800458004580045
800248004459900055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001000005020031643380035080000108004580045800458004580045
800248004460000055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001000005020021643380035080000108004580045800458004580045
800248004460000055258001010800001080000506400001800250800448004469986037002480010208000020160000800448004411800211091010800001000005020031663380035080000108004580045800458004580045
800248009759900060258001010800001080000506400001800250800448004469986337002480010208000020160000800448004411800211091010800001000005020031643380035080000108004580045800458004580045