Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FCSEL (scalar, D)

Test 1: Latency 1->2

Code:

  fcsel d0, d0, d1, lt
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150000611982225101001001000010010000500142388002001820037200371856620187451010020010000200300002003720037111020110099100100100001010000007101161119907100001002003820038201332003820038
102042003715000061198222510100100100001001000050014238800200182003720037185773187451010020010000200302852003720037111020110099100100100001010041007101161119907100001002003820038200382003820038
102042003715000061198222510119100100001121008050014238800200182003720133185663187451010020210000202300002003720131111020110099100100100001010000007101161119907100001002003820038200382003820038
102042003715000082198222510100100100001001000050014238800200182003720037185663187451010020010000200300002003720037111020110099100100100001010000007101161119907100001002003820038200382003820038
102042003715000061198222510100100100001001000050014238800200182003720037185663187451010020010000200300002003720037111020110099100100100001010000007101161119907100001002003820038200382003820038
102042003715000061198222510100100100001001000050014238800200182003720037185663187451010020010000200300002003720037111020110099100100100001010000131307101161119907100001002008620085200382003820038
102042003715010061198222510100100100001001000050014238800200182003720037185663187451010020010000200300002003720037111020110099100100100001010000307101161119907100001002003820038200382003820038
1020420037150001261198222510100100100001001000050014238801200182003720037185663187451010020010000200300002003720037111020110099100100100001010000007101161119907100001002003820038200382003820038
102042003715000061198222510100100100001001000050014238800200182003720037185663187451010020010000200300002003720037111020110099100100100001010000007101161119907100001002003820038200382003820038
102042017915000082198222510100100100001001000050014238800200182003720037185663187451010020010000200300002003720037111020110099100100100001010000007101161119907100001002007120038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002420037150061198222510010101000010100005014238800200182003720037185883187671001020100002030000200372003711100211091010100001001006402162219907010000102003820038200382003820038
1002420037150061198222510010101000010100005014238800200182003720037185883187671001020100002030000200372003711100211091010100001001006402162219907010000102003820038200382003820038
1002420037150061198222510010101000010100005014238800200182003720037185883187671001020100002030000200372003711100211091010100001001006402162219907010000102003820038200382003820038
1002420037150061198222510010101000010100005014238800200182003720037185883187671001020100002030000200372003711100211091010100001001006402162219907010000102003820038200382003820038
1002420037150061198222510010101000010100005014238801200182003720037185883187671001020100002030000200372003711100211091010100001001006402162219907010000102003820038200382003820038
1002420037150061198222510010101000010100005014238800200182003720037185883187671001020100002030000200372003711100211091010100001001006402162219907010000102003820038200382003820038
10024200371500147198222510010101000010100005014238800200182003720037185883187671001020100002030000200372003711100211091010100001001006402162219907010000102003820038200382003820038
1002420037150061198222510010101000010100005014238800200182003720037185883187671001020100002030000200372003711100211091010100001001006402162219907010000102003820038200382003820070
1002420037150061198222510010101000010100005014238800200182003720037185883187671001020100002030000200372003711100211091010100001001006422162219907010000102003820038200382003820038
100242003715006119822251001010100001010000501423880020018200372003718588318767100102010000203000020037200371110021109101010000100102226402162219907010000102003820038200382003820038

Test 2: uops

Code:

  fcsel d0, d1, d0, lt
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | 9f | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
100420371506118222510071000100013688012018203720371716318951000100030002037203711100110001000000063073216221907100020382038203820382038
10042037150611822251000100010001368801201820372037171631895100010003000203720371110011000100000000073216221907100020382038203820382038
10042037150611822251000101210001368801201820372037171631895100010003000203720371110011000100000000073516221907100020382038203820382038
1004203715132821822251012100010801368801201820842037171631895100010003000203720372110011000100001131008273216221907100020382038203820382038
10042037150611822251000100010001368801201820372037171631895100010003000203720371110011000100000000073216221907100020382038203820382038
10042037160611822251000100010001368801201820372037171631895100010003000203720371110011000100000000073216221907100020382038203820382038
10042037150611822251000100010001368800201820372037171631895100010003000203720371110011000100000000073216221961100020382038203820382038
10042037160611822251000100010001368801201820372037171631895100010003000203720371110011000100000000073216221907100020382038203820382038
10042037160611822251000100010001368801201820372037171631895100010003000203720371110011000100000000073216221907100020382038203820382038
10042037150611822251000100010001368801201820372037171631895100010003000203720371110011000100000000073216221907100020382038203820382038

Test 3: Latency 1->3

Code:

  fcsel d0, d1, d0, lt
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | st unit uop (a7) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715000003910319822251010010010000100100005001423880200182003720037185663187451010020010000200300002003720037111020110099100100100001010000000071011611199070100001002003820038200382003820038
1020420037150000006119822251010010010000100100005001423880200182003720037185663187451010020010000200300002003720037111020110099100100100001010000000071011611199070100001002003820038201802003820038
102042003714900000106619822251010010010000100100005001423880200182003720037185663187451010020010000200311432003720037111020110099100100100001010000000071011611199070100001002003820038200382003820038
1020420037150000106119822251010010010000100100005001423880200182003720037185663187451010020010000200300002003720037111020110099100100100001010000000071011611199070100001002003820038200382003820038
10204200371500000044119822251010010010000100100005001423880200182003720037185663187451010020010000202300002003720037111020110099100100100001010000000071011611199070100001002003820038200382003820038
1020420037150000006119786251010010010000100100005001423880200182003720037185663187451010020210000200300002003720037111020110099100100100001010000000071011613199070100001002003820038200382003820038
1020420037150000006119822251010010010012100100005001423880200182003720037185663187451010020010000200300002003720037111020110099100100100001010000013071011611199070100001002003820038200382003820038
1020420037150000006119822251010010010000125100005001423880200182003720037185663187451010020010000200300002003720037111020110099100100100001010000006071011611199070100001002003820038200382003820038
10204200371500000186119822251010010010000100100006161423880200182003720037185663187451010020010000200300002003720037111020110099100100100001010000000071011611199070100001002003820038200382003820038
1020420037150000006119822251010012210000100100005001423880200182003720037185663187451010020010000200300002003720037111020110099100100100001010000000071021611199070100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100242003715000006119822251001010100001010000501423880200182003720037185883187671001020100002030000200372003711100211091010100001001000000640416331990710000102003820038201332008620038
100242003715000006119822251001010100001010000501423880200182003720037185883187671001020100002030000200372003711100211091010100001001000000640316331990710000102003820038200382003820038
100242003715000006119822251001010100001010000501423880200182003720037185883187671001020100002030000200372003711100211091010100001001000000640316331990710000102003820038200382003820038
1002420037150000034619822251001010100001010000501423880200182003720083185883187671001020100002030000200372003711100211091010100001001000000640316331990710000102003820038200382003820085
100242003714900006119822251001010100001010000501423880200182003720037185883187671001020100002030000200372003711100211091010100001001000000640316331990710000102003820038200382003820038
100242003715000006119822251001010100001010000501423880200182003720037185883187671001020100002030000200372003711100211091010100001001000000640316331990710000102003820038200382003820038
100242003715000006119822251001010100001010000501423880200182003720037185883187671001020100002030000200372003711100211091010100001001000000640316331990710000102003820038200382003820038
100242003715000006119822251001010100001010000501423880200182003720037185883187671001020100002030000200372003711100211091010100001001000000640316331990710000102003820038200382003820038
100242003715000006119822251001010100001010000501423880200182003720037185883187671001020100002030000200372003711100211091010100001001000000640316331999210000102003820038200382003820038
100242003715000006119822251001010100001010000501423880200182003720037185883187671001020100002030000200372003711100211091010100001001000100640316331990710000102003820038200382003820038

Test 4: Latency 1->4

Chain cycles: 2

Code:

  fcsel d0, d1, d2, lt
  fcmp d0, d3
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | inst simd alu (9a) | 9f | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202044003729906139822252020020020000200200001100285388004001840037400373732473749020200200200082005002040037400371120201100991002000010000011113170160039918100100001004003840038400384003840038
202044003730006139822252020020020000200200001100285388004001840037400373732463749120200200200082005002040037400371120201100991002000010000011113180160039918100100001004003840038400384003840038
2020440037299972639822252020020020000200200001100285388004001840037400373732463749120200200200082005002040037400371120201100991002000010000011113180160039919100100001004003840038400384003840038
20204400373001326139822252020020020000200200001100285388014001840037400373732463749120200200200082005002040037400371120201100991002000010000011113170160039918100100001004003840038400384003840038
202044003729906139822252020020020000200200001100285388004001840037400373732473749120200200200002005000040037400371120201100991002000010000000013101161139909100100001004003840038400384003840038
202044003730006139822252020020020000200200001100285388004001840037400373731633749520200200200002005000040037400371120201100991002000010000000013101161139909100100001004003840038400384003840038
2020440037300081398222520200200200002002000011002853880040018400374003737316123749520200200200002005000040037400371120201100991002000010000000013101161139909100100001004003840038400384003840038
202044003730006139822252020020020000200200001100285388004001840037400373731633749520200200200002005000040037400371120201100991002000010000000013101161139909100100001004003840038400384003840038
2020440037300053639822252020020020000200200001100285388014001840037400373731633749520200200200002005000040037400371120201100991002000010000000013101161139909100100001004003840038400384003840038
202044003729906139822252020020020000200200001100285388004001840037400373731633749520200200200002005000040037400371120201100991002000010000000013101161139909100100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0037

retire uop (01) | cycle (02) | 03 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20024400373000000613982225200202020000202000011028538801400184003740037373383375172002020200002050000400374003711200211091020000100000156127031643399091010000104003840038400384003840038
20024400373000000613982225200202020000202000011028538801400184003740037373383375172002020200002050000400374003711200211091020000100000105127031633399091010000104003840038400384003840038
2002440037300090082398224420020202000020200001102853880140018400374003737348337517200202020000205000040037400371120021109102000010000021127021633399091010000104003840038400384003840038
20024400373000000613982225200202020000202000011028538801400184003740037373383375172002020200002050000400374003711200211091020000100000126127031632399091010000104003840038400384003840038
20024400373000000613982225200202020000202000011028543681400184003740037373383375172002020200002050000400374003711200211091020000100000150127031643399091010000104003840038400384003840038
20024400372990000613982225200202020000202000011028538801400184003740037373383375172002020200002050000400374003711200211091020000100000144127031634399091010000104003840038400384003840038
20024400372990000613982225200202020000202000011028538801400184003740037373383375172002020200002050000400374003711200211091020000100000144127031634399091010000104003840038400384003840069
20024400372990000613982225200202020000202000011028538801400184003740085373383375172002020200002050000400374003711200211091020000100000156127031634399091010000104003840038400384003840038
20024400373000000613982225200202020000202000011028538801400184003740037373383375172002020200002050000400374003711200211091020000100000114127031633399091010000104003840038400384003840038
2002440037299000061398222520020202000020200001102853880140018400374003737338337517200202020000205000040037400371120021109102000010000015127031633399091010000104003840038400384003840038

Test 5: throughput

Count: 8

Code:

  fcsel d0, d8, d9, lt
  fcsel d1, d8, d9, lt
  fcsel d2, d8, d9, lt
  fcsel d3, d8, d9, lt
  fcsel d4, d8, d9, lt
  fcsel d5, d8, d9, lt
  fcsel d6, d8, d9, lt
  fcsel d7, d8, d9, lt
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204400513000432580100100800001008000050064000040019400384003829970329996801002008000020024000040038400381180201100991001008000080100000511021611400350800001004003940039400394003940039
80204400383000432580100100800001008000050064000040019400384003829970329996801002008000020024000040038400381180201100991001008000080100000511011611400350800001004003940039400394003940039
80204400383000432580100100800001008000050064000040019400384003829970329996801002008000020024000040038400381180201100991001008000080100000511011611400350800001004003940039400394003940039
80204400383000432580100100800001008000050064000040019400384003829970329996801002008000020024000040038400381180201100991001008000080100400511011611400350800001004003940039400394003940039
80204400383000432580100100800001008000050064000040019400384003829970329996801002008000020024000040038400381180201100991001008000080100000511011611400350800001004003940039400394003940039
80204400383000432580100100800001008000050064000040019400384003829970329996801002008000020024000040038400381180201100991001008000080100000511011611400350800001004003940039400394003940039
802044003830007082580100100800001008000050064000040019400384003829970329996801002008000020024000040038400381180201100991001008000080100000511011611400350800001004003940039400394003940039
80204400382990432580100100800001008000050064000040019400384003829970330014801002008000020024000040038400381180201100991001008000080100000511021611400350800001004003940039400394003940039
802044003830004325801001008000010080000500640000400194003840038299701529990801002008000020024000040038400381180201100991001008000080100500511011611400350800001004003940039400394003940039
80204400383000432580100100800001008000050064000040019400384003829970329996801002008000020024000040038400381180201100991001008000080100000511011611400350800001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 09 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024400442990000708258001010800001080000506400004001904003840038299923300188001020800002024000040038400381180021109101080000800102005021191617144003580000104003940088400884003940039
800244003829911010489258001010800001080000506400004001904003840038299923300188001020800002024000040038400381180021109101080000800100005021116161794003580000104003940039400394003940039
80024400383001100899880057108000010800005064000040019040038400382999233001880010208000020240000400384003811800211091010800008001003050211171613174003580000104003940039400394003940039
80024400383001100892580010108000010800005064000040019040038400382999233001880010208000020240699400384003811800211091010800008001000050211161618114003580000104003940039400394003940039
80024400383001100892580010108004810800005064000040019040038400382999233001880010208000020240000400384003811800211091010800008001000050211171616174003580000104003940039400394003940039
800244003830011003762580010108000010800005064000040019040038400382999233001880010208000020240000400384003811800211091010800008001040050211121619164003580000104003940039400394003940039
80024400383001100892580010108000010800005064000040019040038400382999233001880010208000020240000400384003811800211091010800008001000050211171614184003580000104003940039400394003940039
80024400383001100892580010138000010800505064000040019040038400382999233001880010208000020240000400384003811800211091010800008001010050211171614174003580000104003940039400394003940039
80024400383001100892580010108000010800005064000040019040038400382999233001880010208000020240000400384003811800211091010800008001000050211171617174003580000104003940039400394003940039
8002440038300110089258001010800001080000506400004001904003840038299923300188001020800002024000040038400381180021109101080000800100005021191617144003580000104003940039400394003940039