Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FMLA (by element, 4S)

Test 1: uops

Code:

  fmla v0.4s, v1.4s, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)03181e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
1004403730006134072510001000100053190814018403740373258338951000100030004037403711100110002073116113473100040384038403840384038
1004403730006134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730036134072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730106134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730008434072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730006134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730006134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730006134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730009034072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730006134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038

Test 2: Latency 1->1

Code:

  fmla v0.4s, v1.4s, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)18191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696b6d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)c9cdcfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
10205400373000000000613940725101001001000010410444500570690840123040037400373810803387451010020010000200300004003740037411020110099100100100001000000030000710024123394790100001004003840084401464003840038
102044003730901000006139407251010010010000110100005165708304400180400374003738108025387451010020010164212300004003740084111020110099100100100001000000000000741021622394790100001004003840038400384003840038
1020440037312010000010373940725101001001000010010000500570690840088340037400373811503387451010020010000200304984018340084111020110099100100100001000000000000734021622394797100001004003840038400384003840075
10204401803120100000613940725101001031001810010000500570690840018040037400373810803387451010020210174200300004003740037111020110099100100100001004000000000740021622394790100001004003840038400864003840038
10204400373000001000613940725101001041001810010000546570690840018040037400373810807388001010020010000204300004003740037211020110099100100100001000000000000710121622394790100001004003840038400384003840038
1020440037300010000010653940786101001001000010410296500570690840018040037400373810803387631010020010000200300004003740037211020110099100100100001000021000001710124022394790100001004003840038400384003840038
1020440084304010400029163940783101001001000010910000500571109640018040037400373810803387451010020010000200300004003740037111020110099100100100001000000060000740131722394790100001004003840038400384003840038
10204402283110000000613940725101001001000010010000522570690840018040178400843810803387451010020010160200300004003740037111020110099100100100001000400000000710121622396210100001004003840038400384008640038
10204400373001000156008143940725101001131000010010000500570830440018040084400373810803387451025020010000200300004003740037111020210099100100100001000007004000710121623394790100001004003840038400384003840086
10204400373000000396016139407251010011010000100100005005711096400180400374003738118016387451010020010499200300004008440179111020110099100100100001000022032000710131623394793100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1002440084300000113007263940725100101010000101000050570690814001804003740037381300838767100102010000203000040037400371110021109101010000100000000006403163239473010000104003840038400384003840038
10024401773170000005370613938925100101010000101000050570690814001834003740037381300738767100102210000203000040037400371110021109101010000100001507203006402163339473010000104003840085400384003840229
1002440037300100000210613940725100101010018101014861570690814001834003740037381300338767100102010000223000040037400371110021109101010000100000000006403163639473010000104003840038400384003840038
100244018031300003040502963940725100101010018101000050570690814001804003740037381300338767104542010000203000040037400854110021109101010000100000000006645163439473010000104003840038400384003840180
10024400373120000005790613940725100101010000121000050570690804001804003740037381300338767100102010000203000040037400374110021109101010000100000200006404163339473010000104003840038400384003840038
10024400373141000000014053940769100101210000101000050570830404001804003740037381300338767100102010000203000040037400854110021109101010000100000007468006403163439473010000104008540181400854003840038
10024400372991000005400613940725100101010018101000050570690814001804018040037381300338841100102010000203000040037400371110021109101010000100000103006633165239473010000104008640133400854003840038
100244003730010000090613940725100161110000101000050570830404001804003740037381300338767100102010000203145240084400371110022109101010000100000000006403163439473010000104003840038400384003840038
100254003730000000000613940725100101010000101000050570830404001804003740037381300338767100102010000203000040037401321110021109101010000100000103540006692163339473010000104003840038400384003840038
10024401803171001010094394072510010101000010101475057069080400180400374003738130033876710010201000020300004003740179111002110910101000010000000126006403163239473010000104003840038400384003840083

Test 3: Latency 1->2

Code:

  fmla v0.4s, v0.4s, v1.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)0309l2 tlb miss data (0b)18191e3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)st unit uop (a7)l1d cache writeback (a8)acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
102044003729900000072639407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101161139479100001004003840038400384003840038
10204400373000000006139407451010010010000100100005615706908040018400374003738108338745101002061000020030498400374003711102011009910010010000100200047101161139479100001004003840038400384013340038
1020440037315100027606139396251010011410000100102965005706908040053400374003738108338745101002001000020030984400374003711102011009910010010000100004363537101162139479100001004003840133400384003840038
1020440132311000021906139407251010010010000100100005005708304040018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101161139479100001004003840038400384003840038
102044003731200000020139407251010010010000100100005005706908040018400374003738108338745120162041000020030999400374003711102011009910010010000100001007101161139479100001004003840038400384003840038
1020440037310003000201039407251010010010000121101485005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100001307101161139479100001004013340134400384003840038
1020440037311000013206139407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100001007101161139479100001004003840038400384003840038
1020440037305000212017639407251010010510000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101161139479100001004003840038400384003840038
10204400373000000016139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400844003711102011009910010010000100000027101161139479100001004003840038400384003840134
1020440037300000000103394072510100100100001001000050057069080400184003740037381083387451010020510000200300004003740037111020110099100100100001002101082007101161139479100001004003840038400384003840085

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03l2 tlb miss data (0b)18191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
100244003730000000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640416433947310000104003840038400384003840038
100244003730000000613940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000640316343947310000104003840038400384003840038
100244003730000000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640316343947310000104003840038400384003840038
100244003729900000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640316343947310000104003840038400384003840038
100244003730000000613940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000640416433947310000104003840038400384003840038
100244003729900000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640416443947310000104003840038400384003840038
100244003730000000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640416343947510000104003840038400384003840038
100244003730000000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640316343947310000104003840038400384003840038
1002440037300000009823940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640416343947310000104003840038400384003840038
100244003730000000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640416343947310000104003840038400384003840038

Test 4: Latency 1->3

Code:

  fmla v0.4s, v1.4s, v0.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)1e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d cache writeback (a8)acbranch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204400373000006139389251010010010000100101475005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100000071011611394790100001004003840038400384003840038
1020440037300000132539407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374008411102011009910010010000100000071011611394790100001004003840038400384003840038
10204400373000066139407251010010010000100100005005706908140018400374003738108338745101002001017920030000400374003711102011009910010010000100000071011611394790100001004003840038400384003840038
102044003729900061394072510100100100001001000050057069081400184003740037381083387451010020010000200300004003740037111020110099100100100001000000710116113947911100001004003840038400384003840038
102044003730000074739407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100000071011611394790100001004003840038400384003840038
102044003729900017239407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100000071011611394790100001004003840038400384003840038
10204400373000006139407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100000071011611394790100001004003840038400384003840038
10204400372990006139407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100000071011611394799100001004003840038400384003840038
10204400373000006139407251010010410000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100000071011611394790100001004003840038400384003840038
102044003729900126139407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100000071011611394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
100244003729906139407251001010100001010000505706908140018400374003738130033876710010201000020300004003740037111002110910101000010000640316343947310000104003840038400384003840038
100244003730006139407251001010100001010000505706908140018400374003738130033876710010201000020300004003740037111002110910101000010000668316343947310000104003840038400384003840038
100244003730006139407251001010100001010000505706908040018400374003738130033876710010201000020300004003740037111002110910101000010000640316343947310000104003840038400384003840038
1002440037300072639407251001010100001010000505706908040018400374003738130033876710010201000020300004003740037111002110910101000010000640316333947310000104003840038400384003840038
100244003730006139407251001010100001010000505706908040018400374003738130033876710010201000020300004003740037111002110910101000010000640316343951110000104003840038400384003840038
100244003730006139407251001010100001010000505706908040018400374003738130033876710010201017220300004003740037111002110910101000010000640316343947310000104003840038400384003840038
100244003730006139407251001010100001010000505706908040018400374003738130033876710010201000020300004003740037111002110910101000010000640316333947310000104003840038400384003840038
100244003730008239407251001010100001010000505706908140018400374003738130033876710010201000020300004003740037111002110910101000010000640316333947310000104003840038400384003840038
100244003730006139407251001010100001010000505706908040018400374003738130033876710010201000020300004003740037111002110910101000010000640316333947310000104003840038400384003840038
1002440037300072639407251001010100001010000505706908040018400374003738130033876710010201000020300004003740037111002110910101000010000640316333947310000104003840038400384003840038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fmla v0.4s, v8.4s, v9.s[1]
  movi v1.16b, 0
  fmla v1.4s, v8.4s, v9.s[1]
  movi v2.16b, 0
  fmla v2.4s, v8.4s, v9.s[1]
  movi v3.16b, 0
  fmla v3.4s, v8.4s, v9.s[1]
  movi v4.16b, 0
  fmla v4.4s, v8.4s, v9.s[1]
  movi v5.16b, 0
  fmla v5.4s, v8.4s, v9.s[1]
  movi v6.16b, 0
  fmla v6.4s, v8.4s, v9.s[1]
  movi v7.16b, 0
  fmla v7.4s, v8.4s, v9.s[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)03l1i tlb fill (04)mmu table walk data (08)l2 tlb miss data (0b)18191e1f3a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)cfd2d5map dispatch bubble (d6)daddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
16020420091150000000005152580100100800001008000050064000012004620065200653238010020080000200240000200652006511160201100991001001600001000000000001019641614905620062211600001002006620405204322056620160
160204204841510116089752804025801001008000010080000500640000120046200652006534680100200800002002400002006520065111602011009910010016000010000202434900010115010160562006201600001002006620066200662006620066
160204200651500000000040258010012180000118800005006408960200462006520065204580100200800002002400002006520065111602011009910010016000010000000000010120091601052006201600001002006620066200662006620066
16020420065150000000004025801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000070300101150101601042006201600001002006620066200662006620066
160204200651500000000040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100000000000101200101601042006201600001002006620066200662006620066
16020420065150000000006325801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000000010120010160942006201600001002006620066200662006620066
16020420065150000000008425801001008000010080000500640000120046200652006532380100200800002002400002006520065311602011009910010016000010000000000010120051601142006201600001002006620066200662006620066
1602042006515000000000822580100100800001008000050064000012004620065200653238010020080000200240000200652006511160201100991001001600001000000000001011901016010102006201600001002006620066200662006620066
1602042006515000000000444258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100000000000101190101601042006201600001002006620066200662006620066
16020420065151000000004025801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000000010120051601042006201600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01)cycle (02)03l1i tlb fill (04)mmu table walk data (08)1e3a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)cfd0d2l1i cache miss demand (d3)l1i tlb miss demand (d4)d5map dispatch bubble (d6)d9dadbddfetch restart (de)dfe0? int output thing (e9)ea? simd retires (ee)? int retires (ef)f5f6f7f8fd
1600242007515020012202580012128000012800006264000011520030200492004932380012208000020240000200492004911160021109101016000010010068851241221113737420046216160000102005020050200502005020050
1600242004915000017325800121280000128000062640000115200302004920049323800122080000202400002004920049111600211091010160000100100581451243221113735420046216160000102005020050200502005020050
160024200491501001462580012128000012800006264000011520030200492004932380012208000020240000200492004911160021109101016000010010070851243221113935520046216160000102005020050200502005020050
1600242004915010017772580012128000012800006264000011520030200492004932380012208000020240000200492004911160021109101016000010010070862242221213735520050216160000102005020054200502005020050
1600242004915010004802580012128000012800006264000001520030200492004932380012208000020240000200492004911160021109101016000010010073851237221114041420046216160000102005020050200502005020050
1600242004915020018192580012128000012800006264000001520034200532004932380012208000020240000200492004911160021109101016000010010074851251223224024420050216160000102005020050200502005020050
1600242004915010018152580012128000012800006264000011520030200492004932380012208000020240000200492004911160021109101016000010010070851240221112939420046216160000102005020050200502005020050
1600242004915010014342580012128000012800006264000011520034200492004932380012208000020240000200492004911160021109101016000010010072861253221214225420050231160000102005020050200502005020050
160024200491501013508052580012128000012800006264000011520030200492004932380012208000020240000200492004911160021109101016000010010072861236231112738420046216160000102005020050200502005020050
160024200491501001522580012128000012800006264000011520030200492004932380012208000020240000200492004911160021109101016000010010074851223221113943420046216160000102005020050200502005020050

Test 6: throughput

Count: 12

Code:

  fmla v0.4s, v12.4s, v13.s[1]
  fmla v1.4s, v12.4s, v13.s[1]
  fmla v2.4s, v12.4s, v13.s[1]
  fmla v3.4s, v12.4s, v13.s[1]
  fmla v4.4s, v12.4s, v13.s[1]
  fmla v5.4s, v12.4s, v13.s[1]
  fmla v6.4s, v12.4s, v13.s[1]
  fmla v7.4s, v12.4s, v13.s[1]
  fmla v8.4s, v12.4s, v13.s[1]
  fmla v9.4s, v12.4s, v13.s[1]
  fmla v10.4s, v12.4s, v13.s[1]
  fmla v11.4s, v12.4s, v13.s[1]
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3337

retire uop (01)cycle (02)03191e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
12020440039300000619961251201001001200001001200005005630640040020400394003924932032499712010020012000020036000041687400391112020110099100100120000100000761051611400301200001004004040040400404004040040
12020440039300000619961251201001001200001001200005005966386040020400394003924932032499712010020012000020036000041688400391112020110099100100120000100000761011611400301200001004004040040400404004040040
120205416913000120619961251201001001200001001200005005630640040020400394003924932032499712010020012020220036000040039400391112020110099100100120000100000761011611400301200001004159840040400404004040040
12020440039299000619961251201001001200001001200005005630640040020400394003924932032499712010020012000020036000040039400391112020110099100100120000100000761011611400301200001004004040040400404004040040
12020440039300000619961251201001001200001001200005005630640040020400394003924932032499712010020012000020036000040039400391112020110099100100120000100000761011611400301200001004004040040400404004040040
12020440039300000619961251201011001200001001200005005630640040020400394003924932032499712010020012000020036000040039400391112020110099100100120000100000761011611400301200001004004040040400404004040040
12020440039300000619961251201001001200001001200005005630640040020400394169124932032664512010020012000020036000040039400391112020110099100100120000100000761011611400301200001004004040040400404004040040
12020440039300000619961251201001001200001001200005005630640040020400394003924932032499712010020012000020036000040039400391112020110099100100120000100000761011611400301200001004004040040400404004040040
12020440039299000619961251201001001200001001200005005630640040020416884003924932032499712010020012000020036000040039400391112020110099100100120000100000761011611400301200001004004040040400404004040040
12020440039299000619961251201001001200001001200005005630640040020400394003924932032499712010020012000020036000040039400391112020110099100100120000100000761011611400301200001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3337

retire uop (01)cycle (02)03l2 tlb miss data (0b)18191e1f373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)a9acc2cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaec? simd retires (ee)? int retires (ef)f5f6f7f8fd
12002440039300011000619961251200101012000010120000505630640000400714003940039249550325019120010201200002036000041835416861112002110910101200001000607520500141600061040030000120000104004040040400404004040040
1200244003930000000061996125120010101200001012000050563064000040020400394003924955032501912001020120000203600004003940039111200211091010120000100000752053110160009540030005120000104004040040400404004040040
12002440039300000000536402744612001010120000101200005056306400004002041686400392495503250191200102012000020360000400394003911120021109101012000010000075200006160008940030005120000104004040040400404004040040
1200244003929900000061996125120010101200001012000050563064000040020400394003924955032502712020620120000203600004003940039111200211091010120000100000752002071600061040030000120000104004040040400404004040040
120024400393000000006199612512001010120000101200005056306400004002040682400392495503250191200102012000020360000400394003911120021109101012000010000075200006160009540030000120000104007541692416874004040040
1200244170130000000082996125120010101200001012000050563064001540881400394003924955032501912001020120000203600004003940039111200211091010120000100000752000061600091040030000120000104004040040400404168740040
12002442470300000000619961251200101012000010120000505630640000400204003940039249550325019120208201200002036000040039400391112002110910101200001000007520000916000106400300200120000104004040040400404004040040
120024400393000000006199612512001010120000101200005056306400154002040039400392495503250191200102012000020360000400394003911120021109101012000010000075200006160005940030000120000104004040040400404004040040
120024400393180000006199615112001010120000101200005056306400054002040039400392495503250191200102012000020360000400394003911120021109101012000010000075205008160009640030000120000104004040040400404004040040
120024400393000000006199612512001010120000101200005056306400154002040039400392495503250191200102012000020360000400394003911120021109101012000010000075200007160009940030000120000104004040040400404004040040