Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FMLA (vector, 4H)

Test 1: uops

Code:

  fmla v0.4h, v1.4h, v2.4h
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)03191e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d tlb miss (a1)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
10044037310061340725100010001000531908040184037403732583389510001000300040374037111001100000073116113473100040384038403840384038
10044037300061340725100010001000531908040184037403732583389510001000300040374037111001100000073116113473100040384038403840384038
10044037310061340725100010001000531908040184037403732583389510001000300040374037111001100000073224113473100040384038403840384038
10044037300061340725100010001000531908040184037403732583389510001000300040374037111001100000073116113473100040384038403840384038
10044037300082340725100010001000531908040184037403732583389510001000300040374037111001100000073116113533100040384038403840384038
10044037310061340725100010001000531908040184037403732583389510001000300040374037111001100000073116113473100040384038403840384038
10044037300061340725100010001000531908040184037403732583389510001000300040374037111001100000073116113473100040384038403840384038
10044037300061340725100010001000531908040184037403732583389510001000300040374037111001100000073116113473100040384038403840384038
10044037310061340725100010001000531908040184037403732583389510001000300040374037111001100000073116113473100040384038403840384038
1004403730118661340725100010001000531908040184037403732583389510001000300040374037111001100040073116113473100040384038403840384038

Test 2: Latency 1->1

Code:

  fmla v0.4h, v1.4h, v2.4h
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)18191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9accfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1020440037300000000061394072510100100100001001000050057083040400184003740037381083387451010020010000200300004008340037111020110099100100100001000000003710021632394790100001004003840038400384003840038
1020440037299000000061394074510100100100001001000050057069081400184003740037381083387451010020010000200300004003740037111020110099100100100001000000000710021623394790100001004003840086400384003840038
10204400373000100000441394072510100100100001001000050057069080400184003740037381083387451010020010000200300004003740037111020110099100100100001000000003710021622394790100001004003840038400384003840038
1020440037300000000061394072510100100100001001000050057069080400184003740037381083387451010020010000200300004003740037111020110099100100100001000000000710021622394790100001004003840038400384003840038
102044003730000001410061394072510100100100001001000050057069080400184003740037381083387451010020010000200300004003740037211020110099100100100001000000000710021622394790100001004003840038400384003840038
1020440037300000000061394072510100100100001001000050057069080400184003740037381083387451010020010000200300004003740037211020110099100100100001000000000712021623394790100001004003840038400384003840038
1020440037300000000061394072510100100100001001000050057069080400184003740037381083387571010020010000200300004003740037111020110099100100100001000000300710021622394790100001004008740038400864003840038
1020440037300010000061394072510100100100001001000050057069081400184003740037381083387451010020010000204300004003740037111020110099100100100001000000120710021622394790100001004003840086400384003840038
102044003730000000001061394072510100100100001001000052257069080400184003740037381083387451025220010000200300004003740037111020110099100100100001000000000710021622394790100001004003840038400384003840038
10204400373000000000557394072510100100100001001000050057069080400184003740037381123387451010020010000200300004003740037111020110099100100100001000000003710022622394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)0318191e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)acc3cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10024400373000010261394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000640216233947310000104003840038400384003840038
1002440037299004261394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
1002440037300006613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010011432900640216323947310000104003840038400384003840038
1002440037300113961394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
10024400373000015103394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
100244003730000271023394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640316323947310000104003840038400384003840038
100244003730000294613940725100101010000101000050570830404001840037400373813012387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
1002440037299002761394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000300640224223947310000104003840038400864003840038
1002440037299101861394072510010101000010100005057083040400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
10024400373000023161394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038

Test 3: Latency 1->2

Code:

  fmla v0.4h, v0.4h, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss data (0b)18191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acc2cfd5map dispatch bubble (d6)d8ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204400372990000013206139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000000071021702239479100001004003840038400384003840038
10204400372990000011406139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000000071021602239479100001004003840038400384003840038
10204400373000000010508239398251010010010000100100005005706908040018400844003738108338745101002001000020030000400374008511102011009910010010000100020050071011602239479100001004003840038400384003840038
1020440037299000002706139407251010010010000100100005005706908040018400374003738108338745101002001000020430000400374003711102011009910010010000100000010071021602239479100001004003840038400384003840038
102044003730000011210122739389251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000000071021602239479100001004003840038400384003840038
1020440037300000002790385139398251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000013071021602239479100001004003840038400384003840038
10204400372990000018308239407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000000071021602239479100001004003840038400384003840038
10204400373000000029106139398251010010010000100101485005706908040018400864003738108338745101002001000020030000400374003711102011009910010010000100002013368471021602239479100001004008640038400384003840038
10204400373000000025806139407441010011410000100101485005706908040053400854003738108338764101002001000020030000400374003711102011009910010010000100400100273221602239479100001004003840038400854003840038
1020440084300001102706139407451010011910000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000010071021602239479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)09l2 tlb miss data (0b)18191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
10024400373000000030012639407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000000006403163339473010000104003840038400384003840038
1002440037300000000006139407251001010100001010000505709700140053400374003738130338767100102010000203000040037400371110021109101010000100000000006403163339473010000104003840038400384003840038
1002440037300000000006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000000006403163339473010000104003840038400384003840038
1002440037299000000006139407251001010100001010000555706908140018400374003738130338767100102010000203000040037400371110021109101010000100000300006403163339473010000104003840038400384003840038
1002440037300000000006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000000006403163339473010000104003840038400384003840038
1002440037300000000006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000000006403163339473010000104003840038400384003840038
1002440037299000000006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000000006403163339473010000104003840038400384003840038
10024400373000000000072639407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000000006403163339473010000104003840038400384003840038
1002440037300000000006139407251001010100001010000505706908140018400374003738135338767100102010000203000040037400371110021109101010000100000003006403163339473010000104003840038400384003840038
1002440037300000000006139407251001010100001010000505706908140018400374017738130338767100102010000203000040037400761110021109101010000100000000006403163339473010000104003840038400384003840038

Test 4: Latency 1->3

Code:

  fmla v0.4h, v1.4h, v0.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)0309191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)6061696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acbranch cond mispred nonspec (c5)cfd0l1i cache miss demand (d3)d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1020440037299200061394072510100100100001001000050057069081040018400374003738108033874510100200100002003000040037400371110201100991001001000010000000007100011611394790100001004003840038400384003840038
10204400373000000916394072510100100100001001000050057069081040018400374003738108033874510100200100002003000040037400371110201100991001001000010000000007104011611394790100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069081440018400374003738108033874510100200100002003000040037400371110201100991001001000010000000007100011611394790100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069081040018400844003738108033874510100200100002003000040037400371110201100991001001000010000000007100011611394790100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069080040018400374003738108033874510100200100002003000040037400371110201100991001001000010000000007100011611394790100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069080040018400374003738108033874510100200100002003000040037400371110201100991001001000010000000007100011611394790100001004003840038400384003840038
1020440037299000061394072510100100100001001000050057069081040018400374003738108033874510100200100002003000040037400371110201100991001001000010000000007100011611394790100001004003840038400384003840038
1020540037300000061394072510100100100001001000050057069080040018400374003738108033874510100200100002003000040037400371110201100991001001000010000000007100011611394790100001004003840038400384003840038
10204400372990000127394072510100100100001001000050057069080440018400374003738108033874510100200100002003000040037400371110201100991001001000010000000007100011611394790100001004003840038400384003840038
1020440037299000061394072510100100100001001000050057069081040018400374003738108033874510100200100002003000040037400371110201100991001001000010000000007100011611394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)09l2 tlb miss data (0b)18191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
10024400373000000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000606402162239473010000104003840038400384003840038
10024400373000000000061394072510010101000010100005057069080401584008540037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
100244003730000000000251394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400373000000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400372990000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400373000000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473310000104032140417404654041740420
1002440467303011981200792073883933518010067201004812113325557189180402984050040416381684238915111992411451243372340462403687110021109101010000102202428758080931044339795510000104022940416404634046440462
10024404633031011081068792141573932610110062161005416111847057194720403334046740461381662338938113452210808263437140453404151111002110910101000010202142078806402162239473010000104003840038400384003840038
10024400373000000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
1002440037300000004110061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fmla v0.4h, v8.4h, v9.4h
  movi v1.16b, 0
  fmla v1.4h, v8.4h, v9.4h
  movi v2.16b, 0
  fmla v2.4h, v8.4h, v9.4h
  movi v3.16b, 0
  fmla v3.4h, v8.4h, v9.4h
  movi v4.16b, 0
  fmla v4.4h, v8.4h, v9.4h
  movi v5.16b, 0
  fmla v5.4h, v8.4h, v9.4h
  movi v6.16b, 0
  fmla v6.4h, v8.4h, v9.4h
  movi v7.16b, 0
  fmla v7.4h, v8.4h, v9.4h
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)031e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6erob full (74)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)a9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
160204200901510402580100100800001008000050064000020046200652006503238010020080000200240000200652006511160201100991001001600001000031011111612200621600001002006620066200662006620066
160204200651506402580100100800001008000050064000020046200652006503238010020080000200240000200652006511160201100991001001600001000001011111611200621600001002006620066200662006620066
160204200651500402580100100800001008000050064000020046200652006503238010020080000200240000200652006511160201100991001001600001000001011111611200621600001002006620066200662006620066
1602042006515002302580100100800001008000050064000020046200652006503238010020080000200240000200652006511160201100991001001600001000001011111611200621600001002006620066200662006620066
160204200651500402580100100800001008000050064000020046200652006503238010020080000200240000200652006511160201100991001001600001000001011111611200621600001002006620066200662006620066
1602042006515024402580100100800001008000050064000020046200652006503238010020080000200240000200652006511160201100991001001600001000001011111631200621600001002006620066200662006620066
160204200651500402580100100800001008000050064000020046200652006503238010020080000200240000200652006511160201100991001001600001000031011111611200621600001002006620066200662006620066
160204200651500402580100100800001008000050064000020046200652006503238010020080000200240000200652006511160202100991001001600001000001011111611200621600001002006620066200662006620066
160204200651500402580100100800001008000050064000020046200652006503238010020080000200240000200652006511160201100991001001600001000001011111613200621600001002006620066200662006620066
160204200651510402580100100800001008000050064000020046200652006503238010020080000200240000200652006511160201100991001001600001000001011111611200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2526

retire uop (01)cycle (02)03mmu table walk instruction (07)1e3a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)st unit uop (a7)l1d cache writeback (a8)a9acbranch cond mispred nonspec (c5)cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaec? simd retires (ee)? int retires (ef)f5f6f7f8fd
1600242008515100046278001212800001280000626400001102003320052200523238001220800002024000020052200521116002110910101600001000000001003381125252111615200492201160000102006220053200622005320053
160024200611500303046278001212800001280000626400002152003320052200523238001220800002024000020052200611116002110910101600001000000001003185120252121911200492201160000102005320053200532005320053
160024200521500396046278001212800001280000626400002152003320061200613238001220800002024000020052200521116002110910101600001000000001003485112252111711200492401160000102005320053200532005320053
1600242006115006046278001212800001280000626400002152003320052200523238001220800002024000020052200521116002110910101600001000000001003885120252111015200492201160000102005320053200532005320053
1600242005215000046278001212800001280000626400001152004220052200523238001220800002024000020052200521116002110910101600001000000001003885122252112019200492201160000102005320053200532005320053
160024200611500004627800121280000128000062640000215200332005220052323800122080000202400002005220052111600211091010160000100000000100388519252111510200492201160000102005320053200532005320053
160024200521500294046278001212800001280000626400002152003320052200523238001220800002024000020052200521116002110910101600001000000001003885115252111515200492201160000102005320053200532005320053
1600242005215000046278001212800001280000626400002152004220061200523238001220800002024000020052200521116002110910101600001000000001003385110252111411200492201160000102005320053200532005320053
1600242005215000052298001212800001280000626400002152003320052200523238001220800002024000020052200521116002110910101600001000000001003385120252111315200492201160000102005320053200532005320053
1600242005215000046278001212800001280000626400001152003320052200523908001220800002024000020052200521116002110910101600001000000001003885114252111515200492201160000102005320053200532014620053

Test 6: throughput

Count: 16

Code:

  fmla v0.4h, v16.4h, v17.4h
  fmla v1.4h, v16.4h, v17.4h
  fmla v2.4h, v16.4h, v17.4h
  fmla v3.4h, v16.4h, v17.4h
  fmla v4.4h, v16.4h, v17.4h
  fmla v5.4h, v16.4h, v17.4h
  fmla v6.4h, v16.4h, v17.4h
  fmla v7.4h, v16.4h, v17.4h
  fmla v8.4h, v16.4h, v17.4h
  fmla v9.4h, v16.4h, v17.4h
  fmla v10.4h, v16.4h, v17.4h
  fmla v11.4h, v16.4h, v17.4h
  fmla v12.4h, v16.4h, v17.4h
  fmla v13.4h, v16.4h, v17.4h
  fmla v14.4h, v16.4h, v17.4h
  fmla v15.4h, v16.4h, v17.4h
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)0318191e1f373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)ld unit uop (a6)l1d cache writeback (a8)a9acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
16020440145307000006102516012210016000010016000050012800004002140040400431997303211851601002001600002004800004004040040111602011009910010016000010000000010110316114003701600001004004140041400414004140041
160204412053080084006102516010010016002210016000050012800004231640040400402109503199981601002001600002004800004004040040111602011009910010016000010000000010110116114003701600001004004141191412064004140041
160204400403000000224245412516010010016000010016000050012800004002140040400401997303199991601002001600002004800004004040040111602011009910010016000010000000010110116114003701600001004119140044400414004140041
16020440040300000004245412516012210016000010016000050012800004117140040400402111103211481601002001600002004800004004040040111602011009910010016000010000000010110116114003701600001004119140044400414004140041
160204400402990000224244852516010010016000110016000050057152614002140040411902109503199981601002001600002004800004119041205111602011009910010016000010000000010110116114003701600001004119140044400414004140041
16020440040300000004202516010010016000010016000050012800004002140040400401997303211631601002001600002004800004004040040111602011009910010016000010000000010110116114003701600001004004141191412064004141206
16020440040299000004202516010010016000010016000050012800004002140040411901997303199981601002001600002004800004004040040111602011009910010016000010000000010110116114120201600001004004140041411914120640041
16020440040300000004244852516010010016002210016000050012800004002140043411902109503211481601002001600002004800004004040040111602011009910010016000010000000010110116114003701600001004004141191400444004140041
160204400402990000070702516012210016000010016000050057057744002140040411901997303199981601002001600002004800004004040040111602011009910010016000010000100010110116114120201600001004004140041400414004140041
16020440040300000004202516010010016002210016000050013199994002140040400402109503199981601002001600002004800004120540040111602011009910010016000010000000010110116114003701600001004004140041411914228640041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2547

retire uop (01)cycle (02)03l2 tlb miss data (0b)181e1f373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
16002440758300000080478955251600101016000010160000505868895114230842327400401999603223071600102016000020480000400404232711160021109101016000010000000010022311816211102040037157160000104231940041423194232842319
16002440040317000007302516009010160000101600005012800000142308400404232722237032002016001020160000204800004232740040111600211091010160000100000000100246221916222202042324307160000104004142328400414232840041
16002440040317000080678955251600901016008010160000505868895114230842296423272223703223071600102016000020480000423274004011160021109101016000010000020010022311191621191942324157160000104004142286400414232842328
16002442327300000080110895525160010101600001016000050586889511400214232742327199963332002016001020160000204800004004042327111600211091010160000100000000100223111916211202040037157160000104004142328400414232842328
1600244232729900000712025160090101600801016000050128000011423084004042327222373332230716001020160000204800004232740040111600221091010160000100000000100223121916211202042324155160000104232840041423284004140041
16002440040317000006789552516009010160043101600005058688951140021423274232722237333200201600102016000020480000400404232711160021109101016000010000000010022611191622171940037307160000104004142328423284232842328
16002440040317000006789552516001010160080101600005012800001140021400404232719996032230716001020160000204800004232742327111600211091010160000100000000100243111916211161940037155160000104004140041422864004142319
160024400403170000806789552516009010160080101600005012800000140021423274232719996032230716001020160000204800004232742327111600211091010160000100000000100226121916211202042324157160000104123042328423284232842315
160024423273170000807302516001010160000101600005058688951140021423274232722237333223071600102016000020480000400404232711160021109101016000010000000010024611191621119740037155160000104004142328400414232840041
16002440040317000004702516009010160080101600005012800000142308423274004022237032002016001020160000204800004232740040111600211091010160000100000000100223111916219120740037307160000104004142328400414232842328