Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FMLA (by element, S)

Test 1: uops

Code:

  fmla s0, s1, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)03191e3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
10044037300061340725100010001000531908140184037403732583389510001000300040374037111001100073116113473100040384038403840384038
10044037300082340725100010001000531908140184037403732583389510001000300040374037111001100073116113473100040384038403840384038
100440373000103340725100010001000531908140184037403732583389510001000300040374037111001100073116113473100040384038403840384038
10044037300061340725100010001000531908140184037403732583389510001000300040374037111001100073116113473100040384038403840384038
10044037300061340725100010001000531908140184037403732583389510001000300040374037111001100073116113473100040384038403840384038
10044037300061340725100010001000531908140184037403732583389510001000300040374037111001100073116113473100040384038403840384038
10044037300061340725100010001000531908140184037403732583389510001000300040374037111001100073116113473100040384038403840384038
10044037300061340725100010001000531908140184037403732583389510001000300040374037111001100073116113473100040384038403840384038
100440373006251340725100010001000531908140184037403732583389510001000300040374037111001100073116113473100040384038403840384038
100440373002461340725100010001000531908140184037403732583389510001000300040374037111001100073116113549100040384038403840384038

Test 2: Latency 1->1

Code:

  fmla s0, s1, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01)cycle (02)03191e3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8branch mispredict (cb)cfd0d5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1020440037300000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000710121622394790100001004003840038400384003840038
1020440037300000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000710121622394790100001004003840038400384003840038
1020440037300000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000710121622394790100001004003840038400384003840038
1020440037300000613940725101001001000010010000500570690814001840037400373810833874510100200100002003000040037400371110201100991001001000010000710121622394790100001004003840038400384003840038
1020440037300000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000710121622394790100001004003840086400384003840038
1020440037299000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000712121622394790100001004003840038400384003840038
10204400372990001473940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000710121622394790100001004003840038400384003840038
1020440037300000613940725101001001000010010000500570690814001840037400373810833874510100200100002003000040037400371110201100991001001000010000710121622394790100001004003840038400384003840038
1020440037300000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000710121623394790100001004003840038400384003840038
1020440037300000613940725101001001000010010000500570743904001840037400373810833874510100200100002003000040037400371110201100991001001000010000710131622394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01)cycle (02)03080b1e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024400373000006139407251001010100001010000505706908040018040037400373813033876710010201000020300004003740037111002110910101000010006404163339473010000104003840038400384003840038
10024400373000006139407451001010100001010000505706908040018040037400373813033876710010201000020300004003740037111002110910101000010036403163439473010000104003840038400384003840038
10024400373000006139407251001010100001010000505706908040018040037400373813033876710010201000020300004003740037111002110910101000010006403163339473010000104003840038400384003840038
10024400372990006139407251001010100001010000505706908040018040037400373813033876710010201000020300004003740037111002110910101000010006403163439473010000104003840038400384003840038
100244003729900072639407251001010100001010000505706908040018040037400373813033876710010201000020300004003740037111002110910101000010006403163339473010000104003840038400384003840038
10024400373000006139407251001010100001010000505706908040018040037400373813033876710010201000020300004003740037111002110910101000010006403163439473010000104003840038400384007140038
10024400372990006139407251001010100001010000505706908040018040037400373813033876710010201000020300004003740037111002110910101000010006403163439473010000104003840038400384003840038
10024400373000006139407251001010100001010000505706908040018340037400373813033876710010201000020300004003740037111002110910101000010006403163439473010000104003840038400384003840038
10024400372990006139407251001010100001010000505706908040018040037400373813033876710010201017220300004003740037111002110910101000010006403163339473010000104003840038400384003840038
10024400373000008239407251001010100001010000505706908040018040037400373813033876710010201000020300004003740037111002110910101000010006403163339473010000104003840038400384003840038

Test 3: Latency 1->2

Code:

  fmla s0, s0, v1.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01)cycle (02)031e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)6061696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9facbranch mispredict (cb)cfd0d2d5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
102044003729906139407251010010010000100100005005706908004001840037400373810833874510100200100002003000040037400371110201100991001001000010000710002162239479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908004001840037400373810833874510263200100002003000040037400371110201100991001001000010000710002162239479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908004001840037400373810833874510100200100002003000040037400371110201100991001001000010000710012162239479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908154001840037400373810833874510100200100002003000040037400371110201100991001001000010000710512162239479100001004003840038400384003840038
1020440037300055139407251010010010000100100005005706908004001840037400373810833874510100200100002003000040037400371110201100991001001000010000710002165239479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908004001840037400373810833874510100200100002003000040037400371110201100991001001000010000710002162239479100001004003840038400384003840038
1020440037300034639407251010010010000100100005005706908104005340037400373810833874510100200100002003000040037400371110201100991001001000010090710002162239479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908004001840037400373810833874510100200100002003000040037400371110201100991001001000010000710002162239479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908004001840037400373810833874510100200100002003000040037400371110201100991001001000010000710002162239479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908004001840037400373810833874510100200100002003000040037400371110201100991001001000010000710002162239479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01)cycle (02)03080b18191e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a6a8a9acc2branch mispredict (cb)cfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
100244003729900000726394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006403162239473010000104003840038400384003840038
10024400373000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000410006402242239473010000104003840038400384003840038
10024400373000000061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400373000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400373000100061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400373000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104008440038400384003840038
10024400373000000061394072510010101000010100005057069080400534003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400373000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400372990000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400373000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038

Test 4: Latency 1->3

Code:

  fmla s0, s1, v0.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01)cycle (02)031e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)61696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10205400373003961394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100007102161139479100001004003840038400384003840038
10204400372995161394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
1020440037300061394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
10204400373003961394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
10204400373000458394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
10204400373008461394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
1020440037300061394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
10204400373002161394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
10204400373018461394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
10204400373007561394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01)cycle (02)030708090b18191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a7a8a9acc2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1002440037300100000006139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000000006403163339473010000104003840038400384003840038
100244003730000000021906139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000000006403163339473010000104003840038400774003840038
10024400372990000005106139407251001010100001010000835712492040158040413401783814733876710010201000020300004003740037111002110910101000010000000006403163339473010000104003840038400384003840038
10024400373000000002706139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000000006403163339473010000104003840038400384003840038
10024400372990000005106139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000000006403163339473010000104003840038400384003840038
10024400373000000003606139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000000006403163339545010000104003840038400384003840038
10024400373000000002706139407251001010100001010000505706908040018040037400373813033876710010201000020300004003740037111002110910101000010000000006403163339473010000104003840038400384003840038
10024400373000000004506139407251001010100001010000505706908040018040037400373813033876710012201000020300004003740037111002110910101000010000000006403163339473010000104003840038400384003840038
100244003730000000048072639407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000000006403163339473010000104003840038400384003840038
10024400373000000012706139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000000006403163339473010000104003840038400384003840038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fmla s0, s8, v9.s[1]
  movi v1.16b, 0
  fmla s1, s8, v9.s[1]
  movi v2.16b, 0
  fmla s2, s8, v9.s[1]
  movi v3.16b, 0
  fmla s3, s8, v9.s[1]
  movi v4.16b, 0
  fmla s4, s8, v9.s[1]
  movi v5.16b, 0
  fmla s5, s8, v9.s[1]
  movi v6.16b, 0
  fmla s6, s8, v9.s[1]
  movi v7.16b, 0
  fmla s7, s8, v9.s[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01)cycle (02)030b18191e1f3a3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a7a8a9acc2c5cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
16020420090150000150040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100000000000010111416112006201600001002006620066200662006620066
1602042006515000000040258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000000000010111116112006201600001002006620066200662006620066
16020420065150000180040258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000002000010111116112006201600001002006620066200662006620066
1602042006515000060040258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000000000010111116112006201600001002006620066200662006620066
16020420065150000180040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100000000000010111148112006201600001002006620066200662006620066
1602042006515000000040258010010080000100800005006400000200462006520171323801002008000020024000020065200651116020110099100100160000100000000000010111116112006201600001002006620066200662006620066
160204200651500003450040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100000000000010111116112006201600001002006620066200662006620066
16020420065151000120040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100000000000010111116112006201600001002006620066200662006620066
1602042006515000000040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100000000000010111116112006201600001002006620066200662006620066
16020420065151000150074258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000000000010111116112006201600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01)cycle (02)031e3a3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f6061696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0? int output thing (e9)ea? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1600242004915042046258001212800001280000626400001102004920051200513238001220800002024000020051200511116002110910101600001000100488312420211242320044230160000102005220052200522005220052
160024200511509052258001212800001280000626400000152004820047200473238001220800002024000020047200471116002110910101600001000100448412320211252720044215160000102004820048200482004820048
160024200471500052258001212800001280000626400000152003720047200473238001220800002024000020047200471116002110910101600001000100468412420211232420044215160000102004820048200482004820048
160024200471500046258001212800001280000626400001152004820047200473238001220800002024000020047200471116002110910101600001013100498412420211232420044215160000102004820048200482004820048
1600242004715000217258001212800001280000626400001152004720047200473238001220800002024000020047200471116002110910101600001000100448412320211222320044215160000102004820048200482004820048
160024200471500046258001212800001280000626400001152003720047200473238001220800002024000020047200471116002110910101600001000100478412620211252020044215160000102004820048200482004820048
160024200471506046258001212800001280000626400001152003620047200473238001220800002024000020047200471116002110910101600001000100468412320211242320044215160000102004820048200482004820048
160024200471500046258001212800001280000626400001152003720047200473238001220800002024000020047200471116002110910101600001000100478412420211232420044228160000102004820048200482004820048
1600242004715000388258001212800001280000626400001152004720047200473238001220800002024000020047200471116002110910101600001000100458412320211242320044215160000102004820048200482004820048
1600242004715000522580012128000012800006264000001520036200472004732380012208000020240000200472004711160021109101016000010001004911522524422252420048230160000102005220052200522005220052

Test 6: throughput

Count: 12

Code:

  fmla s0, s12, v13.s[1]
  fmla s1, s12, v13.s[1]
  fmla s2, s12, v13.s[1]
  fmla s3, s12, v13.s[1]
  fmla s4, s12, v13.s[1]
  fmla s5, s12, v13.s[1]
  fmla s6, s12, v13.s[1]
  fmla s7, s12, v13.s[1]
  fmla s8, s12, v13.s[1]
  fmla s9, s12, v13.s[1]
  fmla s10, s12, v13.s[1]
  fmla s11, s12, v13.s[1]
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3337

retire (01)cycle (02)031e373f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a8c2cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
12020441683299006199612512010010012000010012000050056306404002440039416912657732499712010020012000020036000041691400391112020110099100100120000100000761011611416831200001004004041692400404169240040
12020440039312390189379662512010110012000110012000050056306404002040039416912658232664912010020012000020036000041691400391112020110099100100120000100000761011611416831200001004004040040400404004040040
12020440039312036199612512010010012000010012000050058518694167241691400392493232499712010020012000020036000040039416861112020110099100100120000100000761011611400301200001004168740040416924004041692
12020441687300006199612512010010012000010012000050058518694002040039400392493232499712010020012000020036000040039400391112020110099100100120000100000761011611400301200001004004040040400404004040040
120204400393000153699612512010010012000010012000050058518694245141691400392493232499712010020012000020036000040039416861112020110099100100120000100000761011611400301200001004004041687400404169240040
120204400393130061356892512010110012000110012000050056306404002040039400392493232499712010020012000020036000041691400391112020110099100100120000100000761011611416831200001004004040040400404004040040
12020440039299006199612512010010012000010012000050058518694166741691400392493232499712010020012000020036000040039416911112020110099100100120000100000761011611400301200001004004040040400404004040040
1202044003929901251379665012010010012000010012000050058519934002040039400392493232499712010020012000020036000040039416911112020110099100100120000100000761011611400301200001004169240040416924004041692
12020441687300006199612512010010012000010012000050056306404002040039416912658232664912010020012000020036000041686400391112020110099100100120000100000761011611416771200001004004040040400404004040040
120204400393000161379662512010310012000110012000050056306404002041691400392493232499712010020012000020036000040039416911112020110099100100120000100000761011611400301200001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3337

retire (01)cycle (02)0308090b18191e373f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)6061696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8a9acc5cfd0d2d5d6dbddinst fetch restart (de)e0ea? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
120024400393110000000619961251200101012000010120000505630640004002040039400392495532501912001020120000203600004003940039111200211091010120000100000752000211601619400300120000104004040040400404004040040
120024400393000000000619961251200101012000010120000505630640004002040039400392495532501912001020120000203600004003940039111200211091010120000100000752000191601919400300120000104004040040400404004040040
12002440039300000000061996125120010101200001012000050585186900400204003940039249553250191200102012000020360000400394003911120021109101012000010000075205219160919400300120000104004040040400404004040040
120024400393000000000619961251200101012000010120000505630640154002040039400392495532501912001020120000203600004003940039111200211091010120000101000752000191601919400300120000104004040040400404004041702
120024400392990000000619961251200101012000010120000505630640154002040039416912495532501912001020120000203600004003940039111200211091010120000100101752052191601919400300120000104004040040400404004040040
120024400392990000000619961251200101012000010120000505630640154002040039400392495532501912001020120000203600004003940039111200211091010120000100000752000191601919400300120000104004040040400404004040040
1200244003930000000006199612512001010120000101200005056306401540020400394003924961325019120010201200002036000040039400391112002110910101200001000007520529160199400300120000104004040040400404004040040
120024400392990000000619961251200101012000010120000505630640054002040039400392495532501912001020120000203600004003940039111200211091010120000100000752050191601919400300120000104004040040400404004040040
12002440039300001000072699612512001010120000101200005056306401040020400394003924955325019120010201200002036000040039400391112002110910101200001000007520529160197400300120000104004040040400404004040040
12002440039300000000061996125120010101200001012000050563064010400204003940039249553250191200102012000020360000400394003911120021109101012000010000075200019160719400300120000104004040040400404004040040