Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FMLS (by element, H)

Test 1: uops

Code:

  fmls h0, h1, v2.h[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
1004403730061340725100010001000531908401840374037325833895100010003000403740371110011000073116113473100040384038403840384038
1004403730061340725100010001000531908401840374037325833895100010003000403740371110011000073116113473100040384038403840384038
1004403730061340725100010001000531908401840374037325833895100010003000403740371110011000073116113473100040384038403840384038
1004403730061340725100010001000531908401840374037325833895100010003000403740371110011000073116113473100040384038403840384038
1004403730061340725100010001000531908401840374037325833895100010003000403740371110011000073116113473100040384038403840384038
1004403730061340725100010001000531908401840374037325833895100010003000403740371110011000073116113473100040384038403840384038
1004403730084340725100010001000531908401840374037325833895100010003000403740371110011000073116114033100040384038403840384038
1004403730061340725100010001000531908401840374037325833895100010003000403740371110011000373216113473100040384038403840384038
1004403730061340725100010001000531908401840374037325833895100010003000403740371110011000073116113473100040384038403840384038
1004403731061340725100010001000531908401840374037325833895100010003000403740371110011000073116113473100040384038403840384038

Test 2: Latency 1->1

Code:

  fmls h0, h1, v2.h[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | ac | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204400373000014939407251010010010000100100005005706908040018040037400373810833874510100200100002003000040037400371110201100991001001000010000368071004522239479100001004003840038400384003840038
1020440037300006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071003162239479100001004003840038400384003840038
1020440037300006139407251010010010000100100005005706908040018040037400373810833874510100200100002003000040037400371110201100991001001000010000071002162239479100001004003840038400384003840038
1020440037299006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071002162239479100001004003840038400384003840038
1020440037300006139407251010010010000100100005005706908040018040037400373810833874510100200100002003000040037400371110201100991001001000010000071012162239479100001004003840038400384003840038
1020440037299006139407251010010010000100100005005706908040018040037400373810833874510100200100002003000040037400371110201100991001001000010000071012162239479100001004003840038400384003840038
1020440037300016139407251010010010000100100005005706908040018040037400373810833874510100200100002003000040037400371110201100991001001000010000971012162239479100001004003840038400384003840038
1020440037300006139407251010010010000100100005005706908040018040037400373810833874510100200100002003000040037400371110201100991001001000010000071012162239479100001004003840038400384003840038
1020440037300006139407251010010010000100100005005706908040018040037400373810833874510100200100002003000040037400371110201100991001001000010000071013162239479100001004003840038400384003840038
10204400372990022939407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071012162239479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002440037300000000145394072510010101000010100005057069080400184003740037381300338767100102010000203000040037400371110021109101010000100000000006403162239473010000104003840038400384003840038
100244003730000000061394072510010101000010100005057069080400184003740037381300338767100102010000203000040037400371110021109101010000100000000006403162239473010000104003840038400384003840038
1002440037300000000536394072510010101000010100005057069081400184003740037381300338767100102010000203000040037400371110021109101010000100000000006402162239473010000104003840038400384003840038
100244003729900009061394072510010101000010100005057069081400184003740037381300338767100102010000203000040037400371110021109101010000100000000006402162239473010000104003840038400384003840038
100244003729900000061394072510010101000010100005057069081400184003740037381300338767100102010000203000040037400371110021109101010000100000000006402162239473010000104003840038400384003840038
100244003729900000082394074510010101000010100006157097000400184003740037381300338767100102010000203000040037400371110021109101010000100000000006402162239473010000104003840038400384003840038
100244003730000000061394072510010101000010100005057069081400184003740037381300338767100102010000203000040037400371110021109101010000100000203206402162339473010000104003840358400744003840038
100244003730000000061394074510010101000010100005057069080400184003740037381300338767100102010000203000040037400371110021109101010000100000000006402162239473010000104003840038400384003840038
100244003730000000061394072510010101000010100005057069080400184003740037381300338767100102010000203000040037400371110021109101010000100000000006402162239473010000104003840038400384003840038
100244003730000000061394072510010101000010100005057069081400184003740037381300338767100102010000203000040037400371110021109101010000100000000006402162239473010000104003840038400384003840038

Test 3: Latency 1->2

Code:

  fmls h0, h0, v1.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a7 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020440037300044139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
102044003729996139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
1020440037300336139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001002007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
1020440037299027239407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000207101161139479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024400372990000000441394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006405162239473010000104003840038400384003840038
1002440037300000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
1002440037300000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
1002440037300000000082394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000350806402163239473010000104003840038400384003840038
1002440037299000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
1002440037299000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000010006402162239473010000104003840038400384003840038
1002440037300000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006622492339473010000104003840038400384003840038
10024400372990000012061394072510020101000012102965057069080400184003740084381303387871001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
1002440037300000000061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
100244003729900110010861393892510010101000611100005557069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038

Test 4: Latency 1->3

Code:

  fmls h0, h1, v0.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020440037300000000061394072510100100100001001000050057069081400184003740037381083387451010020010000200300004003740037111020110099100100100001000000000071011611394790100001004003840038400384003840038
1020440037300000000061394072510100100100001001000050057069081400184008440037381083387451010020010000200300004003740037111020110099100100100001000000000071011611394790100001004003840038400384003840038
1020440037300000000061394072510100100100001001000050057069080400184003740037381083387451010020010000200300004003740037111020110099100100100001000000000071011611394790100001004003840038400384003840038
1020440037300008811887040726739325204102031431005414611184739571947204001840455404203814344388931137722411321234344224046640417101102011009910010010000100002003212520914288233979734100001004045540420404674046740417
1020440463302019913417041724439335225102011421005414911332743571947204036840464405183814244388881148422911481230300004003740037111020110099100100100001000000000071011611394790100001004003840038400384003840133
10204400372990000000126393332271020314510048152111847365718242040368405154056038140493892911685237116892343527740512405621111020110099100100100001002200037965009331122123990540100001004064640614406104060640609
1020440613304011313171610561101583929927810231163100781591177678757236600404384003740037381083387451010020010000200300004003740037111020110099100100100001000000002087418913398210100001004003840038400384003840086
10204400373000000000130239407441010010010000100100005005706908040018400374003738111133880111471248111492343442840506405601211020110099100100100001000000000071011610394790100001004003840038400384003840038
1020440037300000000061394072510100100100001001000050057069081400184003740037381083387451010020010000200300004003740037111020110099100100100001000000000171011611394790100001004003840038400384003840038
1020440037300000000061394072510100100100001001000050057069081400184003740037381083387451010020010000200300004003740037111020110099100100100001000000000071011611394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100244003730000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
100244003730000061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
100244003730000061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
100244003730000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384008540038
100244003730010061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
100244003730000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
100244003729900061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
100244003730000061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
100244003730000082394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
100244003730000061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fmls h0, h8, v9.h[1]
  movi v1.16b, 0
  fmls h1, h8, v9.h[1]
  movi v2.16b, 0
  fmls h2, h8, v9.h[1]
  movi v3.16b, 0
  fmls h3, h8, v9.h[1]
  movi v4.16b, 0
  fmls h4, h8, v9.h[1]
  movi v5.16b, 0
  fmls h5, h8, v9.h[1]
  movi v6.16b, 0
  fmls h6, h8, v9.h[1]
  movi v7.16b, 0
  fmls h7, h8, v9.h[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 08 | 0b | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | ac | c2 | cf | d0 | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204200891500000040258022110080000100800005006400000020046200652006532380100200800002002400002006520065111602011009910010016000010000101110011611200621600001002006620066200662006620066
160204200651500000040258010010080000100800005006400000520046200652006532380100200800002002400002006520065211602011009910010016000010000101110011611200621600001002006620066200662006620066
160204200651500000040258010010080000100800005006400000520046200652006532380100200800002002400002006520065111602011009910010016000010000101115111611200621600001002006620066200662006620066
160204200651510000040258010010080000100800005006400000020046200652006532380100200800002002400002006520065111602011009910010016000010000101110011611200621600001002006620066200662006620066
160204200651500000040258010010080000100800005006400001520046200652006532380100200800002002400002006520065111602011009910010016000010000101110011611200621600001002006620066200662006620066
160204200651500000040258010010080000100800005006400000020046200652006532380100200800002002400002006520065111602011009910010016000010000101110011611200621600001002006620066200662006620066
160204200651500000040258010010080000100800005606400000020046200652006532380100200800002002400002006520065111602011009910010016000010000101110011611200621600001002006620066200662006620066
1602042006515000000519258010010080000100800005006400000020046200652006532380100200800002002400002006520065111602011009910010016000010000101110011611200621600001002006620066200662006620066
160204200651500000040258010010080000100800005006400001020046200652006532380100200800002002400002006520065111602011009910010016000010000101110011611200621600001002006620066200662006620066
160204200651500000040258010010080000100800005006400000020046200652006532380100200800002002400002006520065111602011009910010016000010000101110011611200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 07 | 08 | 0b | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024200721500000462580012128000012800006264000001200322005120051323800122080000202400002004720051111600211091010160000100001004061233124421131320048230160000102005220052200522005220052
16002420051150000052258001212800001280000626400001120028200472004732380012208000020240000200472004711160021109101016000010010100373120920211102020044215160000102004820048200482004820048
16002420047150000125272580012128000012800006264000011200282004720047323800122080000202400002004720047111600211091010160000100001003631221423211111920044215160000102004820048200482004820048
160024200471500000360258001212800001280000626400001120028200472004732380012208000020240000200472004711160021109101016000010000100313120132021191220044215160000102004820048200482004820048
16002420047150000046258001212800001280000626400001120028200472004732380012208000020240000200472004711160021109101016000010000100373124102021291320044215160000102004820048200482004820052
160024200471500000462580012128000012800006264000011200282004720047323800122080000202400002004720047111600211091010160000100001004031201820211101320044215160000102004820048200482004820048
16002420047150000046258001212800001280000626400001120028200472004732380012208000020240000200472004711160021109101016000010000100353122142021114920044215160000102004820048200482004820048
1600242004715000004625800121280000128000062640000112002820047200473238001220800002024000020047200471116002110910101600001000010032312192021181220044215160000102004820048200482004820048
16002420047150000046258001212800001280000626400001120094200472004732380012208000020240000200472005111160021109101016000010000100403120112021117820044215160000102004820048200482004820048
160024200471500000462580012128000012800006264000011200282004720047323800122080000202400002004720047111600211091010160000100001004031221020211131020044215160000102004820048200482004820048

Test 6: throughput

Count: 12

Code:

  fmls h0, h12, v13.h[1]
  fmls h1, h12, v13.h[1]
  fmls h2, h12, v13.h[1]
  fmls h3, h12, v13.h[1]
  fmls h4, h12, v13.h[1]
  fmls h5, h12, v13.h[1]
  fmls h6, h12, v13.h[1]
  fmls h7, h12, v13.h[1]
  fmls h8, h12, v13.h[1]
  fmls h9, h12, v13.h[1]
  fmls h10, h12, v13.h[1]
  fmls h11, h12, v13.h[1]
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3337

retire (01) | cycle (02) | 03 | 08 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1202044003930000006199612512010010012000010012000050056306400400200400394003924932324997120100200120000200360000400394003911120201100991001001200001000000761061611400301200001004004040040400404004040040
1202044003930000006199612512010010012000010012000050056306401400200400394003924932324997120100200120000200360000400394003911120201100991001001200001000000761011611400301200001004169240040400404004040040
12020440039299000061379662512010010012000010012000050056306400400200400394003924932324997120100200120000200360000400394003911120201100991001001200001000000761011611400301200001004004040040400404004040040
1202044003930000006199612512010010012000010012000050056306400400200400394003924932324997120100200120000200360000400394145011120201100991001001200001000000761013611400301200001004004040040400404004040040
1202044003929900006199612512010010012000010012000050056306400400203416864003924932324997120100200120000200360000400394003911120201100991001001200001000001761011611400301200001004169240040400404004041692
12020441687300000161996125120100100120000100120000500585199304167204168640039249323249971201002001200002003600004003940039111202011009910010012000010005600761011611400301200001004168740040416924004040040
1202044168630000006199612512010010012000010012000050056306400400200400394169126582326649120100200120000200360000416914003911120201100991001001200001000000761011611400301200001004004041687400404168740040
12020440039299000061379662512010310012000110012000050056306400400200400394169126582326649120100200120000200360000400394003911120201100991001001200001000000761011611400301200001004004040040400404004040040
12020440039300000061356892512010110012000310012000050056306400400200400394003926582326649120100200120000200360000410094003911120201100991001001200001000000763311611400301200001004169240040416924004041692
12020441691300000061379662512010110012000110012000050056306400400200400394003924932324997120100200120000200360000416914003911120201100991001001200001000000761011611416831200001004004040040400404004041692

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3337

retire (01) | cycle (02) | 03 | 08 | 18 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | c5 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
12002440039299000006199612512001010120000101200005056306400040020400394003924955032501912001020120000203600004003940039111200211091010120000100000075208001816131140030000120000104004040040400404004040040
120024400392990000025199612512001010120000101200005056306400040020400394003924955032501912001020120000203600004003940039111200211091010120000100000075200002616121440030000120000104004040040400404004040040
12002440039300000006199612512001010120000101200005056306400040020400394003924955032501912001020120000203600004003940039111200211091010120000100000075200001416111440030000120000104004040040400404004040040
12002440039300000006199612512001010120000101200005056306400040020400394003924955032501912001020120000203600004003940039111200211091010120000100000075200001216121240030000120000104004040040400404004040040
12002441691300000006199612512001010120000101200005056306400041667400394003924954032501912001020120000203600004003940039111200211091010120000100100075200001416141340030000120000104004040040400404004040040
12002440039300000006199612512001010120000101200005056306400040020400394003924955732501912001020120000203600004003940039111200211091010120000100000075200001216101240030000120000104004040040400404004040040
12002440039300000006199612512001010120000101200005056306401540020400394003924955032501912001020120000203600004003940039111200211091010120000100000075200001116151140030000120000104004040040400404004040040
12002440039300000006199612512001010120000101200005056306400040020400394003924955032667112001020120000203600004170141691111200211091010120000100000075200001516151440030000120000104004040040400404004040040
12002440039300000006199612512001010120000101200005056306400040020400394003924955032501912001020120000203600004003940039111200211091010120000100000075200001116131040030000120000104004040040400404004040040
12002440039300000006199612512001010120000101200005056306400041667400394003924955032501912001020120000203600004003940039111200211091010120000100000075200001216131240030000120000104004040040400404004040040