Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FMLA (by element, 4H)

Test 1: uops

Code:

  fmla v0.4h, v1.4h, v2.h[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 09 | 0b | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10044037310006134072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373000032534072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373100020334072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373000032834072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373100032134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730002134034072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373000061340725100010001000531908040184037403732583389510001000300040374037111001100011073116113473100040384038403840384038
10044037300006134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373000061340725100010001000531908040184037403732583389510001000300040374037111001100015073116113473100040384038403840384038
10044037300006134072510001000100053190814018403740373258338951000100030004037403711100110001073116113473100040384038403840384038

Test 2: Latency 1->1

Code:

  fmla v0.4h, v1.4h, v2.h[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 0b | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | a9 | ac | branch mispredict (cb) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020440037300000006139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000102430710121622394790100001004003840038400384003840038
10204400372990000061394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000000710121622394790100001004003840038400384008340038
10204400373000000082394072510100100100001001000050057069084001840037400373810833874510100200100002003000040230400371110201100991001001000010000000710121622394790100001004003840038400384003840038
10204400373000000061394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000000710121622394790100001004003840038400384003840038
102044003730000000726394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000000710121622394790100001004003840038400384003840038
10204400373000000061394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400372110201100991001001000010000000710121622394790100001004003840038400384003840038
1020440037299000001033940725101001001000010010148500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100000007121216223947920100001004003840038400384003840038
102044003730000000536394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000000710121622394790100001004003840038400384003840038
10204400373000000061394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000001710121622394790100001004003840038400384003840038
10204400372990000061393892510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000000710121622394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100244003730000000008239407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000006406162239473010000104003840038400384003840038
100244003730000000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840038
10024400373000000000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000014706403162239473010000104003840038400384003840038
100244003730000000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000306403162239473010000104003840038400384003840038
100244003729910000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840038
100244003729900000006639407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000006403162239473010000104003840038400384003840038
100244003730000000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840038
100244003729900000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840038
100244003730000000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840038
100244003730000000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000006402162339473010000104003840038400384003840038

Test 3: Latency 1->2

Code:

  fmla v0.4h, v0.4h, v1.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 18 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020440037300081061394072510100100100001001000050057069081400180400374003738108338745101002001000020030000400374003711102011009910010010000100030071021622394790100001004003840038400384003840038
102044003730000061394072510100100100001001000050057069081400180400374003738108338745101002001000020030000400374003711102011009910010010000100000071021622394790100001004003840038400384003840038
102044003730000061394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100000071021622394790100001004003840038400384003840038
1020440037300042061394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100000071021622394790100001004003840038400384003840038
1020440037300012061394072510100100100001001000050057069081400180400374003738108338745101002001000020030000400374003711102011009910010010000100000071021622394790100001004003840038400384003840038
10204400373000243061394072510100100100001001000050057069081400180400374003738108338745101002001000020030000400374003711102011009910010010000100000071021622394790100001004003840038400384003840038
102044003730000061394072510100100100001001000050057069081400180400374003738108338745101002001000020030000400374003711102011009910010010000100000071021622394790100001004003840038400384003840038
102044003730000061394072510100100100001001000050057069081400180400374003738108338745101002001000020030000400374003711102011009910010010000100000071021622394790100001004003840038400384003840038
102044003731100061394072510100100100001001000050057069081400180400374003738108338745101002001000020030000400374003711102011009910010010000100000071021622394790100001004003840038400384003840038
102044003729900061394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100000071021622394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 0b | 18 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024400373000087061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000640316333947310000104003840038400384003840038
10024400373000042076394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000640316333947310000104003840038400384003840038
10024400373000033061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000640316333947310000104003840038400384003840038
100244003730000117061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000640316333947310000104003840038400384003840038
100244003730000471061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000640316333947310000104003840038400384003840038
10024400373000087061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000640316333947310000104003840038400384003840038
10024400373000042061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000640316333947310000104003840038400384003840038
10024400373000036061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000640316333947310000104003840038400384003840038
1002440037300006061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000640316333947310000104003840038400384003840038
100244003729900252061394072510010101000010100005057069080400654003740037381303387671001020100002030000400374003711100211091010100001000640316333947310000104003840038400384003840038

Test 4: Latency 1->3

Code:

  fmla v0.4h, v1.4h, v0.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a7 | a8 | a9 | ac | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102044003730000294006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000071011611394790100001004003840038400384003840038
102044003730000270015639407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000071011611394790100001004003840038400384003840038
1020440037300003913206139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000071011611394790100001004003840038400384003840038
10204400373000036006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000071011611394790100001004003840038400384008640038
10204400373000024006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000071011611394790100001004003840038400384003840038
10204400373000039007639407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000071011611394790100001004003840038400384003840038
10204400372990030006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000900071011611394790100001004003840038400384003840038
1020440037300006006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000071011611394790100001004003840038400384003840038
10204400373000030006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000071011611394790100001004003840038400384003840038
1020440037300003450069763932620610196151100481451118477257193961403330403584046838138413883611477226114942143448840471404671011020110099100100100001004012285480908189123979736100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024400373000726394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
1002440037300061394072510010101000011100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
1002440037299061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000640216223947310000104003840038400384003840038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fmla v0.4h, v8.4h, v9.h[1]
  movi v1.16b, 0
  fmla v1.4h, v8.4h, v9.h[1]
  movi v2.16b, 0
  fmla v2.4h, v8.4h, v9.h[1]
  movi v3.16b, 0
  fmla v3.4h, v8.4h, v9.h[1]
  movi v4.16b, 0
  fmla v4.4h, v8.4h, v9.h[1]
  movi v5.16b, 0
  fmla v5.4h, v8.4h, v9.h[1]
  movi v6.16b, 0
  fmla v6.4h, v8.4h, v9.h[1]
  movi v7.16b, 0
  fmla v7.4h, v8.4h, v9.h[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204200891501540258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100010111316112006201600001002006620066200662006620066
160204200651500515258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100010111116112006201600001002006620066200662006620066
160204200651500515258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100010111116112006211600001002006620066200662006620066
16020420065150040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100010111116112006201600001002006620066200662006620066
16020420065150640258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100010111116112006201600001002006620066200662006620066
16020420065150040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100010111116112006201600001002006620066200662006620066
16020420065150040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100010111116112006201600001002006620066200662006620066
16020420065151040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100010111016212006201600001002006620066200662006620066
16020420065150040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100010111116112006201600001002006620066200662006620066
16020420065151040258010010080000100800005006400001200462006520065323801002008000020024000020065200651116020110099100100160000100010111116102006201600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2507

retire (01) | cycle (02) | 03 | 07 | 0a | 1e | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002420061151070046258001212800001280000626400003120034200532005332380012208000020240000200492004911160021109101016000010000100646224222111333720046231160000102005020050200502005020050
16002420049150020058258001212800001280000626400003120030200492004932380012208000020240000200492004911160021109101016000010000100613113422111362420046216160000102005020050200502005020050
1600242004915014381058258001212800001280000626400000120030200492005332380012208000020240000200492004911160021109101016000010000100583113622111383920046216160000102005020050200502005020050
16002420049150040052258001212800001280000626400003120030200492004932380012208000020240000200492004911160021109101016000010000100623113822111393920046216160000102005020050200502005020050
16002420049150340152258001212800001280000626400000120030200492004932380012208000020240000200492004911160021109101016000010200100633113822111393820046216160000102005020050200502005020050
160024200491513515058258001212800001280000626400003120030200492004932380012208000020240000200492004911160021109101016000010000100613113922111273920046216160000102005020050200502005020050
16002420049150250070258001212800001280000626400000120030200492004932380012208000020240000200532004911160021109101016000010000100493114022111404020046216160000102005020050200542005020050
16002420049150340170258001212800001280000626400000120030200492004932380012208000020240000200492004911160021109101016000010000100603113822111254020046216160000102005020050200502005020050
16002420049150260170258001212800001280000626400003120030200492004932380012208000020240000200492004911160021109101016000010000100633112722111412820046216160000102005020050200502005020050
16002420049151140164258001212800001280000626400000120030200492004932380012208000020240000200492004911160021109101016000010000100633113822111243620046216160000102005020050200502005020050

Test 6: throughput

Count: 12

Code:

  fmla v0.4h, v12.4h, v13.h[1]
  fmla v1.4h, v12.4h, v13.h[1]
  fmla v2.4h, v12.4h, v13.h[1]
  fmla v3.4h, v12.4h, v13.h[1]
  fmla v4.4h, v12.4h, v13.h[1]
  fmla v5.4h, v12.4h, v13.h[1]
  fmla v6.4h, v12.4h, v13.h[1]
  fmla v7.4h, v12.4h, v13.h[1]
  fmla v8.4h, v12.4h, v13.h[1]
  fmla v9.4h, v12.4h, v13.h[1]
  fmla v10.4h, v12.4h, v13.h[1]
  fmla v11.4h, v12.4h, v13.h[1]
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3474

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1202044003930000059704613568925120103100120003100120000500563064014002040039416912657732499712010020012000020036000040039416911112020110099100100120000100002200761011611400301200001004004041687400404169240040
1202044003931000000361996125120100100120000100120000551585199314002040039416862657932664412010020012000020036000041686400391112020110099100100120000100000000761011611416771200001004004040040400404004041687
12020441686299000003613568925120100100120000100120000500585199314002040190401302657732664912010020012000020036000041688416861112020110099100100120000100000000761011611400301200001004168740040416874004041687
12020441686312000000613568925120100100120000100120000500585186914002040039416862493232499712010020012000020036000040039416861112020110099100100120000100200000761011611424611200001004168740040416874004041687
12020440039299000000613796625120101100120003100120000500585199314002040039416862657732664412010020012000020036000041686400391112020110099100100120000100000000761011611400301200001004004041687400404004040040
12020441691300000000613568925120101100120003100120000500563064014167241691400392493232499712010020012000020036000040039416911112020110099100100120000100000060761011611400301200001004168740040416874004041687
1202044168830100400361996125120100100120000100120000500585199314002040039416862658232664912010020012000020036000041686400391112020110099100100120000100000000761011611400301200001004168740040416874004041687
1202054170230000000161996125120100100120000100120000500585199314166741686400392493232499712010020012000020036000040039416861112020110099100100120000100000000761011611400301200001004169240040416874004040040
1202044168629900000061996125120100100120000100120000500585199314002040039416862657732664412010020012000020036000041686400391112020110099100100120000100000000761011611416831200001004169240040416874004041687
12020441691299000001613796625120103100120001100120000500563064014167241686400392493232499712010020012000020036000040039416861112020110099100100120000100000000761011611400301200001004169240040416924004041687

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3337

retire (01) | cycle (02) | 03 | 0a | 0b | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | a9 | ac | c2 | c5 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
12002441691300100010999612512001010120000101200005056306402104002040039400392495532501912001020120000203600004003940039111200211091010120000100090900752284117162111654003020105120000104004040040400404004040040
12002440039300000067996125120010101200001012000050563064021540020400394003924955327450120010201200002036000040039424701112002110910101200001000001800752285116162111664003020105120000104004040040400404004040040
12002440039300010016299612512001010120000101200005056306402154002040039400392495532501912001020120000203600004003940039111200211091010120000100000000752285113162111674003020105120000104004040040400404004040040
120024416863120033264679961251200101012000010120000505630640215400204003940039249553250191200102012000020360000400394003911120021109101012000010000000075228515162111674167820105120000104004040040400404004040040
12002440039300000019599612512001010120000101200005056306402154002040039400392495532501912001020120000203600004003940039111200211091010120000100000000752285116162217164003020105120000104004040040400404004040040
12002440039300000067996125120010101200001012000050585186911540020400394003924955325019120010201200002036000040039400391112002110910101200001000010007522851161621116164003020105120000104004040040400404004040040
1200244003929900006799612512001010120000101200005056306402154002040039400392495532501912001020120000203600004003940039111200211091010120000100000000752285116162111674003020105120000104004040040400404004040040
12002440039300000067996125120010101200001012000050563064011540020400394003924955325019120010201200002036000040039400391112002110910101200001000000007522851164121117174003020105120000104004040040400404004040040
1200244003930000210679961251200101012000010120000505630640115400204003940039249553250191200102012000020360000400394003911120021109101012000010000000075228515162118174003020105120000104004040040416964004040040
120024400392990012067996125120010101200011012000050563064001540020400394003924955325019120010201200002036000040039400391112002110910101200001000002700752285151621117174003020105120000104004040040400404004040040