Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FMLA (by element, 2S)

Test 1: uops

Code:

  fmla v0.2s, v1.2s, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10044037306134072510001000100053190814018403740373258338951000100030004037403711100110000073216113473100040384038403840384038
10044037306134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
10044037306134072510001000100053190804018403740373258338951000100030004037403711100110000373116113473100040384038403840384038
10044037306134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373016634072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373157334072510001000100053190804018403740373258338951000100030004037403711100110001073116113473100040384038403840384038
10044037306134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040854085403840384038
10044037316134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
10044037316134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
10044037316134072510001000100053190804018403740373258338951000100030004037403711100110001073116113473100040384038403840384038

Test 2: Latency 1->1

Code:

  fmla v0.2s, v1.2s, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102044003730000057061394072510100100100001001000050057069081400184003740037381156387411010020010008200300244003740037111020110099100100100001000000030111717021600394900100001004003840038400384003840038
10204402253000000061394072510100100100001001000050057069081400184003740037381156387411010020010008200300244003740037111020110099100100100001000000000111718001600394890100001004003840038400384003840038
1020440037300000408061394072510100100100001001000050057069081400184003740037381083387451010020010000200300004003740037111020110099100100100001000000000000710121622394790100001004003840038400384003840038
10204400373000000061394072510100100100001001000050057069081400184003740037381083387451010020010000200300004003740037111020110099100100100001000000030000710121622394790100001004003840038400384003840038
10204400373000000061394072510100100100001001000056657069081400184003740037381083387451010020010000200300004003740037111020110099100100100001000000000000710121622394790100001004003840038400384003840038
102044023030000012061394072510100100100001001000050057069081400184003740037381083387451010020010000200300004003740037111020110099100100100001000000000000710121622394790100001004003840038400384003840038
1020440037300000390189394072510100100100001001000050057069081400184003740037381083387451010020010000200300004003740037111020110099100100100001000000000000710121622394790100001004003840038400384003840038
10204400373000006061394072510100100100001001000050057069081400184003740037381083387451010020010000200300004003740037111020110099100100100001004000000000710121622394790100001004003840038400384003840038
10204400373000000061394072510100100100001001000050057124921400184003740037381083387451010020010000200300004003740037111020110099100100100001000000000000710131622394790100001004003840038400384003840038
10204400373000000161394072510100100100001001000061757123101400184003740037381083387451010020010000212300004003740037111020110099100100100001000000000000710131622394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024400372990006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006403163339473010000104003840038400384003840038
10024400373000036139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006403163339473010000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006403163339473010000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006403163339473010000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006403163339473010000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006403163339473010000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006403163339473010000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006403163339473010000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038

Test 3: Latency 1->2

Code:

  fmla v0.2s, v0.2s, v1.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102044003729900000006139407251010010010000100100005005706908040018040037400373810833874510100200100002003000040037400371110201100991001001000010000000000071011611394790100001004003840038400384003840038
10204400372990000000613940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001000000201620071011611394790100001004003840038400384003840038
102044003729900000006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000003000071011611394790100001004003840038400384003840038
102044003730000000006139407251010010010000100100005225706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000000071011611394790100001004003840038400384003840038
102044003730000000006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000000071011601394790100001004003840038400384003840038
102044003730000000006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000000071011611394790100001004003840038400384003840038
1020440037300000000061394072510100100100001001000050057069081400180400374003738108253874510100200100002003000040037400371110201100991001001000010000000000071011611394790100001004003840038400384003840038
102044003730000000006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000000071011611394790100001004003840038400384003840038
1020440037300000030006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000000071011611394790100001004003840038400384003840038
102044003729900000006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000000000071011611394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100244003730000000000613940725100101010000101000050570690804001804003740037381303387671001020100002030000400374003711100211091010100001000000000006402162239473010000104003840038400384003840038
100244003729900000000613940725100101010000101000050570690804001804003740037381303387671001020100002030000400374003711100211091010100001000000600006402162239473010000104003840038400384003840038
1002440037311000001200893940725100101010000101000050570690804001804003740037381303387671001020100002030000402274032211100211091010100001000000300006402162239579010000104003840038400384003840038
1002440037310000000002763940725100101010000101000050570690814001804003740037381353387871001020100002030000400374003711100211091010100001000000000006402162239473010000104003840038400384003840038
10024400373000000000010673940725100201010000101014850570690804001804003740037381303387671001020100002030489401324003711100211091010100001004000000006402242239473010000104003840086400384003840038
1002440037310000000001603940725100101210000101029650570690804001804003740037381303387671001020100002030000400374003711100211091010100001000000000016402813339956410000104060640605405614064140606
100244059430400012121728123206507393082381008618100661711332715719472040263040084403213813028388031001020100002030000400374008411100221091010100001000210370006403162239473010000104003840038400384027440038
100244003731100002210090739407251001710100181010888505709700040088040037400373813025387671016020100002030489403234003711100211091010100001020000000006402162239691010000104008540038401784003840083
1002540037322010000002033938925100101010000101000060570970004006504003740037381303387861016022100002030000400374003711100211091010100001000000041006402161239473010000104017940038400384003840038
10024400372990100000010823940725100161010000101000050570690804001804003740037381583387671001020101632030972400374003711100211091010100001000000000006402162339473010000104003840038400384003840086

Test 4: Latency 1->3

Code:

  fmla v0.2s, v1.2s, v0.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204400373210000000061393982510100100100001001000050057069080400180400374003738108033876210100200100002003050140131400371110201100991001001000010000000060000710116113947911100001004003840038400384008640180
102044008431100013849006139407251010010010018100100005005706908040018040037400843810803387451010020010497200300004008440179111020110099100100100001000000003000071021612394790100001004003840038400384003840038
10204400373160000015006139407831010010010000100100005005706908040018040037400373810803387451055720010000200300004003740037111020110099100100100001000020003000071011611394790100001004008640038400384003840038
1020540037299000000880137039407251010010010024100100006245706908040018040037400373810803387451056520410000200300004003740037111020110099100100100001000000300000078111611395150100001004003840038400384003840087
10204400373180011337500915394078210100100100001001000050057069080400180400374003738108033874510100200100002003000040037402271110201100991001001000010020200010721000077711611395130100001004003840038400384008540132
1020540037300000000001281394072510100100100001111029650057069081400180400844003738108033874510100204100002003000040037400371110201100991001001000010000302010679200071014011394792100001004003840038400384017240179
102044003730000013390018939407251010010010000100100005005706908040018040037400373811273387641025120010000200300004003740037111020110099100100100001000000000000071011711394790100001004003840038401814003840181
10204400373000000300061394072510131100100001031000050057109820400180400374003738108033874510100200100002003148240037400371110201100991001001000010000000494000710116113947916100001004003840038400384021740121
10204400373130000000061394072510100100100001001044450057069080400880400874003738108033874510100210100002003000040037400371110201100991001001000010000002012000071011611395870100001004003840038400384003840086
1020440037311000008670061394071011010010010000102104445005708304040018040037400373810803387451026620010000200300004018040037111020110099100100100001000000120001071011611395853100001004003840038400384021540085

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100244003730010000000182394074510018101001210100005057110961400180401784003738130033882510010201000022300004003740037111002110910101000010200010000064002162239473010000104003840038400384003840038
1002440037315000000000395339407251001010100061110000505710800040018040037400843813003387671001020100002030000400374008541100211091010100001000000016200064002162639473010000104003840038400384008540038
1002440037314000000001448394074410010101000012101475057069081400650400374017938130033876710010201000020305164003740037111002110910101000010000032020064002492239473010000104003840038402274003840038
1002440037314000000000293394078110010101000014100005057069081400530400834003738130033876710310201000020309604017840037111002110910101000010002000000064002162239473010000104003840085400384003840227
1002440037311010001153880124239407251001010100181010000505706908040018040070400373813002638767100102010000203000040037400844110021109101010000100000301800064002162239547110000104003840038400384003840038
10024400373010000300003491394078410010101000010100005057069081400180401334003738130073882310010201000022300004003740037111002110910101000010000002000064002162239602010000104003840038400384003840038
1002440037311000000132264061394072510010101000610100005057069080400180400374003738147033876710010201000020300004008540133111002110910101000010010000000064002422239473010000104003840038400384003840086
1002440178317000001182640613940725100101010012101000050570690804005304008440084381520338767100102010481203048640037402271110021109101010000104020001069000064002162239473010000104003840038400384003840038
1002440037316000000000789394072510010101000010100005057069080400180400374003738130033876710010201000020300004003740037111002110910101000010000040000064002162339473010000104003840038400384003840038
1002440037300000000000103394072510010101000010100005057069081400180400374003738130033876710010201000020300004003740037111002110910101000010000010000064002162239473010000104003840038400384003840085

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fmla v0.2s, v8.2s, v9.s[1]
  movi v1.16b, 0
  fmla v1.2s, v8.2s, v9.s[1]
  movi v2.16b, 0
  fmla v2.2s, v8.2s, v9.s[1]
  movi v3.16b, 0
  fmla v3.2s, v8.2s, v9.s[1]
  movi v4.16b, 0
  fmla v4.2s, v8.2s, v9.s[1]
  movi v5.16b, 0
  fmla v5.2s, v8.2s, v9.s[1]
  movi v6.16b, 0
  fmla v6.2s, v8.2s, v9.s[1]
  movi v7.16b, 0
  fmla v7.2s, v8.2s, v9.s[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6b | 6d | 6e | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602042007415000000001760166258010010080000100801105006400000020046020065200650391801002008000020024000020065200652116020110099100100160000100000000123000010138316232006201600001002023220066200662006620066
16020420065150000000132264030267801001008000010080000500640000002004602006520065032380448200801122002400002006520065111602011009910010016000010000201060000101133163420062191600001002006620066200662006620066
160204200651500000000004025801001008000010080000500640896002004602006520065039080236200800002002400002006520065111602011009910010016000010000000015000010114316332006201600001002006620066200662006620066
16020420065150000001000402580100100800001008000050064086400200460202292006503238010020080000200240000200652006511160201100991001001600001000000000000010113316232006201600001002006620066200662006620066
1602042006515000000000010525801001008000010080000500640000002004602006520065035180247200803302002400002006520065111602011009910010016000010000000002000101133493320062171600001002006620066200662006620066
16020420065156000000000612580100100800001008000050064000000200460200652006503238010020080000200240000200652006511160201100991001001600001000000000000010112316332006201600001002006620066200662006620066
16020420065150010000450072814880100100800001008000050064000000200460200652006503238010020080000200240000200652006511160201100991001001600001000020000000010113316422006201600001002006620066200662006620066
1602042006515710000000058325801001008000010080000500640000002011402006520065027238010020080000200240000200652006511160201100991001001600001000000000010010113416252006201600001002007820066200662006620066
160204200651500000000004192580100100800001008000050064000000200460200652006503238010020080000200240000200652006511160201100991001001600001000000000002010113316332026301600001002006620066200662006620066
160204200651560000010006902580100100800001008000050064000000201140200652006503238010020080000200240000200652006521160201100991001001600001000000000000010112316322006201600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024200741570010001000462780012128000012800006264000011520033200522005203238001220800002024000020052202131116002110910101600001000000000010036811172562213142004922001160000102005320053200532005320053
1600242005215000000000004627800121280000128000062640000015200332006120052032380012208000020240000200522019611160021109101016000010000000000100381142143461112122004924001160000102005320053200532005320062
160024201191500000000120067298001212800001280000626400000152004220061200610323800122080000202400002006120259111600211091010160000100000030011003985213348311382005822002160000102006220062200622005320062
16002420061150020000000063127800121280000128000062640000115200332013020061032380012208000020240000200522028611160021109101016000010000000000100821152136782213122019434002160000102005320146203062006220053
16002420061150001000030023129800121280000128000062640000015200422006120061032380012208000020240000200612026311160021109101016000010000000000100381152834614214142025224001160000102006220053200622006220053
160024200611500000000000506278001212800001280000626400001152004220052200610323800122080000202400002005220213111600211091010160000100000000001003511228348221392004924002160000102005320062200532006220062
16002420061157000000008803522980012128000012800006264000011520033203032006103238001220800002024000020061202711116002110910101600001000000021010034651163482217102005822001160000102006220053200532006220062
1600242006115600000000001782980012128000012800006264000001520042200612006103238001220800002024000020061205161116002110910101600001000000600010035115114348331592005824002160000102014320228200532012120053
16002420061156000000000046298001212800001280000626408720152004220061200520323800122080000202400002008720061111600211091010160000100001000001003711521417842214122005824002160000102006220062201442031220062
160024200611500000000000522980012128000012800006264000001520042200522006103238001220800002024000020174203092116002110910101600001000010000010038115292244221492005822002160000102006220062200622006220137

Test 6: throughput

Count: 12

Code:

  fmla v0.2s, v12.2s, v13.s[1]
  fmla v1.2s, v12.2s, v13.s[1]
  fmla v2.2s, v12.2s, v13.s[1]
  fmla v3.2s, v12.2s, v13.s[1]
  fmla v4.2s, v12.2s, v13.s[1]
  fmla v5.2s, v12.2s, v13.s[1]
  fmla v6.2s, v12.2s, v13.s[1]
  fmla v7.2s, v12.2s, v13.s[1]
  fmla v8.2s, v12.2s, v13.s[1]
  fmla v9.2s, v12.2s, v13.s[1]
  fmla v10.2s, v12.2s, v13.s[1]
  fmla v11.2s, v12.2s, v13.s[1]
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3438

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
120204408863150000020000061379665012010310012000312012020850056306400041672040039416862657703249971201002001200002003606004128441691111202011009910010012000010000000018000763300216224003001200001004004040040416924004041687
12020441686312000000000006114032251201031001200011001200005005851993004002004003941120249320326649120100200120000200360672400394168611120201100991001001200001000000903000761000225224003001200001004169240040416874004041692
120204401093000000000294000619961251201031001200031001200005005849059004170504003941734249320326649120100200120402200360000400394003911120201100991001001200001002000400000763300116124170901200001004004041687400404247140040
1202044008930000110001101000949961251201001001200001001200005005819910104002004168640039249327326649120100200120000200360696400394169111120201100991001001200001000000000020761000217224003001200001004044541692416874004040040
1202044003932600000009300061252002512010010012000010712019550058519930040020040039400392572402425002120100200120000200360000416864015421120201100991001001200001000020200000763300216224040201200001004170340040400404169241687
1202044168631400000013510306135689251201031001200011001200005005630640004002034168640039265790324997120100200120000200360000400394168611120202100991001001200001000000400691000761000216224003001200001004004041692400934116441687
1202044168630000100001268830613796625120103100120003100120000500585186900416720400394169125498032664512010020012000020036000040039416911112020110099100100120000100002000680000761000217224003001200001004004040090424714168742471
12020441719311000100031800032435689251201031001200031001200005005851869004002004003941691249320324997120100200120000200360000416864003921120201100991001001200001000020000001761000216224003001200001004168740040416924004041687
120205400393001000000144030391356892512019410012005410012020350058468890041702341686400392493203266491203082021200002003600004168641006111202011009910010012000010000003707060007633002162241709101200001004004040040400404004040040
120204400393230000000120000410356894712020310012000011612000050056279491041672040039416862493103249971201002021200002003600004172141461111202011009910010012000010002004609000765700216224010801200001004168740040417034004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3337

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | c9 | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
120024400393261324000779996186120010101200001012000066563064011400200400394003926583032503912001020120000203600004003940039111200211091010120000100226000075223116164216640030040105120000104004040040417754109440094
1200244003930010128800733159846120010101200801012000055584316501400200400394003924955032501912001020120000203600004003940039511200211091010120000100200000075933124164224640552140100120000104004040040400404004040040
120024400393240000007399618712001010120000101200005556306401140570040226402602495503252001200102012020020360000404514133711120021109101012000010200690030075476224162112440030020200120000104004040040400404004040282
1200244003930800035230114715867251202431012000010120196505643833214002004003940039249550102501912001020120000203605854118241686111200211091010120000100000000075223212162114240030020105120000104004040040400404004040040
12002440039322000000288996125120010101200011012000050563064011400200400394003924955032501912001020120000203600004003940039111200211091010120000100600000075223116162113440030020105120000104004040040400404004040040
120024400393180090002679961251200691012000010120200505630640214002004003940039266040325019120010201200002036000040039400391112002210910101200001004600000075223112162114240030020105120000104004040040400404004040040
12002440039311003000067996125120011101200001012000050563064011400200400394003924955032501912001020120000203600004003940039111200211091010120000100000000075223224162112440030020105120000104004040040400404004040040
1200244003931000000067996125120010101200001012000050563064011400700400394009024955032501912001020120000203600004003940039111200211091010120000100000000175223124162122440030020105120000104004040040400404004040040
120024400393100066000732996125120010101200001012000050562694211400200400394003924955032501912021020120000203600004003940039111200211091010120000100000000075223133162114240030040105120000104004040040400404004040040
120024400393100036001395996125120010101200001012000060563064011400710400394003924955032501912001020120000203600004003940039111200211091010120000100000000075223122162114440030020105120000104004040040400404004040040