Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

BIF (vector, 8B)

Test 1: uops

Code:

  bif v0.8b, v1.8b, v2.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | df | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004203715003721687251000100010002646801201820372037157231895100010003000203720371110011000007941161131787100020382038203820382038
1004203715103721687251000100010002646801201820372037157231895100010003000203720371110011000007941161131787100020382038203820382038
1004203715003721687251000100010002646801201820372037157231895100010003000203720371110011000007941161131787100020382038203820382038
1004203715003721687251000100010002646801201820372037157231895100010003000203720371110011000707941161131787100020382038203820382038
1004203715003721687251000100010002646801201820372037157231895100010003000203720371110011000007941161131787100020382038203820382038
1004203715093721687251000100010002646801201820372037157231895100010003000203720371110011000007941161131787100020382038203820382038
10042037160937216872510001000100026468002018203720371572318951000100030002037203711100110001307941161131787100020382038203820382038
1004203715003721687251000100010002646800201820372037157231895100010003000203720371110011000037941161131787100020382038203820382038
1004203716003721687251000100010002646801201820372037157231895100010003000203720371110011000007941161131787100020382038203820382038
1004203715003721687251000100010002646801201820372037157231895100010003000203720371110011000007941161131787100020382038203820382038

Test 2: Latency 1->1

Code:

  bif v0.8b, v1.8b, v2.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020420037150000061196872510100100100001001000050028476802001820037200371842203187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
1020420037150100061196872510100100100001001000050028476802001820037200371842203187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
1020420037150000061196872510100100100001001000050028476802001820037200371842203187451010020010000200300002003720037111020110099100100100001000071212162219791100001002003820038200382003820038
10204200371500000346196872510100100100001001000050028476802001820037200371842203187451010020010000200300002003720037111020110099100100100001001071010162219791100001002003820038200382003820038
1020420037150000084196872510100100100001001000050028476802001820037200371842203187451010020010332200300002003720037111020110099100100100001000071212162219791100001002003820038200382003820038
1020420037150000061196872510100100100001001000050028476802001820037200371842203187451010020010000200300002003720037111020110099100100100001000071213162219791100001002003820038200382003820038
1020420037150000061196872510100100100001001000050028476802001820037200371842203187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
1020420037150000161196872510100100100001001000050028476802001820037200371842203187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
1020420037150000084196872510100100100001001000050028476802001820037200371842203187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
1020420037150000061196872510100100100001001000050028476802001820037200371842203187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002420037150100000124196872510010101000010100005028476802001820037200371844431876710010201000020300002003720037111002110910101000010000000006623163319785010000102003820038200382003820038
100242003715000000061196872510010101000010100005028476802001820037200371844431876710010201000020300002003720037111002110910101000010000000006403163319785010000102003820038200382003820038
100242003715000000082196872510010101000010100005028476802001820037200371844431876710010201000020300002003720037111002110910101000010000000006403163319785010000102003820038200382003820038
100242003715000000061196872510010101000010100005028476802001820037200371844431876710010201000020300002003720037111002110910101000010000000006403163319785010000102003820038200382003820038
100242003715000000061196872510010101000010100005028476802001820037200371844431876710010201000020300002003720037111002110910101000010000000006404163319785010000102003820038200382003820038
100242003715000000061196872510010101000010100005028476802001820037200371844431876710010201000020300002003720037111002110910101000010000000006403163319785010000102003820038200382003820038
100242003715000000061196872510010101000010100005028476802001820037200371844431876710010201000020300002003720037111002110910101000010000000006403163319785010000102003820038200382003820038
100242003715000000061196872510010101000010100005028476802001820037200371844431876710010201000020300002003720037111002110910101000010000000006403163319853010000102003820038200382003820038
100242003715000000061196872510010101000010100005028476802001820037200371844431876710010201000020300002003720037111002110910101000010000000006403163319785010000102003820038200382003820038
1002420037149000000251196872510010101000010100005028476802001820037200371844431876710010201000020300002003720037111002110910101000010000000006403163319785010000102003820038200382003820038

Test 3: Latency 1->2

Code:

  bif v0.8b, v0.8b, v1.8b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715000611968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
1020420037150006551968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768012001820037200371842231874510100200100002003000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
1020420037150007261968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
102042003715010611968725101001001000010010000500284768002001820037200841842231874510100200100002003000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010007101161119791100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010007101161119791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500001506119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
1002420037150000906119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371500006061196872510010101000010100005028476802001820037200371844411187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820084
1002420037150000006119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
100242003715000036606119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
1002420037150000411015619687251001010100001010000502847680200182003720037184443187671001020100002030000200372003721100211091010100001000000006402162219785010000102003820038200382003820038
1002420037150000606119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
1002420037150000006119687251001010100001010000502847680200182003720037184443188051001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382008520038
1002420037150000006119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
1002420037150000606119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038

Test 4: Latency 1->3

Code:

  bif v0.8b, v1.8b, v0.8b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371500101726119687251010010010000100100005002847680020018200862003718422318745101002001000020030000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
10204200371500000696119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100000371011611197910100001002003820038200382003820038
1020420037150000096119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
10204200371500000366119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100000071042511197910100001002003820038200382003820038
102042003715000301263819687251010010010048100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
10204200371500000156119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
10204200371500000156119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
1020420037150000066119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038
1020420037150010006119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372013211102011009910010010000100000071011611197910100001002003820038200382003820038
10204200371500000156119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100000071011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371502461196872510010101000010100005028476801200182003720037184440318767100102010000203000020037200371110021109101010000100640416441978510000102003820038200382003820038
1002420037149061196872510010101000010100005028476801200182003720037184440318767100102010000203000020037200371110021109101010000100640416431978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476801200182003720037184440318767100102010000203000020037200371110021109101010000100640416431978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476801200182003720037184440318767100102010000203000020037200371110021109101010000100640416341978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768012001820037200371844401518767100102010000203000020037200371110021109101010000100640416441978510000102003820038200382003820038
10024200371507861196872510010101000010100005028476801200182003720037184440318767100102010000203000020037200371110021109101010000100640316431978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476801200182003720037184440318767100102010000203000020037200371110021109101010000100640416441978510000102008520038200382003820038
1002420037150061196872510010101000010100005028476801200182003720037184440318767100102010000203000020037200371110021109101010000100640316341978510000102003820038200382003820038
10024200371501561196872510010101000010100005028476801200182003720037184440318767100102010000203000020037200371110021109101010000100640316341978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476801200182003720037184440318767100102010000203000020037200371110021109101010000100640316431978510000102003820038200382003820038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  bif v0.8b, v8.8b, v9.8b
  movi v1.16b, 0
  bif v1.8b, v8.8b, v9.8b
  movi v2.16b, 0
  bif v2.8b, v8.8b, v9.8b
  movi v3.16b, 0
  bif v3.8b, v8.8b, v9.8b
  movi v4.16b, 0
  bif v4.8b, v8.8b, v9.8b
  movi v5.16b, 0
  bif v5.8b, v8.8b, v9.8b
  movi v6.16b, 0
  bif v6.8b, v8.8b, v9.8b
  movi v7.16b, 0
  bif v7.8b, v8.8b, v9.8b
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020420088151513825801001008000010080000500640000200442006320063321801002008000020024000020063200631116020110099100100160000100001011211612200601600001002006420064200642006420064
1602042006315033825801001008000010080000500640000200442006320063321801002008000020024000020063200631116020110099100100160000100001011111621200601600001002006420064200642006420064
1602042006315003825801001258000010080000500640000200442006320063321801002008000020024000020063200631116020110099100100160000100001011121611200601600001002006420064200642006420064
16020420063150363825801001008000010080000500640000200442006320063321801002008000020024000020063200631116020110099100100160000100001011131612200601600001002006420064200642006420064
160204200631503622825801001008000010080000500640000200442006320063321801002008000020024000020063200631116020110099100100160000100001011111622200601600001002006420064200642006420064
16020420063150363825801001008000010080000500640000200442006320063321801002008000020024000020063200631116020110099100100160000100001011111613200601600001002006420064200642006420064
16020420063150153825801001008000010080000500640000200442006320063321801002008000020024000020063200631116020110099100100160000100001014511622200601600001002006420064200642006420064
16020420063150363825801001008000010080000500640000200442006320063321801002008000020024000020063200631116020110099100100160000100001011311623200601600001002006420064200642006420064
16020420063150363825801001008000010080000500640000200442006320063321801002008000020024000020063200631116020110099100100160000100001011121621200601600001002006420064200642006420064
16020420063150363825801001008000010080000500640000200442006320063321801002008000020024000020063200631116020110099100100160000100001011111612200601600001002006420064200642006420064

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002420056150000018004425800121280000128000062640000112002620045200453218001220801002024000020045200451116002110910101600001000000100263112202112320042215160000102004620046200462004620046
16002420045151000000015763801131380101128000062640000012009320287200493218001220800002024000020049200491116002110910101600001000000100286222244223320046230160000102015020149200502005020050
1600242004915000000005025800121280000128000062640000012003020049200493218001220800002024000020049200491116002110910101600001000000100266223204223220042230160000102005020046200502004620050
1600242004915000000005025800121280000128000062640000012002620049200453218001220800002024000020049200491116002110910101600001000000100266222244222320046230160000102004620050200502004620050
1600242004515000000005025800121280000128000062640000012003020049200493218001220800002024000020049200491116002110910101600001000000100296223244213420046230160000102005020050200502005020050
1600242004915000000005025800121280000128000062640000012002620049200493218001220800002024000020049200451116002110910101600001000000100286223244213320046230160000102005020050200502004620050
16002420049150000012005025800121280000128000062640000112003020049200493218001220800002024000020049200451116002110910101600001000000100263223204213320046230160000102005020046200462005020046
1600242004915000000004425800121280000128000062640000012003020049200493218001220800002024000020049200491116002110910101600001000003100296222244222320046230160000102005020050200502005020050
16002420049150000012005025800121280000128000062640000012003020049200453218001220800002024000020049200451116002110910101600001000103100286123244223220046230160000102005020046200502005020050
1600242013815000101800922580012128000012800006264080011200262004920049321801122080000202400002004920045211600211091010160000104010488100273114202113320042215160000102004620046200462004620046

Test 6: throughput

Count: 16

Code:

  bif v0.8b, v16.8b, v17.8b
  bif v1.8b, v16.8b, v17.8b
  bif v2.8b, v16.8b, v17.8b
  bif v3.8b, v16.8b, v17.8b
  bif v4.8b, v16.8b, v17.8b
  bif v5.8b, v16.8b, v17.8b
  bif v6.8b, v16.8b, v17.8b
  bif v7.8b, v16.8b, v17.8b
  bif v8.8b, v16.8b, v17.8b
  bif v9.8b, v16.8b, v17.8b
  bif v10.8b, v16.8b, v17.8b
  bif v11.8b, v16.8b, v17.8b
  bif v12.8b, v16.8b, v17.8b
  bif v13.8b, v16.8b, v17.8b
  bif v14.8b, v16.8b, v17.8b
  bif v15.8b, v16.8b, v17.8b
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602044006030000270230610251601011001600011001600005001280000140019400384003919974032003616010020016000020048000040038400391116020110099100100160000100000001011011611401491600001004003940040400394004040039
160204400393000000141051610251601271001600271001600005004437926040038400384010719973031999616010020016000020048000040038400731116020110099100100160000100000001011011611400591600001004003940039400634006340039
16020440062300005101012602516010010016000010016000050044379261400194003840417199730320048160100200160000200480000402894003811160201100991001001600001000001201011011611400361600001004006340063400634006340039
16020440144299000000400251601311001600001001600005001319998140019400384028919973032003616010020016000020048000040038400381116020110099100100160000100000001011012911400361600001004003940079400404003940040
1602044003929900002706110251601001001600001001600005002462584040048400384007819973031999616010020016000020048000040038400391116020110099100100160000100000001011011611400721600001004004040039400404003940056
160204400383000000260610251601001001603381001600005001280000140044400384003919986031999716010020016000020048000040039400381116020110099100100160000100000001011011611400751600001004007940040400394004040040
160204400902990000167010302516012710016002710016000050054833551402704003840038200431132002016010020016000020048000040064400381116020110099100100160000100000001011011611401411600001004003940290400394029040039
16020440038300000000840251601011001600011001600005001280000040133400394043919973032003616010020016000020048000040038400391116020110099100100160000100010901011011611401071600001004003940145402904003940039
160204401443000000230400251601001001600011001600005005435282140059400384003919973031999716010020016000020048000040078400391116020110099100100160000100000001011011611400751600001004004040039400794004040039
160204400382990000106110251601231001600001001600005001319998040044400784007819973031999716010020016000020048000040039400381116020110099100100160000100000001011011611400361600001004003940040400394004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600244008330100067110251600331016000010160000505402633114002040099400992003103200181600102016000020480000400994009911160021109101016000010460100223116162113940096209160000104010040100401494004040039
16002440099300000216110251600921016008210160000501280000114008040099400992003103200181600102016000020480000400994009911160021109101016000010000100223219162119340096207160000104010040039400394010040100
160024400993000082450251600921016008210160000505402633114008040038400992003103200181600102016000020480000400384003811160021109101016000010000100223113162113940035209160000104010040058400394010040149
160024400993000082451102516001010160082101600005012800001140019400994014819996032007916001020160000204800004009940099111600211091010160000100001002231111162119340096208160000104003940100401004010040100
1600244003829900045110251600101016000010160000503997424114008040038400382003103200791600102016000020480000400384009911160021109101016000010000100223115162113940096209160000104010040100401494004040100
16002440099300008267110251600111016008210160000505402633114008040099400992003103201281600102016000020480000400994014811160021109101016000010000100223119162116940096208160000104010040100401004004040100
160024401543000082670251600921016002310160000501280000114008040038400382003103200181600102016000020480000400994009911160021109101016000010000100223119162115340096209160000104010040100401004003940149
16002440099301000542025160092101600001016000050540263311400804009940057199960320018160010201600002048000040099400991116002110910101600001004650100223119162119940096209160000104010040100401004010040100
16002440099300008246110251600921016008210160000505402633114008040090400992003103200791600102016000020480000400994009911160021109101016000010000100223113162113540096209160000104010040100401004010040039
16002440038300008267110251600921016000110160000501280000114008040099400572003103200791600102016000020480000400384003811160021109101016000010000100223115162115540096208160000104010040100401004014940149