Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (single register table, 16B)

Test 1: uops

Code:

  tbx v0.16b, { v1.16b }, v2.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203715008416872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000103230002037203711100110000073116111787100020382038203820382038
10042037160070116872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203716006116872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
10042037151206116872510001000100026468002018203720371572318951000100030002037203711100110001073116111787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073116112004100020382038203820382038

Test 2: Latency 1->1

Code:

  tbx v0.16b, { v1.16b }, v2.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371506906119687251010010010000100100005002847680200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
1020420037150906119687251010010010000100100005002847680200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
102042003715020406119687251010010010000100100005002847680200182003720037184223187451010020010000200300002003720037111020110099100100100001000071212163219791100001002003820038200382003820038
10204200371550034619687251010010010000100100005002847680200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012163219791100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680200182003720037184223187451010020010000200300002003720037111020110099100100100001000071212162219791100001002003820038200382003820038
102042003715042306119687251010010010000100100005002847680200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
10204200371501806119687251010010010000100100005002847680200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
10204200371502106119687251010010010000100100005002847680200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
10204200371501806119687251010010010000100100005002847680200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012162319791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 09 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002420037150024611968725100101010000101000050284768002001820037200371844431876710010201000020300002003720037111002110910101000010000640216221978510000102003820038200382003820038
100242003715000611968725100101010000101000050284768012001820037200371844431876710010201000020300002003720037111002110910101000010000640216221978510000102003820038200382003820038
1002420037150006119687251001010100001010000502847680020018200372003718444121876710010201000020300002003720037111002110910101000010000640216221978510000102003820038200382003820038
1002420037150193461968725100101010000101000050284768002001820037200371844431876710010201000020300002003720037111002110910101000010000640216221978510000102003820038200382003820038
100242003715000611968725100101010000101000050284768002001820037200371844431876710010201000020300002003720037111002110910101000010000640216221978510000102003820038200382003820038
100242003715000611968725100101010000101000050284768002001820037200371844431876710010201000020300002003720037111002110910101000010000640216221978510000102003820038200382003820038
100242003715000611968725100101010000101000050284768002001820037200371844431876710010201000020300002003720037111002110910101000010030640216221978510000102003820038200382003820038
100242003715000611968725100101010000101000050284768002001820037200371844431876710010201000020304862003720037111002110910101000010000640216221978510000102003820038200382003820038
100242003715000611968725100101010000101000050284768002001820037200371844431876710010201000020300002003720037111002110910101000010001640216221978510000102003820038200382003820038
100242003715000611968725100101010000101000050284768002001820037200371844431876710010201000020300002003720037111002110910101000010000640316221978510000102003820038200382003820038

Test 3: Latency 1->2

Code:

  tbx v0.16b, { v0.16b }, v1.16b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715000018061196872510100100100001001000050028476802001820037200371842231874510100200100002003000020037200371110201100991001001000010000071021611197910100001002003820038200382003820038
10204200371500000061196872510100100100001001000050028476802001820037200371842231874510100200100002003000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
10204200371500000061196872510100100100001001000050028489632001820037200371842231874510100200100002003000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
10204200371500000061196872510100100100001001000050028476802001820037200371842231874510100200100002003000020037200371110201100991001001000010001975475414612197910100001002003820038200862003820133
102042013315111115610461196872510100100100001001000050028476802001820037200371842281876210429204101662043050120085200371110201100991001001000010000071011611197910100001002003820038200382003820038
10204200371500000061196872510100100100001001000050028476802001820037200371842231874510100200100002003000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
10204200371500100061196872510100100100001001000050028476802001820037200371842231874510100200100002003000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
10204200371500000061196872510100100100001001000050028476802001820037200371842231874510100200100002003000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
10204200371500000061196872510100100100001001000050028476802001820037200371842231874510100200100002003000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038
10204200371500000061196872510100100100001001000050028476802001820037200371842231874510100200100002003000020037200371110201100991001001000010000071011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500000000061196872510010101000010100005028476801200182003720037184443187671001020100002030000200372003711100211091010100001000000006403162219785010000102003820038200382003820038
10024200371500000000061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371500000000061196872510010101000010100005028476801200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371500000000061196872510010101000010100005028476801200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371500000000061196872510010101000010100005028476801200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371500010000061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371500000000061196872510010101000010100005028476801200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820083
10024200371500000000061196872510010101000010100005028476801200182003720037184443187671001020100002030000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
1002420037150000005100611968725100101010000101000050284768012001820037200371844431880310774221067120324872022820272511002110910101000010022109850272557711519968210000102027620273202762022620266
1002420131151010456843526253219632116100591110060111076050285281212016220274202741845926188551077220109922231989202612026261100211091010100001000012970307314576319785610000102027520274203112022820169

Test 4: Latency 1->3

Code:

  tbx v0.16b, { v1.16b }, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715000000061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000050120071011611197910100001002003820038200382003820038
10204200371500000006119687251010010010000100100005002847680120018200372003718422318745101002001000020030000200372003711102011009910010010000100001000071011611197910100001002003820038200382003820038
102042003715000000061196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000750071011611197910100001002003820038200382003820038
10204200371500000006119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
10204200371500000006119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100001030071011611197910100001002003820038200382003820038
102042003715000000061196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000053000071011611197910100001002003820038200382003820038
1020420037150000000637196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000039000071011611197910100001002003820038200382003820038
10204200371500000006119687251010010010012100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100001000071011611197910100001002003820038200382003820038
10204200371500000006119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
10204200371500000006119687251010010010000100100005002847680020018200372003718422318745101002001000020030000200372003711102011009910010010000100001000071011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500000061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000000000006402162219785010000102003820038200382003820038
100242003715000000103196872510010101000010100005028476800200182008320133184443187671001020100002030000200372003711100211091010100001000001000006402162219823010000102008520038200382003820085
10024200371500000082196872510010101000010100005028476800200182008420037184443187791001020100002030000200372008421100211091010100001000000000006402162219785010000102003820038200382003820038
100242003715000000726196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000000000006402162219785010000102003820038200382003820038
10024200371500000061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000000000006402162219785010000102003820038200382003820038
10024200371500000061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000000000006402162219785010000102003820038200382003820038
100242003715000012061196872510010101000010100005028476800200212003720181184443187671001020100002030000200372003711100211091010100001000001030006402162219785010000102003820038200382003820038
10024200371500000061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000000000006402162219785010000102003820038200382003820038
10024200371500000061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000000000006402162219785010000102003820038200382003820038
10024200371500000061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000000000006402162219785010000102003820038200382003820038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.16b, { v8.16b }, v9.16b
  movi v1.16b, 0
  tbx v1.16b, { v8.16b }, v9.16b
  movi v2.16b, 0
  tbx v2.16b, { v8.16b }, v9.16b
  movi v3.16b, 0
  tbx v3.16b, { v8.16b }, v9.16b
  movi v4.16b, 0
  tbx v4.16b, { v8.16b }, v9.16b
  movi v5.16b, 0
  tbx v5.16b, { v8.16b }, v9.16b
  movi v6.16b, 0
  tbx v6.16b, { v8.16b }, v9.16b
  movi v7.16b, 0
  tbx v7.16b, { v8.16b }, v9.16b
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 18 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020420088150100080925801001008000010080000500640000120044200632006332180100200800002002400002006320063111602011009910010016000010000001011111611200601600001002006420064200642006420064
16020420063150000012425801001008000010080000500640000120044200632006332180100200800002002400002006320063111602011009910010016000010000001011111611200601600001002006420064200642006420064
1602042006315000005925801001008000010080000500640000120044200632006332180100200800002002400002006320063111602011009910010016000010000001011111611200601600001002006420064200642006420064
16020420063155000018225801001008000010080000500640000120044200632006332180100200800002002400002006320063111602011009910010016000010000001011111611200601600001002006420064200642006420064
16020420063150000020625801001008000010080000500640000120044200632006332180100200800002002400002006320063111602011009910010016000010000001013611611200601600001002006420064200642006420064
1602042006315000003825801001008000010080000500640000120044200632006332180100200800002002400002006320063111602011009910010016000010000001011111611200601600001002006420064200642006420064
16020420063150000018525801001008000010080000500640000120044200632006332180100200800002002400002006320063111602011009910010016000010000001011111611200601600001002006420064200642006420064
16020420063151000065425801001008000010080000500640000120044200632006332180100200800002002400002006320063111602011009910010016000010010001011111611200601600001002006420064200642006420064
1602042006315000003825801001008000010080000500640000120044200632006332180100200800002002400002006320063111602011009910010016000010000001011111611200601600001002006420064200642006420064
1602042006315100003825801001008000010080000500640000120044200632006332180100200800002002400002006320063111602011009910010016000010000001011111611200601600001002006420064200642006420064

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024200731505192780012128000012800006264000001520031200592005932180012208000020240000200592005911160021109101016000010000010030114110252117112004722001160000102005120051200512005120051
16002420050150442780012128000012800006264000011520031200502005032180012208000020240000200502005011160021109101016000010000991003082111252111192004722001160000102005120051200512005120051
160024200501514427800121280000128000062640000115200312005020050321800122080000202400002005020050111600211091010160000100006100308319252117112004722001160000102005120051200512005120051
16002420050150442780012128000012800006264000011520286200502005032180012208000020240000200502005011160021109101016000010000961003083172521111112004722001160000102005120051200512005120051
1600242005015044278001212800001280101626400001152003120050200503218001220800002024000020050200501116002110910101600001000191003083111252111172004722001160000102005120051200512005120051
1600242005015144278001212800001280000626400001152003120050200503218001220800002024000020050200501116002110910101600001000061003083111252111172004722001160000102005120051200512005120051
160024200501504427800121280000128000062640000115200312005020050321800122080000202400002005020050111600211091010160000100052610030821112521111112004722001160000102005120051200512005120051
16002420050150442780012128000012800006264000011520031200502005032180012208000020240000200502005011160021109101016000010005010030831112521111112004722001160000102005120051200512005120051
160024200501504427801041280000128000062640000115200312005020050321800122080000202400002005020050111600211091010160000100001291003082111252117112004722001160000102005120051200512005120051
1600242005015044278001212800001280000626400001152003120050200503218001220800002024000020050200501116002110910101600001000161003483172521111112004722001160000102005120051200512005120051

Test 6: throughput

Count: 16

Code:

  tbx v0.16b, { v16.16b }, v17.16b
  tbx v1.16b, { v16.16b }, v17.16b
  tbx v2.16b, { v16.16b }, v17.16b
  tbx v3.16b, { v16.16b }, v17.16b
  tbx v4.16b, { v16.16b }, v17.16b
  tbx v5.16b, { v16.16b }, v17.16b
  tbx v6.16b, { v16.16b }, v17.16b
  tbx v7.16b, { v16.16b }, v17.16b
  tbx v8.16b, { v16.16b }, v17.16b
  tbx v9.16b, { v16.16b }, v17.16b
  tbx v10.16b, { v16.16b }, v17.16b
  tbx v11.16b, { v16.16b }, v17.16b
  tbx v12.16b, { v16.16b }, v17.16b
  tbx v13.16b, { v16.16b }, v17.16b
  tbx v14.16b, { v16.16b }, v17.16b
  tbx v15.16b, { v16.16b }, v17.16b
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2504

retire uop (01) | cycle (02) | 03 | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602044007030003384173251602061001600011001600005001320000040019400384012419976031999616010020016000020048000040038400381116020110099100100160000100000001011061633401211600001004013740063401084003940063
160204400783000144406251601191001600011021600965005482191040020400734003919981031999616010020016000020048000040038400751116020110099100100160000100000001011051633400351600001004006640040400564010240063
1602044008730000406251601001001600001001600005004437926040019400654003919974032001516010020016000020048000040055400381116020110099100100160000100000001011071633400621600001004007940058400664004040039
16020440038300010661149251601001001600001001600005001319998040082401124003819973031999616010020016000020048000040038400381116020110099100100160000100000001011061632400351600001004004040066400394021040039
160204400623000040149251601001001600011001600005005518691040019400384011219973031999616010020016000020048000040063400651116020110099100100160000100000001011041633400351600001004003940113400394004040064
16020440039300010640197251601991001600011001600005001280000040019400384011219976032007016010020016000020048000040055401011116020110099100100160000100000001011041633400591600001004003940039400394003940079
160204400383010041232251601361001600001001600005001280000040019400574011219973031999616010020016000020048000040065400391116020110099100100160000100000001011061633400351600001004011340039401134003940063
16020440062300004010251601011001600001001600005005518618040020400394003819973031999616010020016000020048000040038400571116020110099100100160000100000001011071633400591600001004003940153400394014540039
160204400383000061122251601001001600001001600005004728951040020400384006519975031999616010020016000020048000040057401011116020110099100100160000100013001011071633400351600001004003940056402204007640066
160204401123000106610251601361001600001001600005001319998040133402894028920014031999616010020016000020048000040038400391116020110099100100160000100000001011051633400351600001004005740066400394004040039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024400483001107817927025160159101600001016000050128000011400194009540038200840320018160010201600002048000040145400382116002110910101600001000000100253111221621111515400350156160000104003940039400394003940039
160024400383001100157025160010101600001016000050128000011400194003840038199960320018160010201600002048000040038400381116002110910101600001000000100233111151621171415400350155160000104003940039400964003940078
16002440038300010017911125160010101600001016000050131999711400194003840038199960320018160010201600002048000040153400951116002110910101600001000000100233110171621111515400350155160000104009640039400394009640096
16002440100300112101682111251600101016007810160000501280000114001940038401532002603200181600102016000020480000400384003811160021109101016000010000001002531111616211115154009201510160000104003940039400394003940039
160024400383001107818511125160010101600001016000050128000011400764003840038199960320075160010201600002048000040038400381116002110910101600001000000100233111171621111718400350155160000104003940039400394003940039
1600244009530011240157025160010101601491016000050552604611400194003840038199960320018160010201600002048000040038400991116002110910101600001010300100233111161621111516400350155160000104003940039400394003940039
160024400383011100151025160011101600001016000050128000011400194003840038199960320018160010201600002048000040218400951116002110910101600001000000100233111111621111515400350155160000104009640039400394003940039
16002440038300000780451112516001010160000101600005055194181140019400384003819996032001816001020160000204800004009540038111600211091010160000100000010022311015162111138400350155160000104003940154400964003940039
160025400383000005204511125160010101600001016000050551941811400194003840153200260320018160010201600002048000040038400381116002110910101600001000000100223110161621121415400920155160000104009640039400394003940039
160024400383000000045025160010101600781016000050128000011400194009540038199960320018160010201600002048000040038400381116002110910101600001000000100223110171621101516400350155160000104003940039400964003940039