Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBL (two register table, 8B)

Test 1: uops

Code:

  tbl v0.8b, { v0.16b, v1.16b }, v2.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
10042037150015616872510001000115226468002018203720371572318951000100030002037203711100110000373116111787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110005073116111787100020382038203820382038
1004203716006116872510001000100026468012018203720371572318951000100030002037203711100110000073116111787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100030002037203711100110000073116111787100020382038203820382038

Test 2: Latency 1->2

Code:

  tbl v0.8b, { v0.16b, v1.16b }, v2.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715010015051968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010000071004162319791100001002003820038200382003820038
10204200371500007261968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010000071002162319791100001002003820038200382003820038
1020420037150000611968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010000071002162219791100001002003820038200382003820038
1020420037150000611968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010000071002162319791100001002003820038200382003820038
1020420037150000611968725101001001000010010000500284768012001820037200371842231874510100200100002003000020037200371110201100991001001000010000071002162219791100001002003820038200382003820038
10204200371500001031968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010000071012162319791100001002003820038200382003820038
1020420037150000841968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010000071012162219791100001002003820038200382003820038
10204200371500001241968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010000071012162219791100001002003820038200382003820038
1020420037150000611968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010000071012162219791100001002003820038200382003820038
1020420037150000611968725101001001000010010000500284768002001820037200371842231874510100200100002003000020037200371110201100991001001000010000071012163319791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002420037150000000061196872510010101000010100005028476801200180200372003718444318767100102010000203000020037200371110021109101010000100000000016402162219785010000102003820038200382003820038
1002420037150000000061196872510010101000010100005028476801200180200372003718444318767100102010000203000020037200371110021109101010000100000000006402162219785010000102003820038200382003820038
100242003715000000012498196321191007314100601210760612855378120270020323202731845528188611077020108372232958202732032181100211091010100001022001410005207903642320039010000102040720372203752036820371
1002420367153108493361603480196101551009612100841210912652856661120162020371203691846134189101107422111662434014203672041881100211091010100001002002304028008052733220075110000102036920368201322041720418
10024203661530088795704139451961017310108131008413112166128579441202340203702036718467318767100102010331203000020037200371110021109101010000100000003006403162219785110000102027520275202742022820273
100242027115201466725280564196321161007114100721010760612855378120270020372203721846135188781092420113242433519203672035831100211091010100001000211411850007892933220038010000102042020419203682036920367
10024204151520000000675196871541010814100841010912502854095120198020273202711845228188981077222108342632019202762036571100211091010100001004000013870207954723320039210000102037320404202772032320370
10024202291521188933352126101968725100101010000101000050284768012001802003720037184441618877110792010000223048920226202623110021109101010000102030025875006402162219785010000102008520369202742027520358
1002420132151014479244001510196872510010101000010100005028476801201260200372003718447318767100102010000223048620371201807110021109101010000100200107960407692242219785010000102003820038200852013320038
10024201311500144267440125031962111610070101002412109126128529981200180200372003718444318786100102010000203050720085200372110021109101010000104000008088207033244220003110000102003820038200382003820038

Test 3: Latency 1->3

Code:

  tbl v1.8b, { v0.16b, v1.16b }, v2.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371500001500295196872510100100100001001000050028476801200182003720037184220318745101002001000020030000200372003711102011009910010010000100000000710021623197910100001002003820038200382003820038
1020420037150000000145196872510100100100001001000050028476801200182003720037184220318745101002001000020030000200372003711102011009910010010000100000000710031622197910100001002003820038200382003820038
1020420037150000000103196872510100100100001001000050028476800200182003720037184220318745101002001000020030000200372003711102011009910010010000100420000710021622197910100001002003820038200382003820038
102042003715000000061196872510100100100001001000050028476800200182003720037184220318745101002001000020030000200372003711102011009910010010000100000000710021622197910100001002003820038200382003820038
102042003715000000061196872510100100100001001000050028476800200182003720037184220318745101002001000020030000200372003711102011009910010010000100000000710021622197910100001002003820038200382003820038
102042003715000000061196872510100100100001001000050028476800200182003720037184220318745101002001000020030000200372003711102011009910010010000100000000710121622197910100001002003820038200382003820038
102042003715000000061196872510100100100001001000050028476800200182003720037184220318745101002001000020030000200372003711102011009910010010000100000000710131622197910100001002003820038200382003820038
1020420037150000000103196872510100100100001001000050028476801200182003720037184220318745101002001000020030000200372003711102011009910010010000100000000710121622197910100001002003820038200382003820038
102042003715000000061196872510100100100001001000050028476800200182003720037184220318745101002001000020030000200372003711102011009910010010000100000000710121622197910100001002003820038200382003820038
1020420037150000001170196872510100100100001001000050028476801200182003720037184260318766101002001000020030000200372003711102011009910010010000100000030710121623197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500103196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000640216251978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000640216261978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000640216231978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372008411100211091010100001000640216251978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000640216251978510000102003820038200382003820038
10024200371500147196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000640216251978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003721100211091010100001000640216231978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000640216241978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000640216241978510000102003820038200382003820038
10024200371500124196872510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000640216251978510000102003820038200382003820038

Test 4: Latency 1->4

Code:

  tbl v2.8b, { v0.16b, v1.16b }, v2.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 19 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000710131622197910100001002003820038200382003820038
102042003715000103196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000710121622197910100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000710121622197910100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000710121622198970100001002003820038200382003820038
102042003715000124196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000712121622197910100001002003820038200382003820038
102042003715000278196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000710121632197911100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000710121622197910100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000710121623197910100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000710121622197910100001002003820038200382003820038
102042003715000159196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000710121632197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100242003714901451968725100101010000101000050284768020018200372003718444318767100102010000203000020037200371110021109101010000100640316331978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768020018200372003718444318767100102010000203000020037200371110021109101010000100640316331978510000102003820038200382003820038
10024200371503611968725100101010000101000050284768020018200372003718444318767100102010000203000020037200371110021109101010000100640316331978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768020018200372003718444318767100102010000203000020037200371110021109101010000100640316331978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768020018200372003718444318767100102010000203000020037200371110021109101010000100640316331978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768020018200372003718444318767100102010000203000020037200371110021109101010000100640316331978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768020018200372003718444318767100102010000203000020037200371110021109101010000100640316331978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768020018200372003718444318767100102010000203000020037200371110021109101010000100640316331978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768020018200372003718444318767100102010000203000020037200371110021109101010000100640316331978510000102003820038200382003820038
10024200371500611968725100101010000101000050284768020018200372003718444318767100102010000203000020037200371110021109101010000100640316331978510000102003820038200382003820038

Test 5: throughput

Count: 8

Code:

  tbl v0.8b, { v8.16b, v9.16b }, v10.8b
  tbl v1.8b, { v8.16b, v9.16b }, v10.8b
  tbl v2.8b, { v8.16b, v9.16b }, v10.8b
  tbl v3.8b, { v8.16b, v9.16b }, v10.8b
  tbl v4.8b, { v8.16b, v9.16b }, v10.8b
  tbl v5.8b, { v8.16b, v9.16b }, v10.8b
  tbl v6.8b, { v8.16b, v9.16b }, v10.8b
  tbl v7.8b, { v8.16b, v9.16b }, v10.8b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2505

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204200611500000004025801001008000010080000500640000022001920038200389973399968010020080000200240000200382003811802011009910010080000100000000005110021611200350800001002003920039200392003920039
80204200381500000004025801001008000010080000500640000022001920038200389973399968010020080000200240000200382003811802011009910010080000100000000005110011611200350800001002003920039200392003920039
80204200381500000004025801001008000010080000500640000102001920038200389973399968010020080000200240000200382003811802011009910010080000100000000005110211611200350800001002003920112200912010020039
80204200381500000004025801001008000010080000500640000022001920038200389973399968010020080000200240000200382003811802011009910010080000100000000005110011611200350800001002003920039200392003920039
80204200381500000004025801001008000010080000500640000022001920038200389973399968010020080000200240000200382003811802011009910010080000100000000005110211611200350800001002003920039200392003920039
80204200381500000004025801001008000010080000500640000102001920038200389973399968010020080000200240000200382003811802011009910010080000100000000005110011611200350800001002003920039200392003920039
80204200381500000006125801001008000010080000500640000022001920038200389973399968010020080000200240000200382003811802011009910010080000100000000005110211611200350800001002003920039200392003920039
80204200381500000004025801001008000010080000500640000022001920038200389973399968010020080000200240000200382003811802011009910010080000100000000005110211611200350800001002003920039200392003920039
80204200381500000004025801001008000010080000500640000122001920038200389973399968010020080000200240000200382003811802011009910010080000100000000005110011611200350800001002003920039200392003920039
80204200381500000004025801001008000010080000500640000022001920038200389973399968010020080000200240000200382003811802011009910010080000100000000005110211611200350800001002003920039200392003920039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2505

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800242003915000009322580010108000010800005064000012001920038200389996310018800102080000202400002003820038118002110910108000010001000005020316332003580000102003920039200392003920039
8002420089150000011422580010108000010800005064000012001920038200389996310018800102080000202400002003820038118002110910108000010000000005020316332003580000102003920039200392003920039
800242003815000003925800101080000108000050640000120019200382003810011310018800102080000202400002003820038118002110910108000010000000005020316322003580000102003920039200392003920039
80024200381500000392580010108000010800005064000002001920038200389996310018800102080000202400002003820038118002110910108000010000000005020316432003580000102003920039200392003920039
80024200381500000812580010108000010800005064000002001920038200389996310018800102080000202400002003820038118002110910108000010000000005020316432003580000102003920039200392003920039
80024200381500000392580010108000010800005064000012001920038200389996310018800102080000202400002003820038118002110910108000010000000005020216232003580000102003920039200392003920039
80024200381500000602580010108000010800005064000002001920038200389996310018800102080000202400002003820038118002110910108000010000000005020316232003580000102003920039200392003920039
800242003815000006232580010108000010800005064000002001920038200389996310018800102080000202400002003820038118002110910108000010000000005020316332003580000102003920039200392003920039
800242003815000001272580010108000010800005064000002001920038200389996310018800102080000202400002003820038118002110910108000010000000005020316322003580000102003920039200392003920039
80024200381500000392580010108000010800005064000002001920038200389996310018800102080000202400002003820038118002110910108000010000000005020316322003580000102003920039200392003920039