Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

TBL (two register table, 16B)

Test 1: uops

Code:

  tbl v0.16b, { v0.16b, v1.16b }, v2.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)03191e3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)a8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
1004203715006116872510001000100026468002018203720371572318951000100030002037203711100110000073216221787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073216221787100020382038203820382038
10042037150726116872510001000100026468012018203720371572318951000100030002037203711100110000073216221787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073216221787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100030002037203711100110000073216221787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100030002037203711100110000073216221787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073216221787100020382038203820382038
1004203715006116872510001000100026468012018203720371572318951000100030002037203711100110000073216221787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100030002037203711100110000073216221787100020382038203820382038
1004203715006116872510001000100026468002018203720371572318951000100030002037203711100110000073216221787100020382038203820382038

Test 2: Latency 1->2

Code:

  tbl v0.16b, { v0.16b, v1.16b }, v2.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)030818191e1f3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2cfd0d5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1020420037150000570061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000710131623197910100001002003820038200382003820038
1020420037150000120061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000002000710121622197910100001002003820038200382003820038
10204200371500003691320103196872510100100100001001060850028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000710121623197910100001002003820038200382003820038
102042003715000030061196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000710121622197910100001002003820038200382003820038
102042003715000030061196872510100113100001001000050028476801200542003720037184303187451010020210000200304922003720037111020110099100100100001000000060710121622197910100001002003820038200382003820038
1020420037150000180061196874410100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000001060010712121622197910100001002003820038200382003820230
1020420037149000690061196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000710121622197910100001002003820038200382003820038
1020420037150000180061196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000001030710121622197910100001002003820038200382003820038
10204200371500001800631196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000712121622199130100001002003820038200382003820038
10204200371500004200103196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720179111020110099100100100001000001060710121622197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)03080b18191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8acc2c5branch mispredict (cb)cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024200371500000006119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000000640216221978510000102003820038200382003820038
10024200371500000006119687251002210100121010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000000640116231978510000102003820038200382003820038
100242003715000002406119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000000640216221978510000102003820038200382003820038
10024200371500000006119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000000640216221978510000102003820038200382003820038
100242003715000001506119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000000640216221978510000102003820038200382003820038
100242003715000003906119687251001010100001010000502847680200182003720037184443187671001020100002030000200372003711100211091010100001000000000685216221978510000102003820038200852022820276
100242022615210055943522015196329810070121000010101527228540952016220273202731845920187861077222108252232511202762022661100211091010100001022008213400746648422000310000102027520228202762027520271
1002420274152115468444024461964311710058101001212107605028540952016220227202621845326188591077422108352031998203132027261100211091010100001022000000640216221978510000102003820038200382003820276
100242022615110556095082463196439910075121006010107608328540952019820272202271845924188541077020106742031476202762027361100211091010100001002010700745342332001710000102031920277202742026420320
10024202721521136996528296419621134100481410072101091272285537820234203222032018468211884210922241100120314822027520320611002110910101000010002111970200766277222007510000102036820418204112027720459

Test 3: Latency 1->3

Code:

  tbl v1.16b, { v0.16b, v1.16b }, v2.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)030708090a0b18191e1f3a3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2cfd0d5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1020420037150100100090198196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000713041645197910100001002003820038200382003820038
1020420037150100100000170196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000713051644197910100001002003820038200382003820038
10204200371501001000001540196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000713061655197910100001002003820038200382003820038
1020420037150100100000165196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000713051655197910100001002003820038200382003820038
10204200371501001000120165196872510100100100001001000050028476800200542003720037184223187451010020010000200300002003720037111020110099100100100001000000000713051653197910100001002003820038200382003820038
1020420037149100100000186196872510100100100001001000050028476800200182003720037184223187451010020010174200300002003720037111020110099100100100001000000000716031635197910100001002003820038200382003820038
10204200371501001000180165196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000713031655197910100001002003820038200382003820038
1020420037150100100030270196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000713031666197910100001002003820038200382003820038
1020420037150100100000165196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000713031663197910100001002003820038200382003820038
10204200371501001000120165196874310100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000000000713041644197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)031e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024200371501206119687251001010100001010000502847680120018020037200371844431876710010201000020300002003720037111002110910101000010640216231978510000102003820038200382003820038
1002420037150306119687251001010100001010000502847680020018020037200371844431876710010201000020300002003720037111002110910101000010640216221978510000102003820038200382003820038
1002420037150210886119687251001010100001010000502847680020018020037200371844431876710010201000020300002003720037111002110910101000010640216321978510000102003820038200382003820038
1002420037150006119687251001010100001010000502847680020018020037200371844431876710010201000020300002003720037111002110910101000010640216321978510000102003820038200382003820038
10024200371502706119687251001010100001010000502847680020037020037200371844431876710010201000020300002003720037111002110910101000010640216221978510000102003820038200382003820038
1002420037150006119687251001010100001010000502847680020018020037200371844431876710010201000020300002003720037111002110910101000010640216221978510000102003820038200382003820038
1002420037150006119687251001010100001010000502847680020018020037200371844431876710010201000020300002003720037111002110910101000010640216221978510000102003820038200382003820038
100242003715023706119687251001010100001010000502847680020018020037200371844431876710010201000020300002003720037111002110910101000010640216321978510000102003820038200382003820038
1002420037150006119687251001010100001010000502847680020018020037200371844431876710010201000020300002003720037111002110910101000010640216221978510000102003820038200382003820038
1002420037150906119687251001010100001010000502847680020018020037200371844431876710010201000020300002003720037111002110910101000010640216221978510000102003820038200382003820038

Test 4: Latency 1->4

Code:

  tbl v2.16b, { v0.16b, v1.16b }, v2.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)031e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9facc2cfd0d5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
102042003715042061196872510100100100001001000050028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000071002162219791100001002003820038200382003820038
1020420037150016861196872510100104100001001015250028476800200182003720037184223187451010020010000200300002003720037111020110099100100100001000071002162219791100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000071002162219791100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000071002162219791100001002003820038200382003820038
10204200371503061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
10204200371509061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000071212162319791100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000071212162219791100001002003820038200382003820038
1020420037150270061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012162219791100001002003820038200382003820038
10204200371500061196872510100100100001001000050028476801200182003720037184223187451010020010000200300002003720037111020110099100100100001000071012163219791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)03080b18191e1f3f4e5051inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8a9c2c5cfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024200371500000006119687200212510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000006402162219785010000102003820038200382003820038
10024200371500000006119687200212510010101000010100005028476800200212003720037184443187671001020100002030000200372003711100211091010100001000006402162219785010000102003820038200382003820038
100242003715000002706119687200212510010101000010100005028476801200182003720085184443187671001020100002030000200372003711100211091010100001000006402162219785010000102003820038200382003820038
1002420037150000000611968702510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000006402162219785010000102003820038200382003820038
1002420037150000000611968702510010101000010100005028476801200182003720037184443187671001020100002030000200372003711100211091010100001000006402242219785010000102003820038200382003820038
1002420037150000060611968702510010101000010100005028476801200182003720037184443187671001020100002030000200372003711100211091010100001000006402163219785010000102003820038200382003820038
1002420037150000000611968702510010101000010100005028476800200182003720037184443187671001020100002030000200372003711100211091010100001000006402162319785010000102003820038200382003820038
1002420037150000000611968702510010101000010100005028476801200182003720037184443187671001020100002030000200372003711100211091010100001000006402162219785010000102003820038200382003820038
1002420037150000000611968702510010101000010100005028476801200182003720037184443187671001020100002030000200372003711100211091010100001000006402162219785010000102003820038200382003820038
100242003715000001201891968704410046101000010100005028476801200182003720037184513187671001020101692030000202762003711100211091010100001000006402162219785010000102003820038200382003820038

Test 5: throughput

Count: 8

Code:

  tbl v0.16b, { v8.16b, v9.16b }, v10.16b
  tbl v1.16b, { v8.16b, v9.16b }, v10.16b
  tbl v2.16b, { v8.16b, v9.16b }, v10.16b
  tbl v3.16b, { v8.16b, v9.16b }, v10.16b
  tbl v4.16b, { v8.16b, v9.16b }, v10.16b
  tbl v5.16b, { v8.16b, v9.16b }, v10.16b
  tbl v6.16b, { v8.16b, v9.16b }, v10.16b
  tbl v7.16b, { v8.16b, v9.16b }, v10.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2505

retire (01)cycle (02)030a0b18191e1f3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a8a9accfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
802042005815000111328840258028610080000100800005006400000200192003820038997339996801002008000020024000020038200381180201100991001008000010000001745511021611200350800001002003920039200892008820039
8020420038150000012040258019410080000100800005006400000200192003820038997339996801002008000020024000020038200381180201100991001008000010000000511011611200350800001002003920039200392003920039
802042003815000020040258010010080000100800005006400000200192003820038997339996801002008000020024000020038200381180201100991001008000010000000512811611200350800001002003920039200392003920039
8020420038150000000402580100100800001008000050064000002001920038200389973399968010020080000200240000200382003811802011009910010080000100005109511011611200350800001002003920039200392003920039
802042003815000000040258010010080000100800005006400001200192003820038997339996801002008000020024000020038200381180201100991001008000010000000511011611200350800001002003920039200392003920039
802042003815000000040258010010080000100800005006400000200192003820038997339996801002008000020024000020038200381180201100991001008000010000000511011611200350800001002003920039200392003920039
802042003815000000040258010010080000100800005006400000200192003820038997339996801002008000020024000020038200381180201100991001008000010000000511011611200350800001002003920039200392003920039
802042003815000000040258010010080000100800005006400001200192003820038997339996801002008000020024000020038200381180201100991001008000010000500511011611200350800001002003920039200392003920039
802042003815000000040258010010080000100800005006400000200192003820038997339996801002008000020024000020038200381180201100991001008000010000000511011611200350800001002003920039200392003920039
802042003815000000061258010010080000100800005006400001200192003820038997339996801002008000020024000020038200381180201100991001008000010000000511011611200350800001002003920039200392003920039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2505

retire (01)cycle (02)03040b193f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8acbranch mispredict (cb)cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
80024200481501003925800101080000108000050640000020019200382003899963100188001020800002024000020038200381180021109101080000100305020201623182003580000102003920039200392003920039
80024200381500003925800101080000108000050640000020019200382003899963100188001020800002024000020038200381180021109101080000101005020231619232003580000102003920039200392003920039
80024200381500003925800101080000108000050640000020019200382003899963100188001020800002024000020038200381180021109101080000100005020191623192003580000102003920039200392003920039
80024200381500003925800101080000108000050640000120019200382003899963100188001020800002024000020038200381180021109101080000101005020231619232003580000102003920039200392003920039
80024200381500003925800101080000108000050640000120019200382003899963100188001020800002024000020038200381180021109101080000101005020201623202003580000102003920039200392003920039
80024200381500013925800101080000108000050640000120019200382003899963100188001020800002024000020038200381180021109101080000100005020191623182003580000102003920039200392003920039
80024200381500003925800101080000108000050640000120019200382003899963100188001020800002024000020038200381180021109101080000100005020171622182003580000102003920039200392003920039
800242003815000073925800101080000108000050640000120019200382003899963100188001020800002024000020038200381180021109101080000100005020231614232003580000102003920039200392003920039
80024200381500003925800101080000108000050640000120019200382003899963100188001020800002024000020038200381180021109101080000102305020131623142003580000102003920039200392003920039
80024200381500003925800101080000108000050640000120019200382003899963100188001020800002024000020038200381180021109101080000108005020231623232003580000102003920039200392003920039