Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBL (three register table, 16B)

Test 1: uops

Code:

  tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v3.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 2.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000
20044033200112000200010124820002000600012000

Test 2: Latency 1->2

Code:

  tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v3.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20204400332010110120000100200003001019248201002002000620060018120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100
20204400332010110120000100200003001019248201002002000420060012120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20024400332001111200001020000301019248200102020006206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010

Test 3: Latency 1->3

Code:

  tbl v1.16b, { v0.16b, v1.16b, v2.16b }, v3.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
202044003320101101200000100200003001019248201002002000620060012120000100
202044003320101101200000100200003001019580201342002004820060018120000100
202044003320101101200000100200003001019248201002002000420060144220000100
202044003320101101200000100200003001019248201002002000420060012120000100
202044003320101101200000100200003001019248201002002000420060012120000100
202044003320101101200000100200003001019248201002002000420060144220000100
202044003320101101200000100200003001019248201002002000420060012120000100
202044003320101101200000100200003001019248201002002000420060012120000100
202044003320101101200000100200003001019248201002002000620060012120000100
202044003320101101200000100200003001019248201002002000420060012120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301020312200822020080206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010

Test 4: Latency 1->4

Code:

  tbl v2.16b, { v0.16b, v1.16b, v2.16b }, v3.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
2020420035201011012000010020001300499059201032002001020060024120000100
2020420035201021012000110020003300499115201012002000820060024120000100
2020420035201011012000010020001300499115201012002000820060024120000100
2020420035201011012000010020001300499115201012002000820060024120000100
2020420035201011012000010020001300499115201012002000820060024120000100
2020420035201011012000010020001300499115201012002000820060024120000100
2020420035201011012000010020001300499115201012002000820060024120000100
2020420035201011012000010020001300499460201682002007620060024120000100
2020420035201011012000010020001300499469201702002008020060240120000100
2020420141201931022009110120139300501185205032002042420260438320000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0035

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
2002420035200111120000102000130499076200132020010206000012000010
2002420035200111120000102000030499110200102020000206000012000010
2002420035200111120000102000030499110200102020000206000012000010
2002420035200111120000102000030499110200102020000206000012000010
2002420035200111120000102000030499110200102020000206000012000010
2002420035200111120000102000030499110200102020000206000012000010
2002420035200111120000102000030499110200102020000206000012000010
2002420035200111120000102000030499115200112020008206000012000010
2002420035200111120000102000030499110200102020000206000012000010
2002420035200111120000102000030499110200102020000206000012000010

Test 5: Latency 1->5

Code:

  tbl v3.16b, { v0.16b, v1.16b, v2.16b }, v3.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
2020440033201011012000010020000300101924820100200200062000600121020000100
2020440033201011012000010020000300101924820100200200042000600121020000100
2020440033201011012000010020000300101924820100200200042000600121020000100
2020440033201011012000010020000300101924820100200200042000600121020000100
2020440033201011012000010020000300101924820100200200042000600121020000100
2020440033201011012000010020000300101924820100200200042000600181020000100
2020440033201011012000010020000300101924820100200200042000600121020000100
2020440033201011012000010020000300101924820100200200062000600121020000100
2020440033201011012000010020000300101924820100200200042000600121020000100
2020440033201011012000010020000300101924820100200200042000600121020000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20024400332001111200001020000301019248200102020006206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010
20024400332001111200001020000301019248200102020000206000012000010

Test 6: throughput

Count: 8

Code:

  tbl v0.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  tbl v1.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  tbl v2.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  tbl v3.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  tbl v4.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  tbl v5.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  tbl v6.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  tbl v7.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
160204800561601031011600021001600083008000511601102001600142004800361160000100
160204800361601031011600021001600083008000411601082001600122004800361160000100
160204800361601031011600021001600083008002501601542001600642004800361160000100
160204800361601031011600021001600083008000411601082001600122004800361160000100
160204800361601031011600021001600083008000411601082001600122004800361160000100
160205800721601411011600401001600563008000411601082001600122004800361160000100
160204800361601031011600021001600083008002501601542001600642004801921160000100
160204800461601051011600041001600103008000411601082001600122004800361160000100
160204800361601031011600021001600083008000411601082001600122004800361160000100
160204800361601031011600021001600083008000411601082001600122004800361160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
160024800361600151116000410160010308000001600102016000020480000116000010
160024800361600111116000010160000308000001600102016000020480000116000010
160024800361600111116000010160000308000001600102016000020480000116000010
160024800361600111116000010160000308000001600102016000020480000116000010
160024800361600111116000010160000308000001600102016000020480000116000010
160024800361600111116000010160000308000001600102016000020480000116000010
160024800361600111116000010160000308002501600642016006420480000116000010
160024800361600111116000010160000308000001600102016000020480000116000010
160024800361600111116000010160000308000001600102016000020480000116000010
160024800361600111116000010160000308000001600102016000020480000116000010