Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBL (four register table, 16B)

Test 1: uops

Code:

  tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 3.000

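retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
3004 | 4033 | 3001 | 1 | 3000 | 3000 | 100179 | 3000 | 3000 | 9000 | 1 | 3000
3004 | 4033 | 3001 | 1 | 3000 | 3000 | 100179 | 3000 | 3000 | 9000 | 1 | 3000
3004 | 4033 | 3001 | 1 | 3000 | 3000 | 100179 | 3000 | 3000 | 9000 | 1 | 3000
3004 | 4033 | 3001 | 1 | 3000 | 3000 | 100179 | 3000 | 3000 | 9000 | 1 | 3000
3004 | 4033 | 3001 | 1 | 3000 | 3000 | 100179 | 3000 | 3000 | 9000 | 1 | 3000
3004 | 4033 | 3001 | 1 | 3000 | 3000 | 100179 | 3000 | 3000 | 9000 | 1 | 3000
3004 | 4033 | 3001 | 1 | 3000 | 3000 | 100179 | 3000 | 3000 | 9000 | 1 | 3000
3004 | 4033 | 3001 | 1 | 3000 | 3000 | 100179 | 3000 | 3000 | 9000 | 1 | 3000
3004 | 4033 | 3001 | 1 | 3000 | 3000 | 100179 | 3000 | 3000 | 9000 | 1 | 3000
3004 | 4033 | 3001 | 1 | 3000 | 3000 | 100179 | 3000 | 3000 | 9000 | 1 | 3000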

Test 2: Latency 1->2

Code:

  tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009145 | 30202 | 200 | 30010 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

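retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009170 | 30011 | 20 | 30010 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009422 | 30059 | 20 | 30060 | 20 | 90171 | 2 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90162 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10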

Test 3: Latency 1->3

Code:

  tbl v1.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30002 | 700 | 1009162 | 30202 | 200 | 30010 | 200 | 90030 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30002 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90192 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 200 | 30000 | 700 | 1009179 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

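retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009170 | 30011 | 20 | 30010 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009433 | 30061 | 20 | 30068 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009179 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10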

Test 4: Latency 1->4

Code:

  tbl v2.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

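retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
32697 | 48194 | 32339 | 1409 | 30096 | 834 | 1263 | 30134 | 700 | 1009166 | 30202 | 200 | 30010 | 200 | 90030 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100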

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

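retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30001 | 30 | 1009167 | 30011 | 20 | 30010 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30001 | 30 | 1009162 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10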

Test 5: Latency 1->5

Code:

  tbl v3.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

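retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
30204 | 40225 | 30317 | 201 | 30116 | 0 | 200 | 30208 | 700 | 1009166 | 30202 | 200 | 30010 | 200 | 90030 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30205 | 40114 | 30253 | 200 | 30053 | 0 | 199 | 30102 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
31324 | 61533 | 31616 | 3548 | 25980 | 2088 | 3643 | 26006 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40033 | 30201 | 201 | 30000 | 0 | 200 | 30000 | 700 | 1009173 | 30200 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100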

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

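retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30001 | 30 | 1009410 | 30061 | 20 | 30062 | 20 | 90012 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30025 | 40066 | 30035 | 11 | 30024 | 10 | 30051 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90030 | 1 | 30000 | 10
30025 | 40066 | 30035 | 11 | 30024 | 10 | 30051 | 30 | 1009173 | 30010 | 20 | 30004 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40033 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1009173 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10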

Test 6: Latency 1->6

Code:

  tbl v4.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
30204 | 40034 | 30201 | 201 | 30000 | 200 | 30001 | 700 | 1007088 | 30202 | 200 | 30010 | 200 | 90024 | 101 | 30000 | 100
30204 | 40034 | 30201 | 201 | 30000 | 200 | 30002 | 700 | 1007125 | 30201 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40034 | 30201 | 201 | 30000 | 200 | 30001 | 700 | 1007125 | 30201 | 200 | 30008 | 200 | 90156 | 101 | 30000 | 100
30204 | 40034 | 30201 | 201 | 30000 | 200 | 30001 | 659 | 1007464 | 30239 | 200 | 30056 | 200 | 90024 | 101 | 30000 | 100
30204 | 40034 | 30201 | 201 | 30000 | 200 | 30001 | 700 | 1007125 | 30201 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40034 | 30201 | 201 | 30000 | 200 | 30001 | 700 | 1007125 | 30201 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40034 | 30201 | 201 | 30000 | 200 | 30001 | 700 | 1007125 | 30201 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40034 | 30201 | 201 | 30000 | 200 | 30001 | 700 | 1007125 | 30201 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40034 | 30201 | 201 | 30000 | 200 | 30001 | 700 | 1007125 | 30201 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100
30204 | 40034 | 30201 | 201 | 30000 | 200 | 30001 | 700 | 1007125 | 30201 | 200 | 30008 | 200 | 90024 | 101 | 30000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0034

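retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
30024 | 40034 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1007133 | 30011 | 20 | 30006 | 20 | 90000 | 1 | 30000 | 10
30024 | 40034 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1007120 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40034 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1007120 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40034 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1007120 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40034 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1007120 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40034 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1007120 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40034 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1007120 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40034 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1007120 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40034 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1007120 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10
30024 | 40034 | 30011 | 11 | 30000 | 10 | 30000 | 30 | 1007120 | 30010 | 20 | 30000 | 20 | 90000 | 1 | 30000 | 10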

Test 7: throughput

Count: 8

Code:

  tbl v0.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v1.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v2.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v3.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v4.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v5.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v6.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v7.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12
  movi v12.16b, 13

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
240204 | 120047 | 240208 | 201 | 240007 | 200 | 240012 | 700 | 1200045 | 240210 | 200 | 240016 | 200 | 720054 | 101 | 240000 | 100
240204 | 120047 | 240208 | 201 | 240007 | 200 | 240012 | 700 | 1200049 | 240210 | 200 | 240016 | 200 | 720204 | 100 | 240000 | 100
240204 | 120036 | 240205 | 201 | 240004 | 200 | 240010 | 700 | 1200049 | 240210 | 200 | 240016 | 200 | 720048 | 101 | 240000 | 100
240204 | 120036 | 240205 | 201 | 240004 | 200 | 240010 | 700 | 1200049 | 240210 | 200 | 240016 | 200 | 720048 | 101 | 240000 | 100
240204 | 120036 | 240205 | 201 | 240004 | 200 | 240010 | 700 | 1200049 | 240210 | 200 | 240016 | 200 | 720048 | 101 | 240000 | 100
240204 | 120036 | 240205 | 201 | 240004 | 200 | 240010 | 651 | 1200266 | 240258 | 200 | 240068 | 200 | 720048 | 101 | 240000 | 100
240204 | 120036 | 240205 | 201 | 240004 | 200 | 240010 | 700 | 1200049 | 240210 | 200 | 240016 | 200 | 720048 | 101 | 240000 | 100
240204 | 120036 | 240205 | 201 | 240004 | 200 | 240010 | 700 | 1200049 | 240210 | 200 | 240016 | 200 | 720048 | 101 | 240000 | 100
240204 | 120036 | 240205 | 201 | 240004 | 200 | 240010 | 700 | 1200049 | 240210 | 200 | 240016 | 200 | 720048 | 101 | 240000 | 100
240204 | 120036 | 240205 | 201 | 240004 | 200 | 240010 | 700 | 1200049 | 240210 | 200 | 240016 | 200 | 720204 | 101 | 240000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
240024 | 120056 | 240014 | 11 | 240003 | 10 | 240008 | 0 | 30 | 0 | 1200037 | 240018 | 20 | 0 | 240012 | 20 | 720000 | 1 | 240000 | 10
2400241200362400111124000010240000859378301277312002052423862007103824005020720000124000010
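240026 | 120127 | 240087 | 11 | 240076 | 10 | 240102 | 0 | 30 | 0 | 1200401 | 240018 | 20 | 0 | 240012 | 20 | 720036 | 1 | 240000 | 10
240025 | 120072 | 240051 | 11 | 240040 | 10 | 240056 | 0 | 30 | 0 | 1199998 | 240010 | 20 | 0 | 240000 | 20 | 720000 | 1 | 240000 | 10
240024 | 120036 | 240011 | 11 | 240000 | 10 | 240000 | 0 | 30 | 0 | 1200039 | 240018 | 20 | 0 | 240012 | 20 | 720000 | 1 | 240000 | 10
240024 | 120036 | 240011 | 11 | 240000 | 10 | 240000 | 0 | 30 | 0 | 1199998 | 240010 | 20 | 0 | 240000 | 20 | 720000 | 1 | 240000 | 10
240024 | 120036 | 240011 | 11 | 240000 | 10 | 240000 | 0 | 30 | 0 | 1199998 | 240010 | 20 | 0 | 240000 | 20 | 720000 | 1 | 240000 | 10
240024 | 120036 | 240011 | 11 | 240000 | 10 | 240000 | 0 | 30 | 0 | 1199998 | 240010 | 20 | 0 | 240000 | 20 | 720000 | 1 | 240000 | 10
240025 | 120072 | 240047 | 11 | 240036 | 10 | 240046 | 0 | 30 | 0 | 1199998 | 240010 | 20 | 0 | 240000 | 20 | 720000 | 1 | 240000 | 10
240024 | 120036 | 240011 | 11 | 240000 | 10 | 240000 | 0 | 30 | 0 | 1199998 | 240010 | 20 | 0 | 240000 | 20 | 720000 | 1 | 240000 | 10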