Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (register, uxtw, 64-bit)

Test 1: uops

Code:

  ldrsb x0, [x6, w7, uxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
10056891027110261000816610001000200011000
10045501001110001000816610001000200011000
10045501001110001000816610001000200011000
10045501001110001000816610001000200011000
10045501001110001000798610001000200011000
10045501001110001000816610001000200011000
10045501001110001000816610001000200011000
10045501001110001000816610001000200011000
10045561001110001000816610001000200011000
10045561001110001000816610001000200011000

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0040

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
4020570511401083010710001301301000318595336939214010630210100046022420008300031000030100
4020470047401033010310000301031000318595816940634010630212100046022020008300031000030100
4020470040401023010210000301031000318595816940634010630212100046022420008300031000030100
4020470040401023010210000301031001518615686949644015230251100176022020008300031000030100
4020470040401023010210000301031000318595816940634010630212100046022420008300031000030100
4020470040401023010210000301031000318595816940634010630212100046022420008300031000030100
4020570158401113010910002301351000318595816940634010630212100046022420008300031000030100
4020470040401023010210000301031000318595816940634010630212100046022420008300031000030100
4020470040401023010210000301031000318595816940634010630212100046022420008300031000030100
4020470040401023010210000301031000318595816940634010630212100046022420008300031000030100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0042

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
400257016140018300171000130040100031859536694688400163003210004600442000803000210000030010
400247004240012300121000030010100001859517694664400103002010000600202000003000210000030010
400257007240020300181000230045100001860306693945400103002010000600202000003000210000030010
400247014540013300131000030010100001859544694675400103002010000600202000003000210000030010
400247004240012300121000030010100001859652694719400103002010000600202000003000210000030010
400247004240012300121000030010100001859517694664400103002010000600202000003000210000030010
400247004240012300121000030010100001859517694664400103002010000600202000003000210000030010
400247004240012300121000030010100001859517694664400103002010000600202000003000210000030010
400247004240012300121000030010100001859517694664400103002010000600202000003000210000030010
400247004240012300121000030010100001859517694664400103002010000601222003403000810000030010

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
4020570150401083010710001301301000318594856938764010630210100046022020008300031000030100
4020470047401033010310000301031000318595816940634010630212100046022420008300031000030100
4020470047401033010310000301031000318595816940634010630212100046022420008300031000030100
4020470047401033010310000301031000318595816940634010630212100046022420008300031000030100
4020470047401033010310000301031000318595816940634010630212100046022420008300031000030100
4020470047401033010310000301031000318595816940634010630212100046022420008300031000030100
4020470047401033010310000301031000318595816940634010630212100046022420008300031000030100
4020470047401033010310000301031000318613366947764010630212100046022420008300031000030100
4020470047401033010310000301031000318620926950854010630212100046022420008300031000030100
4020470058401033010310000301031000318595816940634010630212100046022420008300031000030100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0049

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
4002570155400183001710001300401000318597046937274001630030100046002020000300031000030010
4002470049400133001310000300101000018597066947344001030020100006002020000300031000030010
4002470062400133001310000300101000018597066947344001030020100006012220036300091000030010
4002470053400133001310000300101072918886987051084185131585108406002020000300031000030010
4002470050400133001310000300101000018597066947344001030020100006002020000300031000030010
4002470049400133001310000300101000018597066947344001030020100006002020000300031000030010
4002470049400133001310000300101001518621227168914006030068100176002020000300031000030010
4002470049400133001310000300101000018597066947344001030020100006002020000300031000030010
4002470049400133001310000300101000018597066947344001030020100006002020000300031000030010
4002470049400133001310000300101000018597066947344001030020100006002020000300031000030010

Test 4: throughput

Count: 8

Code:

  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  ldrsb x0, [x6, w7, uxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
8020540178801331018003210080008300248262801082008001220016002801800000100
8020440063801051018000410080008300472262801082008001220016014401800000100
8020440129801011018000010080008300640484801082008001220016002401800000100
8020440061801011018000010080008300640268801082008001220016002401800000100
8020440059801011018000010080008300640376801082008001220016002401800000100
8020440056801011018000010080008300640268801082008001220016002401800000100
8020440056801011018000010080008300640268801082008001220016002401800000100
8020440056801011018000010080008300640268801082008001220016002401800000100
8020540110801381018003710080008300640268801082008001220016002401800000100
8020440056801011018000010080008300640268801082008001220016002401800000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80025402408003511800241080008304000648001820800122016000018000010
80024400438001111800001080000306400408001020800002016000018000010
80024400438001111800001080000306400408001020800002016000018000010
80024400438001111800001080000306400408001020800002016000018000010
80024400438001111800001080000306400408001020800002016000018000010
80024400438001111800001080000306400408001020800002016000018000010
80024400438001111800001080000306400408001020800002016000018000010
80024400438001111800001080000306400408001020800002016014218000010
80024400518001111800001080000306400408001020800002016000018000010
80024400438001111800001080000306400408001020800002016000018000010