Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSW (post-index)

Test 1: uops

Code:

  ldrsw x0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

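| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 2005 | 1264 | 2035 | 1020 | 1015 | 1038 | 1000 | 21007 | 17796 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1076 | 2001 | 1001 | 1000 | 1000 | 1000 | 21039 | 17540 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1072 | 2001 | 1001 | 1000 | 1000 | 1000 | 20744 | 17477 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1126 | 2001 | 1001 | 1000 | 1000 | 1000 | 21236 | 17743 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1060 | 2001 | 1001 | 1000 | 1000 | 1000 | 21174 | 17555 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1074 | 2001 | 1001 | 1000 | 1000 | 1000 | 21236 | 17558 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1079 | 2001 | 1001 | 1000 | 1000 | 1000 | 21175 | 17600 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1088 | 2001 | 1001 | 1000 | 1000 | 1000 | 21162 | 17648 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1074 | 2001 | 1001 | 1000 | 1000 | 1000 | 21404 | 17611 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1100 | 2001 | 1001 | 1000 | 1000 | 1000 | 21297 | 17556 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |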

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0123

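| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 50209 | 71218 | 50160 | 40155 | 10005 | 40247 | 10003 | 1850820 | 534896 | 50109 | 40212 | 10004 | 70221 | 10004 | 40009 | 10000 | 40100 |
| 50204 | 70211 | 50109 | 40109 | 10000 | 40106 | 10003 | 1852467 | 535329 | 50109 | 40212 | 10004 | 70292 | 10015 | 40025 | 10000 | 40100 |
| 50204 | 70205 | 50109 | 40109 | 10000 | 40106 | 10003 | 1853871 | 535735 | 50109 | 40212 | 10004 | 70221 | 10004 | 40009 | 10000 | 40100 |
| 50204 | 70232 | 50109 | 40109 | 10000 | 40106 | 10003 | 1853790 | 535793 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100 |
| 50204 | 70105 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850388 | 534746 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100 |
| 50204 | 70107 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850388 | 534746 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100 |
| 50204 | 70107 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850388 | 534746 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100 |
| 50205 | 70191 | 50117 | 40115 | 10002 | 40140 | 10003 | 1850523 | 534781 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100 |
| 50204 | 70107 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850388 | 534746 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100 |
| 50204 | 70107 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850388 | 534746 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100 |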

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0115

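| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 50029 | 71241 | 50070 | 40065 | 10005 | 40156 | 10003 | 1850748 | 535202 | 50019 | 40032 | 10004 | 70020 | 10000 | 40004 | 10000 | 40010 |
| 50024 | 70115 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850606 | 535137 | 50010 | 40020 | 10000 | 70109 | 10013 | 40015 | 10000 | 40010 |
| 50024 | 70111 | 50014 | 40014 | 10000 | 40010 | 10000 | 1852199 | 535642 | 50010 | 40020 | 10000 | 70041 | 10004 | 40005 | 10000 | 40010 |
| 50024 | 70115 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850660 | 535151 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010 |
| 50024 | 70115 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850660 | 535151 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010 |
| 50024 | 70115 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850660 | 535151 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010 |
| 50024 | 70115 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850660 | 535151 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010 |
| 50024 | 70115 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850660 | 535151 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010 |
| 50024 | 70115 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850660 | 535151 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010 |
| 50024 | 70115 | 50014 | 40014 | 10000 | 40010 | 10012 | 1854721 | 536356 | 50059 | 40071 | 10013 | 70020 | 10000 | 40004 | 10000 | 40010 |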

Test 3: throughput

Count: 8

Code:

  ldrsw x0, [x6], #8
  ldrsw x0, [x7], #8
  ldrsw x0, [x8], #8
  ldrsw x0, [x9], #8
  ldrsw x0, [x10], #8
  ldrsw x0, [x11], #8
  ldrsw x0, [x12], #8
  ldrsw x0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5404

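| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 160209 | 44220 | 160423 | 80313 | 0 | 80110 | 80316 | 0 | 80011 | 240613 | 644480 | 160124 | 80213 | 80013 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43236 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80054 | 240991 | 616846 | 160208 | 80254 | 80054 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43235 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80011 | 240610 | 645660 | 160123 | 80212 | 80012 | 80208 | 80008 | 80007 | 80000 | 80100 |
| 160205 | 43303 | 160184 | 80151 | 0 | 80033 | 80154 | 0 | 80012 | 240658 | 644309 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43231 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80012 | 240610 | 640183 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43232 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80012 | 240610 | 644178 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43231 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80012 | 240610 | 643781 | 160124 | 80212 | 80012 | 80210 | 80010 | 80007 | 80000 | 80100 |
| 162913 | 58998 | 162704 | 81778 | 5 | 80921 | 81642 | 6 | 80012 | 240613 | 642194 | 160125 | 80213 | 80013 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43236 | 160110 | 80109 | 0 | 80001 | 80112 | 0 | 80010 | 240610 | 638811 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43233 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80010 | 240610 | 642915 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |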

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5404

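| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 160029 | 44375 | 160332 | 80223 | 80109 | 80226 | 80053 | 240706 | 603468 | 160116 | 80073 | 80053 | 80033 | 80013 | 80010 | 80000 | 80010 |
| 160024 | 43244 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 647600 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43230 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 643490 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43230 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 645136 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43230 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 643778 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43231 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 646435 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43228 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 642603 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43228 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 646652 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43228 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 641522 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43228 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 642791 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |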