Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (pre-index, 32-bit)

Test 1: uops

Code:

  ldr w0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

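retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005 | 1228 | 2039 | 1020 | 1019 | 1042 | 1000 | 21435 | 18002 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1082 | 2001 | 1001 | 1000 | 1000 | 1000 | 21462 | 17629 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1079 | 2001 | 1001 | 1000 | 1000 | 1000 | 21277 | 17596 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1078 | 2001 | 1001 | 1000 | 1000 | 1000 | 20783 | 17734 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1081 | 2001 | 1001 | 1000 | 1000 | 1000 | 20767 | 17718 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1078 | 2001 | 1001 | 1000 | 1000 | 1000 | 21473 | 17630 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1084 | 2001 | 1001 | 1000 | 1000 | 1000 | 21171 | 17628 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1077 | 2001 | 1001 | 1000 | 1000 | 1000 | 21496 | 17611 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1078 | 2001 | 1001 | 1000 | 1000 | 1000 | 21250 | 17674 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1065 | 2001 | 1001 | 1000 | 1000 | 1000 | 21534 | 17652 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000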

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0120

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50209 | 71406 | 50167 | 40162 | 10005 | 40247 | 10002 | 1850182 | 534654 | 50108 | 40211 | 10003 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70092 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850118 | 534698 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70106 | 50104 | 40104 | 10000 | 40106 | 10003 | 1852332 | 535432 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70097 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850118 | 534698 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70097 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850118 | 534698 | 50109 | 40212 | 10004 | 70289 | 10013 | 40015 | 10000 | 40100
50204 | 70092 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850118 | 534698 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70097 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850118 | 534698 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70097 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850118 | 534698 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70110 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850334 | 534769 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70097 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850118 | 534698 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0099

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50029 | 71225 | 50070 | 40065 | 10005 | 40156 | 10003 | 1850802 | 535260 | 50019 | 40032 | 10004 | 70109 | 10013 | 40016 | 10000 | 40010
50024 | 70137 | 50014 | 40014 | 10000 | 40010 | 10000 | 1851227 | 535382 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010
50024 | 70113 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850552 | 535159 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010
50025 | 70274 | 50027 | 40025 | 10002 | 40049 | 10000 | 1850660 | 535195 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010
50024 | 70113 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850606 | 535177 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010
50024 | 70113 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850606 | 535177 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010
50024 | 70114 | 50014 | 40014 | 10000 | 40010 | 10000 | 1851551 | 535492 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 40010
50025 | 70203 | 50028 | 40026 | 10002 | 40050 | 10003 | 1853642 | 535952 | 50019 | 40032 | 10004 | 70020 | 10000 | 40009 | 10000 | 40010
50024 | 70178 | 50019 | 40019 | 10000 | 40010 | 10000 | 1852361 | 535708 | 50010 | 40020 | 10000 | 70020 | 10000 | 40009 | 10000 | 40010
50024 | 70178 | 50019 | 40019 | 10000 | 40010 | 10000 | 1852361 | 535708 | 50010 | 40020 | 10000 | 70020 | 10000 | 40009 | 10000 | 40010

Test 3: throughput

Count: 8

Code:

  ldr w0, [x6, #8]!
  ldr w0, [x7, #8]!
  ldr w0, [x8, #8]!
  ldr w0, [x9, #8]!
  ldr w0, [x10, #8]!
  ldr w0, [x11, #8]!
  ldr w0, [x12, #8]!
  ldr w0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209 | 44209 | 160416 | 80312 | 80104 | 80315 | 80012 | 240578 | 642006 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43225 | 160109 | 80109 | 80000 | 80112 | 80009 | 240485 | 642382 | 160121 | 80212 | 80012 | 80208 | 80008 | 80007 | 80000 | 80100
160204 | 43215 | 160109 | 80109 | 80000 | 80112 | 80012 | 240485 | 644410 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43215 | 160109 | 80109 | 80000 | 80112 | 80011 | 240485 | 642111 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160206 | 43445 | 160250 | 80189 | 80061 | 80192 | 80008 | 240510 | 641405 | 160120 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43218 | 160109 | 80109 | 80000 | 80112 | 80010 | 240485 | 642513 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43217 | 160109 | 80109 | 80000 | 80112 | 80011 | 240485 | 642538 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43216 | 160109 | 80109 | 80000 | 80112 | 80010 | 240485 | 643082 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43216 | 160109 | 80109 | 80000 | 80112 | 80011 | 240485 | 643213 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43218 | 160109 | 80109 | 80000 | 80112 | 80010 | 240485 | 642056 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029 | 44319 | 160319 | 80221 | 80098 | 80224 | 80013 | 240311 | 644232 | 160036 | 80033 | 80013 | 80032 | 80012 | 80009 | 80000 | 80010
160024 | 43221 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 641620 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43214 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 635542 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 641853 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 646341 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43221 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 641957 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43223 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 643420 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43213 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 643764 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43215 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 633285 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43215 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 643108 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010