Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (post-index, 64-bit)

Test 1: uops

Code:

  ldr x0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

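retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005 | 1369 | 2073 | 1043 | 1030 | 1042 | 1000 | 20877 | 17568 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1067 | 2001 | 1001 | 1000 | 1000 | 1000 | 21107 | 17698 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1078 | 2001 | 1001 | 1000 | 1000 | 1000 | 21133 | 17587 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1125 | 2001 | 1001 | 1000 | 1000 | 1000 | 21053 | 17628 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1068 | 2001 | 1001 | 1000 | 1000 | 1000 | 20807 | 18347 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1100 | 2001 | 1001 | 1000 | 1000 | 1000 | 21363 | 17811 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1082 | 2001 | 1001 | 1000 | 1000 | 1000 | 20788 | 18256 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1100 | 2001 | 1001 | 1000 | 1000 | 1000 | 20932 | 18349 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1084 | 2001 | 1001 | 1000 | 1000 | 1000 | 21073 | 18867 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1094 | 2001 | 1001 | 1000 | 1000 | 1000 | 20631 | 18526 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000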

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr x0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0101

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50209 | 71398 | 50166 | 40161 | 10005 | 40247 | 10003 | 1850020 | 534564 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70099 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70099 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70099 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70115 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70099 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50205 | 70181 | 50117 | 40115 | 10002 | 40140 | 10003 | 1850658 | 534834 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70097 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850118 | 534660 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70097 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850118 | 534660 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70097 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850118 | 534660 | 50109 | 40212 | 10004 | 70293 | 10014 | 40016 | 10000 | 40100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0159

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50029 | 71311 | 50071 | 40066 | 10005 | 40156 | 10003 | 1851887 | 535414 | 50019 | 40032 | 10004 | 70020 | 10000 | 40007 | 10000 | 40010
50024 | 70162 | 50017 | 40017 | 10000 | 40010 | 10000 | 1851875 | 535468 | 50010 | 40020 | 10000 | 70020 | 10000 | 40006 | 10000 | 40010
50024 | 70153 | 50016 | 40016 | 10000 | 40010 | 10000 | 1851686 | 535419 | 50010 | 40020 | 10000 | 70020 | 10000 | 40006 | 10000 | 40010
50024 | 70153 | 50016 | 40016 | 10000 | 40010 | 10000 | 1851686 | 535419 | 50010 | 40020 | 10000 | 70020 | 10000 | 40006 | 10000 | 40010
50024 | 70153 | 50016 | 40016 | 10000 | 40010 | 10000 | 1851686 | 535419 | 50010 | 40020 | 10000 | 70020 | 10000 | 40006 | 10000 | 40010
50024 | 70153 | 50016 | 40016 | 10000 | 40010 | 10000 | 1851686 | 535419 | 50010 | 40020 | 10000 | 70020 | 10000 | 40006 | 10000 | 40010
50024 | 70153 | 50016 | 40016 | 10000 | 40010 | 10000 | 1851686 | 535419 | 50010 | 40020 | 10000 | 70020 | 10000 | 40006 | 10000 | 40010
50024 | 70153 | 50016 | 40016 | 10000 | 40010 | 10000 | 1851686 | 535419 | 50010 | 40020 | 10000 | 70020 | 10000 | 40006 | 10000 | 40010
50024 | 70153 | 50016 | 40016 | 10000 | 40010 | 10000 | 1851686 | 535419 | 50010 | 40020 | 10000 | 70020 | 10000 | 40006 | 10000 | 40010
50024 | 70153 | 50016 | 40016 | 10000 | 40010 | 10000 | 1851686 | 535419 | 50010 | 40020 | 10000 | 70020 | 10000 | 40006 | 10000 | 40010

Test 3: throughput

Count: 8

Code:

  ldr x0, [x6], #8
  ldr x0, [x7], #8
  ldr x0, [x8], #8
  ldr x0, [x9], #8
  ldr x0, [x10], #8
  ldr x0, [x11], #8
  ldr x0, [x12], #8
  ldr x0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209 | 44418 | 160433 | 80313 | 80120 | 80316 | 80011 | 240488 | 640729 | 160124 | 80213 | 80013 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43221 | 160109 | 80109 | 80000 | 80112 | 80011 | 240485 | 644304 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43218 | 160109 | 80109 | 80000 | 80112 | 80009 | 240485 | 643218 | 160121 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43217 | 160109 | 80109 | 80000 | 80112 | 80008 | 240485 | 641906 | 160120 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43214 | 160107 | 80107 | 80000 | 80110 | 80012 | 240485 | 642011 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43217 | 160109 | 80109 | 80000 | 80112 | 80012 | 240485 | 638179 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43219 | 160114 | 80109 | 80005 | 80112 | 80011 | 240485 | 639091 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160205 | 43291 | 160185 | 80151 | 80034 | 80154 | 80011 | 240578 | 642352 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43217 | 160109 | 80109 | 80000 | 80112 | 80011 | 240485 | 644823 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43218 | 160110 | 80109 | 80001 | 80112 | 80009 | 240485 | 640233 | 160121 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5408

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029 | 44373 | 160333 | 80223 | 80110 | 80226 | 80011 | 240308 | 645793 | 160033 | 80032 | 80012 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80010 | 240423 | 626095 | 160033 | 80033 | 80013 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43215 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 632219 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 642158 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 643941 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 644625 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 644252 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 644915 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 647264 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 645371 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010