Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRH (pre-index)

Test 1: uops

Code:

  ldrh w0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005125420361020101610401000213591834420001000100010001000100110001000
2004112420011001100010001000225161988020001000100010001000100110001000
2004116720011001100010001000219851929420001000100010001000100110001000
2004119020011001100010001000228381933420001000100010001000100110001000
2004113420011001100010001000233741963920001000100010001000100110001000
2004116320011001100010001000219861879720001000100010001000100110001000
2004119020011001100010001000222771950220001000100010001000100110001000
2004118220011001100010001000223641927320001000100010001000100110001000
2004118820011001100010001000227591965620001000100010001000100110001000
2004119620011001100010001000227031945120001000100010001000100110001000

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrh w0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0110

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
50209712255016140156100054024710003185060953483550109402121000470221100044000310000040100
50204702535010340103100004010610003185144653512050109402121000470221100044000310000040100
50204701065010340103100004010610003185036153479750109402121000470221100044000310000040100
50204701065010340103100004010610013185196553528150153402531001570221100044000410000040100
50204701065010340103100004010610003185036153479750109402121000470221100044000310000040100
50204701065010340103100004010610003185130653510750109402121000470221100044000310000040100
50204701065010340103100004010610003185030753477950109402121000470221100044000310000040100
50204701065010340103100004010610003185030753477950109402121000470221100044000310000040100
50204701065010340103100004010610003185030753477950109402121000470221100044000310000040100
50204701065010340103100004010610003185036153479750109402121000470221100044000310000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0166

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5002971228500694006410005401561000318521255356685001940032100047002010000400071000040010
5002570261500314002910002400501000018518755355675001040020100007002010000400081000040010
5002470160500174001710000400101000018518755355675001040020100007002010000400071000040010
5002470158500174001710000400101000018518755355675001040020100007002010000400071000040010
5002470169500174001710000400101000018532255360175001040020100007002010000400081000040010
5002470174500174001710000400101000018518755355675001040020100007002010000400071000040010
5002470158500174001710000400101000018518755355675001040020100007002010000400071000040010
5002470170500184001810000400101000018518755355675001040020100007002010000400071000040010
5002470160500174001710000400101000018518215355495001040020100007010910013400181000040010
5002470160500174001710000400101000018518215355495001040020100007002010000400081000040010

Test 3: throughput

Count: 8

Code:

  ldrh w0, [x6, #8]!
  ldrh w0, [x7, #8]!
  ldrh w0, [x8, #8]!
  ldrh w0, [x9, #8]!
  ldrh w0, [x10, #8]!
  ldrh w0, [x11, #8]!
  ldrh w0, [x12, #8]!
  ldrh w0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5404

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209442401604148031380101803168001224061064132916012480212800128021280012800098000080100
160204432361601098010980000801128001224061064295916012480212800128021280012800098000080100
160204432371601098010980000801128001024061064378716012280212800128021280012800098000080100
160204432351601098010980000801128001124061063923616012380212800128025480054800518000080100
160204432361601058010580000801088001224062363945916012480212800128021280012800098000080100
160204432361601098010980000801128000824061064187216012080212800128021280012800098000080100
160204432311601098010980000801128000824065464550316012080212800128021280012800098000080100
160204432321601098010980000801128001024066064539416012280212800128021280012800098000080100
160204432301601098010980000801128001024061064466416012280212800128021280012800098000080100
160204432351601098010980000801128001124064264317816012380212800128021280012800098000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5404

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029445361603268022280104802258001024034064381916003280032800128002080000800018000080010
160024432341600118001180000800108000024030464224116001080020800008002080000800018000080010
160024432311600118001180000800108000024030464320716001080020800008002080000800018000080010
160024432311600118001180000800108000024030464404916001080020800008002080000800018000080010
160024432311600118001180000800108000024030464524716001080020800008002080000800018000080010
160024432311600118001180000800108000024030464446916001080020800008002080000800018000080010
160024432311600118001180000800108000024030464153416001080020800008002080000800018000080010
160024432311600118001180000800108000024030464358616001080020800008002080000800018000080010
160024432311600118001180000800108000024030464555416001080020800008002080000800018000080010
160024432311600118001180000800108000024030464446916001080020800008002080000800018000080010