Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (pre-index, 32-bit)

Test 1: uops

Code:

  ldrsb w0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005123120311018101310361000205721770920001000100010001000100110001000
2004107620011001100010001000204741812320001000100010001000100110001000
2004106820011001100010001000211721790920001000100010001000100110001000
2004108220011001100010001000211261735520001000100010001000100110001000
2004108720011001100010001000205821804620001000100010001000100110001000
2004108320011001100010001000212121760520001000100010001000100110001000
2004108020011001100010001000214861767620001000100010001000100110001000
2004108020011001100010001000213811770120001000100010001000100110001000
2004107620011001100010001000210631797420001000100010001000100110001000
2004110320011001100010001000213181758520001000100010001000100110001000

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb w0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0121

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020971430501674016210005402471000318505605347885010940212100047022110004400041000040100
5020570208501174011510002401401000318506365348445010940212100047022110004400041000040100
5020470117501044010410000401061000318506585348825010940212100047022110004400041000040100
5020470117501044010410000401061000318506585348825010940212100047022110004400041000040100
5020470117501044010410000401061000318506585348825010940212100047022110004400041000040100
5020470117501044010410000401061000318506585348825010940212100047022110004400041000040100
5020470117501044010410000401061000318506585348825010940212100047022110004400041000040100
5020470117501044010410000401061000318506585348825010940212100047022110004400041000040100
5020470117501044010410000401061001218542365359565015240251100137029310014400161000040100
5020570287501184011610002401401000318507125349005010940212100047022110004400041000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0117

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5002971386500704006510005401561000318508835352835001940032100047002010000400041000040010
5002470110500144001410000400101000018503365350835001040020100007002010000400041000040010
5002470113500144001410000400101000018506335351825001040020100007002010000400041000040010
5002470119500144001410000400101000018505525351555001040020100007002010000400041000040010
5002570187500274002510002400481000018507605352235001040020100007002010000400041000040010
5002470105500144001410000400101000018510115353085001040020100007011310014400181000040010
5002470275500294002610003400461001018529975361335005640060100107002010000400041000040010
5002470116500144001410000400101000018506335351825001040020100007002010000400041000040010
5002470109500144001410000400101000018504715351285001040020100007002010000400041000040010
5002470104500144001410000400101000018505795351625001040020100007002010000400041000040010

Test 3: throughput

Count: 8

Code:

  ldrsb w0, [x6, #8]!
  ldrsb w0, [x7, #8]!
  ldrsb w0, [x8, #8]!
  ldrsb w0, [x9, #8]!
  ldrsb w0, [x10, #8]!
  ldrsb w0, [x11, #8]!
  ldrsb w0, [x12, #8]!
  ldrsb w0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5403

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209444261604278031380114803168001024061064736016012280212800128021280012800098000080100
160204432361601098010980000801128001124052964484216012380212800128021280012800098000080100
160204432241601098010980000801128001224052964350316012480212800128021280012800098000080100
160204432241601098010980000801128001224052964344816012480212800128021280012800098000080100
160204432241601098010980000801128001124052964452416012380212800128021280012800098000080100
160204432241601098010980000801128001124052964596516012380212800128021280012800098000080100
160204432241601098010980000801128001124052964597616012380212800128021280012800098000080100
160204432241601098010980000801128001124052964224116012380212800128021280012800098000080100
160204432241601098010980000801128001224052963935516012480212800128021280012800098000080100
160204432241601098010980000801128001124052964194416012380212800128021280012800098000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5409

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029443761603408022280118802258000924034064259216003180032800128002080000800018000080010
160025433181600828005280030800518000024030464422516001080020800008003280012800098000080010
160024432761600118001180000800108000024070764498416001080020800008002080000800018000080010
160024432721600118001180000800108000024070764069216001080020800008002080000800018000080010
160024432721600118001180000800108000024070464304716001080020800008002080000800018000080010
160024432701600118001180000800108000024070464608516001080020800008002080000800018000080010
160024432721600118001180000800108000024070264572316001080020800008002080000800018000080010
160024432721600118001180000800108000024070764191116001080020800008002080000800018000080010
160024432721600118001180000800108000024070264518816001080020800008002080000800018000080010
160024432721600118001180000800108000024070464601016001080020800008002080000800018000080010