Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (pre-index, 64-bit)

Test 1: uops

Code:

  ldr x0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

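retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005 | 1224 | 2032 | 1019 | 1013 | 1036 | 1000 | 20868 | 17558 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1101 | 2001 | 1001 | 1000 | 1000 | 1000 | 20738 | 17615 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1088 | 2001 | 1001 | 1000 | 1000 | 1000 | 21230 | 17806 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1112 | 2001 | 1001 | 1000 | 1000 | 1000 | 21282 | 17804 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1085 | 2001 | 1001 | 1000 | 1000 | 1000 | 21343 | 18707 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1087 | 2001 | 1001 | 1000 | 1000 | 1000 | 21276 | 17806 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1113 | 2001 | 1001 | 1000 | 1000 | 1000 | 21310 | 18003 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1085 | 2001 | 1001 | 1000 | 1000 | 1000 | 21236 | 17803 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1087 | 2001 | 1001 | 1000 | 1000 | 1000 | 21238 | 17771 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1088 | 2001 | 1001 | 1000 | 1000 | 1000 | 21264 | 17802 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000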

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr x0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0110

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50209 | 71425 | 50161 | 40156 | 10005 | 40247 | 10003 | 1851635 | 535079 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100
50204 | 70114 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850415 | 534815 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100
50204 | 70110 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850469 | 534833 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100
50204 | 70110 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850469 | 534833 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100
50204 | 70110 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850415 | 534815 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100
50204 | 70110 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850469 | 534833 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100
50204 | 70110 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850469 | 534833 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100
50204 | 70110 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850469 | 534833 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100
50204 | 70110 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850469 | 534833 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100
50204 | 70110 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850469 | 534833 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0167

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50029 | 71373 | 50070 | 40065 | 10005 | 40156 | 10003 | 1850186 | 534975 | 0 | 50019 | 40032 | 10004 | 0 | 70020 | 10000 | 40004 | 10000 | 40010
50024 | 70101 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850174 | 535029 | 0 | 50010 | 40020 | 10000 | 0 | 70020 | 10000 | 40004 | 10000 | 40010
50024 | 70090 | 50013 | 40013 | 10000 | 40010 | 10000 | 1850309 | 535074 | 0 | 50010 | 40020 | 10000 | 0 | 70020 | 10000 | 40004 | 10000 | 40010
50024 | 70141 | 50013 | 40013 | 10000 | 40010 | 10000 | 1851092 | 535332 | 0 | 50010 | 40020 | 10000 | 0 | 70020 | 10000 | 40004 | 10000 | 40010
50024 | 70090 | 50013 | 40013 | 10000 | 40010 | 10000 | 1850174 | 535029 | 0 | 50010 | 40020 | 10000 | 0 | 70020 | 10000 | 40004 | 10000 | 40010
50024 | 70090 | 50013 | 40013 | 10000 | 40010 | 10000 | 1850174 | 535029 | 0 | 50010 | 40020 | 10000 | 0 | 70109 | 10013 | 40015 | 10000 | 40010
50024 | 70113 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850741 | 535218 | 0 | 50010 | 40020 | 10000 | 0 | 70020 | 10000 | 40004 | 10000 | 40010
50024 | 70139 | 50013 | 40013 | 10000 | 40010 | 10000 | 1850849 | 535250 | 0 | 50010 | 40020 | 10000 | 0 | 70020 | 10000 | 40004 | 10000 | 40010
50024 | 70156 | 50013 | 40013 | 10000 | 40010 | 10000 | 1850876 | 535262 | 0 | 50010 | 40020 | 10000 | 0 | 70020 | 10000 | 40004 | 10000 | 40010
50025 | 70175 | 50026 | 40024 | 10002 | 40050 | 10000 | 1850174 | 535029 | 0 | 50010 | 40020 | 10000 | 0 | 70020 | 10000 | 40004 | 10000 | 40010

Test 3: throughput

Count: 8

Code:

  ldr x0, [x6, #8]!
  ldr x0, [x7, #8]!
  ldr x0, [x8, #8]!
  ldr x0, [x9, #8]!
  ldr x0, [x10, #8]!
  ldr x0, [x11, #8]!
  ldr x0, [x12, #8]!
  ldr x0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5407

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209 | 44205 | 160412 | 80312 | 80100 | 80315 | 80012 | 240810 | 642823 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43252 | 160107 | 80107 | 80000 | 80110 | 80011 | 240815 | 645354 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160205 | 43335 | 160181 | 80150 | 80031 | 80153 | 80011 | 240838 | 642754 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43250 | 160109 | 80109 | 80000 | 80112 | 80009 | 240845 | 642031 | 160119 | 80210 | 80010 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43246 | 160107 | 80107 | 80000 | 80110 | 80009 | 240815 | 641165 | 160121 | 80212 | 80012 | 80210 | 80010 | 80007 | 80000 | 80100
160204 | 43248 | 160109 | 80109 | 80000 | 80112 | 80011 | 240814 | 639221 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43247 | 160109 | 80109 | 80000 | 80112 | 80011 | 240842 | 640650 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43246 | 160109 | 80109 | 80000 | 80112 | 80012 | 240800 | 636113 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43246 | 160109 | 80109 | 80000 | 80112 | 80012 | 240815 | 642812 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43249 | 160110 | 80109 | 80001 | 80112 | 80011 | 240801 | 645580 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5404

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029 | 44414 | 160327 | 80223 | 80104 | 80226 | 80008 | 240340 | 643711 | 160030 | 80032 | 80012 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43228 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 644195 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 644485 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 645730 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 641466 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43232 | 160011 | 80011 | 80000 | 80010 | 80164 | 240810 | 645508 | 160338 | 80184 | 80164 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 643143 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 642720 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 643693 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 645595 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010