Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (post-index, 32-bit)

Test 1: uops

Code:

  ldr w0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

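retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005 | 1266 | 2033 | 1020 | 1013 | 1036 | 1000 | 19418 | 17946 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1077 | 2001 | 1001 | 1000 | 1000 | 1000 | 20840 | 17462 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1097 | 2001 | 1001 | 1000 | 1000 | 1000 | 21056 | 17629 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1089 | 2001 | 1001 | 1000 | 1000 | 1000 | 21273 | 17701 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1061 | 2001 | 1001 | 1000 | 1000 | 1000 | 21309 | 17628 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1084 | 2001 | 1001 | 1000 | 1000 | 1000 | 21176 | 17628 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1095 | 2001 | 1001 | 1000 | 1000 | 1000 | 21021 | 17663 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1072 | 2001 | 1001 | 1000 | 1000 | 1000 | 21071 | 17586 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1086 | 2001 | 1001 | 1000 | 1000 | 1000 | 21319 | 17648 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1083 | 2001 | 1001 | 1000 | 1000 | 1000 | 21246 | 17974 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000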

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0099

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50210 | 71541 | 50175 | 40168 | 10007 | 40280 | 10003 | 1850204 | 534656 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70101 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70099 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70099 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70099 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70099 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70427 | 10034 | 40038 | 10000 | 40100
50204 | 70122 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850982 | 534937 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70099 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70099 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70099 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850172 | 534676 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0101

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
50029 | 71283 | 50070 | 40065 | 10005 | 40156 | 10003 | 1850316 | 535056 | 50019 | 40032 | 10004 | 70041 | 10004 | 0 | 40004 | 10000 | 0 | 40010
50024 | 70099 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850228 | 535009 | 50010 | 40020 | 10000 | 70020 | 10000 | 0 | 40004 | 10000 | 0 | 40010
50024 | 70099 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850228 | 535009 | 50010 | 40020 | 10000 | 70020 | 10000 | 0 | 40004 | 10000 | 0 | 40010
50024 | 70099 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850228 | 535009 | 50010 | 40020 | 10000 | 70020 | 10000 | 0 | 40004 | 10000 | 0 | 40010
50025 | 70247 | 50027 | 40025 | 10002 | 40050 | 10000 | 1852172 | 535620 | 50010 | 40020 | 10000 | 70020 | 10000 | 0 | 40004 | 10000 | 0 | 40010
50024 | 70110 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850228 | 535009 | 50010 | 40020 | 10000 | 70020 | 10000 | 0 | 40004 | 10000 | 0 | 40010
50024 | 70099 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850228 | 535009 | 50010 | 40020 | 10000 | 70020 | 10000 | 0 | 40004 | 10000 | 0 | 40010
50024 | 70099 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850228 | 535009 | 50010 | 40020 | 10000 | 70020 | 10000 | 0 | 40004 | 10000 | 0 | 40010
50024 | 70099 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850228 | 535009 | 50010 | 40020 | 10000 | 70020 | 10000 | 0 | 40004 | 10000 | 0 | 40010
50024 | 70099 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850228 | 535009 | 50010 | 40020 | 10000 | 70020 | 10000 | 0 | 40004 | 10000 | 0 | 40010

Test 3: throughput

Count: 8

Code:

  ldr w0, [x6], #8
  ldr w0, [x7], #8
  ldr w0, [x8], #8
  ldr w0, [x9], #8
  ldr w0, [x10], #8
  ldr w0, [x11], #8
  ldr w0, [x12], #8
  ldr w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5410

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209 | 44233 | 160422 | 80313 | 0 | 80109 | 80316 | 0 | 80009 | 240578 | 640996 | 160121 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43229 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80010 | 240578 | 643747 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43224 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80008 | 240578 | 642239 | 160120 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43226 | 160110 | 80109 | 0 | 80001 | 80112 | 0 | 80008 | 240578 | 642451 | 160120 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160205 | 43460 | 160179 | 80149 | 0 | 80030 | 80150 | 0 | 80009 | 240578 | 644092 | 160121 | 80212 | 80012 | 80254 | 80054 | 80051 | 80000 | 80100
160204 | 43225 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80008 | 240578 | 641879 | 160120 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43225 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80008 | 240578 | 643638 | 160120 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43224 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80012 | 240578 | 641421 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43225 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80009 | 240578 | 640570 | 160121 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43238 | 160109 | 80109 | 0 | 80000 | 80112 | 0 | 80010 | 240587 | 645497 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029 | 44544 | 160335 | 80223 | 80112 | 80226 | 80012 | 240308 | 645890 | 160034 | 80032 | 80012 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43222 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 642163 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43215 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 644246 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43213 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 642344 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43213 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 641078 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43220 | 160011 | 80011 | 80000 | 80010 | 80164 | 240737 | 646881 | 160338 | 80184 | 80164 | 80184 | 80164 | 80165 | 80000 | 80010
160024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 646041 | 160010 | 80020 | 80000 | 80073 | 80053 | 80050 | 80000 | 80010
160024 | 43215 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 643703 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43324 | 160082 | 80052 | 80030 | 80051 | 80000 | 240179 | 636940 | 160010 | 80020 | 80000 | 80102 | 80082 | 80083 | 80000 | 80010
160024 | 43215 | 160011 | 80011 | 80000 | 80010 | 80000 | 240179 | 645274 | 160010 | 80020 | 80000 | 80102 | 80082 | 80083 | 80000 | 80010