Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (post-index, 32-bit)

Test 1: uops

Code:

  ldp w0, w1, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
3005129820381023101510281000132261506520001000200010002000100110002000
3004111320011001100010001000132771474920001000200010002000100110002000
3004110820011001100010001000138061566720001000200010002000100110002000
3004108620011001100010001000135291503620001000200010002000100110002000
3004107520011001100010001000135471509820001000200010002000100110002000
3004107620011001100010001000134671460820001000200010002000100110002000
3004109320011001100010001000133741501120001000200010002000100110002000
3004113620011001100010001000136421497320001000200010002000100110002000
3004112020011001100010001000133891465720001000200010002000100110002000
3004108020011001100010001000134401499720001000200010002000100110002000

Test 2: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp w0, w1, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0108

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6020971439502624025710005403481000318509335491825020940212200087022120008401041000050100
6020470145502044020410000402061000318504475490275020940212200087022120008401031000050100
6020470132502034020310000402061000318506355491205020940212200087022120008401031000050100
6020470118502034020310000402061000318506355491205020940212200087022120008401031000050100
6020470118502034020310000402061000318505545490935020940212200087022120008401031000050100
6020470110502034020310000402061001318524465496735025340252200287022120008401031000050100
6020470140502034020310000402061000318516345494535020940212200087022120008401031000050100
6020470136502034020310000402061000318508515491925020940212200087022120008401031000050100
6020470124502034020310000402061000318509595492285020940212200087022120008401031000050100
6020470168502034020310000402061001318520975495575025340252200287029320028401161000050100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0180

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6002971247500804007510005401661000318499525496085002940032200087002020000400131000050010
6002470094500234002310000400201000018499485496165002040020200007002020000400131000050010
6002570177500364003410002400601000018499485496165002040020200007002020000400131000050010
6002470090500234002310000400201000018499485496165002040020200007002020000400131000050010
6002470090500234002310000400201000018499485496165002040020200007002020000400131000050010
6002470090500234002310000400201000018499485496165002040020200007002020000400131000050010
6002470090500234002310000400201000018499485496165002040020200007002020000400131000050010
6002470095500234002310000400201000018499485496165002040020200007002020000400131000050010
6002470090500234002310000400201000018499485496165002040020200007002020000400131000050010
6002470090500234002310000400201000018499485496165002040020200007011120028400241000050010

Test 3: Latency 2->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp w0, w1, [x6], #8
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0160

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6020971227502594025410005403481000318514475492975020940212200087022120008401061000050100
6020470159502064020610000402061000318516885494315020940212200087022120008401071000050100
6020470160502074020710000402061000318516885494315020940212200087022120008401071000050100
6020470160502074020710000402061000318517425494495020940212200087022120008401071000050100
6020470160502074020710000402061000318516885494315020940212200087029120028401191000050100
6020470208502074020710000402061000318522555496205020940212200087022120008401071000050100
6020470198502074020710000402061000318534165500055020940212200087022120008401071000050100
6020470160502074020710000402061000318514725493595020940212200087022120008401081000050100
6020470206502074020710000402061000318524175496745020940212200087022120008401071000050100
6020470149502074020710000402061000318517965494675020940212200087022120008401071000050100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0133

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6002971306500804007510005401661000318508965499505002940032200087002020000400141000050010
6002470119500244002410000400201000018506775498565002040020200007002020000400141000050010
6002470117500244002410000400201000018506775498565002040020200007002020000400141000050010
6002470117500244002410000400201000018506775498565002040020200007002020000400141000050010
6002470117500244002410000400201000018506775498565002040020200007002020000400141000050010
6002470117500244002410000400201000018506775498565002040020200007002020000400141000050010
6002470117500244002410000400201000018506775498565002040020200007002020000400141000050010
6002470117500244002410000400201001318524315503915007340072200287002020000400141000050010
6002470117500244002410000400201000018506775498565002040020200007002020000400141000050010
6002470117500244002410000400201000018506775498565002040020200007002020000400141000050010

Test 4: throughput

Count: 8

Code:

  ldp w0, w1, [x6], #8
  ldp w0, w1, [x7], #8
  ldp w0, w1, [x8], #8
  ldp w0, w1, [x9], #8
  ldp w0, w1, [x10], #8
  ldp w0, w1, [x11], #8
  ldp w0, w1, [x12], #8
  ldp w0, w1, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7517

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240209612001603308024080090802418000824047525143416011680208160016802081600168000680000160100
240204601351601118010680005801088000824047425138616011680208160016802081600168000680000160100
240204601371601118010680005801088000824047525148016011680208160016802081600168000680000160100
240204601381601118010680005801088000824047725142216011680208160016802081600168000680000160100
240204601351601118010680005801088000824047525139816011680208160016802081600168000680000160100
240204601371601118010680005801088000824048625138816011680208160016802081600168000680000160100
240204601371601118010680005801088000824047225138116011680208160016802081600168000680000160100
240204601381601118010680005801088000824047525136716011680208160016802081600168000680000160100
240204601361601118010680005801088000824047625138816011680208160016802081600168000680000160100
240204601331601118010680005801088000824047525136216011680208160016802081600168000680000160100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7509

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240029612281602498014980100801518000824020525139216002680028160016800201600008000180000160010
240024600801600118001180000800108000024011625121616001080020160000800561600728003480000160010
240024600811600118001180000800108000024011625122116001080020160000800201600008000180000160010
240024600821600118001180000800108000924040925199216002780028160018800201600008000180000160010
240024601031600118001180000800108000024030725154916001080020160000800201600008000180000160010
240024601031600118001180000800108000024031025154916001080020160000800201600008000180000160010
240024601081600118001180000800108000024030725144416001080020160000800201600008000180000160010
240024601031600118001180000800108000024030525155116001080020160000800201600008000180000160010
240024601081600118001180000800108000024031125154916001080020160000800201600008000180000160010
240024601041600118001180000800108000024030625154816001080020160000800201600008000180000160010