Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDPSW (pre-index)

Test 1: uops

Code:

  ldpsw x0, x1, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
3005136720361020101610281000130941487920001000200010002000100110002000
3004111920011001100010001000135761486020001000200010002000100110002000
3004113120011001100010001000135511481820001000200010002000100110002000
3004110120011001100010001000131851447520001000200010002000100110002000
3004109920011001100010001000135801490020001000200010002000100110002000
3004109720011001100010001000133481459420001000200010002000100110002000
3004109820011001100010001000134061466820001000200010002000100110002000
3004113520011001100010001000134341499420001000200010002000100110002000
3004109820011001100010001000136951495820001000200010002000100110002000
3004113920011001100010001000137721493920001000200010002000100110002000

Test 2: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldpsw x0, x1, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0140

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6020971246502614025610005403481000318501235489085020940212200087022120008401031000050100
6020470094502034020310000402061000318498525488555020940212200087022120008401031000050100
6020470092502034020310000402061000318498525488555020940212200087022120008401031000050100
6020470092502034020310000402061000318498525488555020940212200087022120008401031000050100
6020470092502034020310000402061000318498525488555020940212200087022120008401031000050100
6020470092502034020310000402061000318498525488555020940212200087029120028401141000050100
6020570259502184021610002402391000318510675492535020940212200087022120008401031000050100
6020470092502034020310000402061000318498525488555020940212200087022120008401031000050100
6020470092502034020310000402061000318498525488555020940212200087022120008401031000050100
6020470094502034020310000402061000318522285496455020940212200087022120008401031000050100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0106

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6002971232500804007510005401661000318505455498335002940032200087002020000400141000050010
6002470113500244002410000400201000018503265497465002040020200007002020000400131000050010
6002470104500234002310000400201000018503805497645002040020200007002020000400131000050010
6002470104500234002310000400201000018503265497465002040020200007002020000400131000050010
6002470104500234002310000400201000018503265497465002040020200007002020000400131000050010
6002470104500234002310000400201000018503265497465002040020200007011120028400241000050010
6002470131500234002310000400201000018503265497465002040020200007002020000400131000050010
6002470104500234002310000400201000018503265497465002040020200007002020000400131000050010
6002470104500234002310000400201000018503265497465002040020200007002020000400131000050010
6002470104500234002310000400201000018503265497465002040020200007002020000400131000050010

Test 3: Latency 2->3 (with chain penalty)

Chain cycles: 3

Code:

  ldpsw x0, x1, [x6, #8]!
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0112

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6020971571502614025610005403481000318504735490595020940212200087022120008401041000050100
6020470117502044020410000402061001318518795509015025340252200287022120008401031000050100
6020570189502164021410002402401000318509595492285020940212200087022120008401031000050100
6020470106502034020310000402061000318502305489855020940212200087022120008401031000050100
6020570197502164021410002402401000318510405492555020940212200087022120008401031000050100
6020470134502034020310000402061000318508245491835020940212200087022120008401031000050100
6020470119502034020310000402061000318509865492375020940212200087022120008401031000050100
6020470117502034020310000402061000318507975491745020940212200087022120008401031000050100
6020470132502034020310000402061000318510135492445020940212200087022120008401031000050100
6020570240502184021610002402401000318510135492465020940212200087022120008401031000050100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0142

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6002971226500794007410005401661000318505995498515002940032200087002020000400141000050010
6002470122500254002510000400201000018506505498405002040020200007002020000400141000050010
6002570186500374003510002400591000018504885497935002040020200007002020000400141000050010
6002470111500244002410000400201000018505155498025002040020200007002020000400141000050010
6002470111500244002410000400201000018506505498405002040020200007002020000400141000050010
6002470111500244002410000400201000018506505498405002040020200007002020000400141000050010
6002470111500244002410000400201000018506505498405002040020200007002020000400141000050010
6002470111500244002410000400201000018506505498405002040020200007002020000400141000050010
6002470111500244002410000400201001318526015504435007340072200287002020000400151000050010
6002470111500244002410000400201000018506505498405002040020200007002020000400141000050010

Test 4: throughput

Count: 8

Code:

  ldpsw x0, x1, [x6, #8]!
  ldpsw x0, x1, [x7, #8]!
  ldpsw x0, x1, [x8, #8]!
  ldpsw x0, x1, [x9, #8]!
  ldpsw x0, x1, [x10, #8]!
  ldpsw x0, x1, [x11, #8]!
  ldpsw x0, x1, [x12, #8]!
  ldpsw x0, x1, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7516

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240209612051603528024080112802418000924047525129816011780208160018802081600168000680000160100
240204601531601118010680005801088000824047625127816011680208160016802081600168000680000160100
240204601441601118010680005801088000824047525127416011680208160016802081600168000680000160100
240204601331601118010680005801088000824047425126116011680208160016802081600168000680000160100
240204601311601118010680005801088000824048725124416011680208160016802081600168000680000160100
240204601331601118010680005801088000824047525131216011680208160016802081600168000680000160100
240204601301601118010680005801088000824047225128816011680208160016802081600168000680000160100
240204601361601118010680005801088000824047625129616011680208160016802081600168000680000160100
240204601341601118010680005801088000824047625131716011680208160016802081600168000680000160100
240204601341601118010680005801088000824047525126816011680208160016802081600168000680000160100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7515

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240029613681602558014980106801518000824020325159316002680028160016800201600008000180000160010
240024600911600118001180000800108000024016825130416001080020160000800201600008000180000160010
240024600891600118001180000800108000024016725127416001080020160000800201600008000180000160010
240024600881600118001180000800108000024016725128116001080020160000800201600008000180000160010
240024600811600118001180000800108000024016825128016001080020160000800541600698003280000160010
240024600811600118001180000800108000024016725127116001080020160000800201600008000180000160010
240024600951600118001180000800108000024016825127516001080020160000800201600008000180000160010
240024600881600118001180000800108000024016725128116001080020160000800201600008000180000160010
240025602621600738004380030800458000024016725112616001080020160000800201600008000180000160010
240024600841600118001180000800108003624038825416216008180055160073800201600008000180000160010