Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (post-index, 64-bit)

Test 1: uops

Code:

  ldp x0, x1, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

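retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
3005 | 1578 | 2057 | 1029 | 1028 | 1028 | 1000 | 13208 | 14565 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1112 | 2001 | 1001 | 1000 | 1000 | 1000 | 13371 | 14569 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1102 | 2001 | 1001 | 1000 | 1000 | 1000 | 13330 | 14565 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1124 | 2001 | 1001 | 1000 | 1000 | 1000 | 13141 | 14594 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1157 | 2001 | 1001 | 1000 | 1000 | 1000 | 13243 | 14871 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1104 | 2001 | 1001 | 1000 | 1000 | 1000 | 13556 | 15110 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1143 | 2001 | 1001 | 1000 | 1000 | 1000 | 13296 | 14622 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1111 | 2001 | 1001 | 1000 | 1000 | 1000 | 13543 | 14772 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1156 | 2001 | 1001 | 1000 | 1000 | 1000 | 13253 | 14525 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1080 | 2001 | 1001 | 1000 | 1000 | 1000 | 13470 | 14885 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000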

Test 2: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp x0, x1, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 5.9003

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60209 | 126516 | 51504 | 41499 | 10005 | 40348 | 10003 | 2365307 | 701466 | 50209 | 40212 | 20008 | 70221 | 20008 | 41369 | 10000 | 50100
60204 | 89010 | 51469 | 41469 | 10000 | 40206 | 10003 | 2359655 | 699824 | 50209 | 40212 | 20008 | 70221 | 20008 | 41368 | 10000 | 50100
60204 | 88989 | 51469 | 41469 | 10000 | 40206 | 10003 | 2359735 | 699881 | 50209 | 40212 | 20008 | 70221 | 20008 | 41368 | 10000 | 50100
60204 | 88989 | 51469 | 41469 | 10000 | 40206 | 10003 | 2359735 | 699881 | 50209 | 40212 | 20008 | 70221 | 20008 | 41368 | 10000 | 50100
60204 | 88989 | 51469 | 41469 | 10000 | 40206 | 10013 | 2360032 | 699964 | 50253 | 40252 | 20028 | 70221 | 20008 | 41368 | 10000 | 50100
60204 | 88989 | 51469 | 41469 | 10000 | 40206 | 10003 | 2359735 | 699881 | 50209 | 40212 | 20008 | 70221 | 20008 | 41369 | 10000 | 50100
60204 | 88989 | 51469 | 41469 | 10000 | 40206 | 10003 | 2359735 | 699881 | 50209 | 40212 | 20008 | 70221 | 20008 | 41369 | 10000 | 50100
60204 | 88989 | 51469 | 41469 | 10000 | 40206 | 10003 | 2359303 | 699753 | 50209 | 40212 | 20008 | 70221 | 20008 | 41369 | 10000 | 50100
60204 | 88989 | 51469 | 41469 | 10000 | 40206 | 10003 | 2359735 | 699881 | 50209 | 40212 | 20008 | 70291 | 20028 | 41378 | 10000 | 50100
60204 | 89317 | 51470 | 41470 | 10000 | 40206 | 10003 | 2359735 | 699881 | 50209 | 40212 | 20008 | 70221 | 20008 | 41369 | 10000 | 50100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 5.9018

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60029 | 124177 | 51324 | 41319 | 10005 | 40166 | 10003 | 2359591 | 700595 | 50029 | 40032 | 20008 | 70020 | 20000 | 0 | 41277 | 10000 | 0 | 50010
60024 | 88986 | 51287 | 41287 | 10000 | 40020 | 10000 | 2359318 | 700492 | 50020 | 40020 | 20000 | 70020 | 20000 | 0 | 41276 | 10000 | 0 | 50010
60024 | 88969 | 51286 | 41286 | 10000 | 40020 | 10000 | 2360749 | 700916 | 50020 | 40020 | 20000 | 70020 | 20000 | 0 | 41276 | 10000 | 0 | 50010
60024 | 89190 | 51288 | 41288 | 10000 | 40020 | 10000 | 2362261 | 701370 | 50020 | 40020 | 20000 | 70300 | 20088 | 0 | 41318 | 10000 | 0 | 50010
60024 | 89132 | 51287 | 41287 | 10000 | 40020 | 10000 | 2359318 | 700492 | 50020 | 40020 | 20000 | 70111 | 20028 | 0 | 41284 | 10000 | 0 | 50010
60024 | 88984 | 51287 | 41287 | 10000 | 40020 | 10000 | 2359642 | 700588 | 50020 | 40020 | 20000 | 70020 | 20000 | 0 | 41276 | 10000 | 0 | 50010
60024 | 88984 | 51287 | 41287 | 10000 | 40020 | 10000 | 2358913 | 700372 | 50020 | 40020 | 20000 | 70020 | 20000 | 0 | 41275 | 10000 | 0 | 50010
60024 | 88977 | 51287 | 41287 | 10000 | 40020 | 10000 | 2358724 | 700316 | 50020 | 40020 | 20000 | 70020 | 20000 | 0 | 41276 | 10000 | 0 | 50010
60024 | 88977 | 51287 | 41287 | 10000 | 40020 | 10000 | 2358319 | 700196 | 50020 | 40020 | 20000 | 70020 | 20000 | 0 | 41275 | 10000 | 0 | 50010
60024 | 89278 | 51286 | 41286 | 10000 | 40020 | 10000 | 2358373 | 700212 | 50020 | 40020 | 20000 | 70020 | 20000 | 0 | 41275 | 10000 | 0 | 50010

Test 3: Latency 2->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp x0, x1, [x6], #8
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 5.9016

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60209 | 123915 | 51505 | 41500 | 10005 | 40348 | 10003 | 2359574 | 699800 | 50209 | 40212 | 20008 | 70221 | 20008 | 41370 | 10000 | 50100
60205 | 89032 | 51480 | 41478 | 10002 | 40240 | 10003 | 2360086 | 699985 | 50209 | 40212 | 20008 | 70221 | 20008 | 41370 | 10000 | 50100
60204 | 89002 | 51470 | 41470 | 10000 | 40206 | 10003 | 2360086 | 699985 | 50209 | 40212 | 20008 | 70221 | 20008 | 41370 | 10000 | 50100
60204 | 89002 | 51470 | 41470 | 10000 | 40206 | 10003 | 2360086 | 699985 | 50209 | 40212 | 20008 | 70221 | 20008 | 41370 | 10000 | 50100
60204 | 89002 | 51470 | 41470 | 10000 | 40206 | 10003 | 2360086 | 699985 | 50209 | 40212 | 20008 | 70221 | 20008 | 41370 | 10000 | 50100
60204 | 89002 | 51470 | 41470 | 10000 | 40206 | 10003 | 2361868 | 700513 | 50209 | 40212 | 20008 | 70221 | 20008 | 41370 | 10000 | 50100
60204 | 89002 | 51470 | 41470 | 10000 | 40206 | 10003 | 2362813 | 700798 | 50209 | 40212 | 20008 | 70221 | 20008 | 41370 | 10000 | 50100
60205 | 89169 | 51482 | 41480 | 10002 | 40240 | 10003 | 2360383 | 700073 | 50209 | 40212 | 20008 | 70221 | 20008 | 41370 | 10000 | 50100
60204 | 89002 | 51470 | 41470 | 10000 | 40206 | 10003 | 2360086 | 699985 | 50209 | 40212 | 20008 | 70221 | 20008 | 41370 | 10000 | 50100
60204 | 89002 | 51470 | 41470 | 10000 | 40206 | 10003 | 2360086 | 699985 | 50209 | 40212 | 20008 | 70221 | 20008 | 41370 | 10000 | 50100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 5.8977

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60030 | 123908 | 51332 | 41325 | 10007 | 40200 | 10003 | 2365288 | 702284 | 50029 | 40032 | 20008 | 70020 | 20000 | 41278 | 10000 | 50010
60025 | 89559 | 51301 | 41299 | 10002 | 40059 | 10000 | 2365421 | 702268 | 50020 | 40020 | 20000 | 70020 | 20000 | 41275 | 10000 | 50010
60024 | 88956 | 51286 | 41286 | 10000 | 40020 | 10000 | 2358994 | 700396 | 50020 | 40020 | 20000 | 70020 | 20000 | 41275 | 10000 | 50010
60024 | 88956 | 51286 | 41286 | 10000 | 40020 | 10000 | 2358994 | 700396 | 50020 | 40020 | 20000 | 70020 | 20000 | 41275 | 10000 | 50010
60024 | 88956 | 51286 | 41286 | 10000 | 40020 | 10000 | 2358562 | 700268 | 50020 | 40020 | 20000 | 70020 | 20000 | 41275 | 10000 | 50010
60024 | 88956 | 51286 | 41286 | 10000 | 40020 | 10000 | 2358562 | 700268 | 50020 | 40020 | 20000 | 70020 | 20000 | 41275 | 10000 | 50010
60025 | 88986 | 51296 | 41294 | 10002 | 40060 | 10000 | 2358994 | 700396 | 50020 | 40020 | 20000 | 70020 | 20000 | 41276 | 10000 | 50010
60024 | 88956 | 51286 | 41286 | 10000 | 40020 | 10000 | 2358562 | 700268 | 50020 | 40020 | 20000 | 70020 | 20000 | 41275 | 10000 | 50010
60024 | 88956 | 51286 | 41286 | 10000 | 40020 | 10000 | 2358562 | 700268 | 50020 | 40020 | 20000 | 70020 | 20000 | 41275 | 10000 | 50010
60024 | 88956 | 51286 | 41286 | 10000 | 40020 | 10000 | 2358562 | 700268 | 50020 | 40020 | 20000 | 70020 | 20000 | 41276 | 10000 | 50010

Test 4: throughput

Count: 8

Code:

  ldp x0, x1, [x6], #8
  ldp x0, x1, [x7], #8
  ldp x0, x1, [x8], #8
  ldp x0, x1, [x9], #8
  ldp x0, x1, [x10], #8
  ldp x0, x1, [x11], #8
  ldp x0, x1, [x12], #8
  ldp x0, x1, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7632

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240209 | 62184 | 160368 | 80240 | 80128 | 80241 | 80008 | 240788 | 252069 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61075 | 160111 | 80106 | 80005 | 80108 | 80008 | 240790 | 252012 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61055 | 160111 | 80106 | 80005 | 80108 | 80008 | 240722 | 251869 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61053 | 160111 | 80106 | 80005 | 80108 | 80008 | 240722 | 251877 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61056 | 160111 | 80106 | 80005 | 80108 | 80008 | 240722 | 251885 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61060 | 160111 | 80106 | 80005 | 80108 | 80008 | 240722 | 251885 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61053 | 160111 | 80106 | 80005 | 80108 | 80008 | 240722 | 251884 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61054 | 160111 | 80106 | 80005 | 80108 | 80008 | 240722 | 251872 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61056 | 160111 | 80106 | 80005 | 80108 | 80008 | 240722 | 251854 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61057 | 160111 | 80106 | 80005 | 80108 | 80008 | 240722 | 251877 | 160116 | 80208 | 160016 | 80235 | 160073 | 80033 | 80000 | 160100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7627

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240029 | 62140 | 160288 | 80149 | 80139 | 80151 | 80009 | 240526 | 252197 | 160027 | 80028 | 160018 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61017 | 160011 | 80011 | 80000 | 80010 | 80000 | 240495 | 252004 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61021 | 160011 | 80011 | 80000 | 80010 | 80000 | 240494 | 252024 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61017 | 160011 | 80011 | 80000 | 80010 | 80000 | 240500 | 252019 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61020 | 160011 | 80011 | 80000 | 80010 | 80000 | 240494 | 252090 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61018 | 160011 | 80011 | 80000 | 80010 | 80000 | 240494 | 252024 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61017 | 160011 | 80011 | 80000 | 80010 | 80000 | 240493 | 252026 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61018 | 160011 | 80011 | 80000 | 80010 | 80000 | 240490 | 252027 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61018 | 160011 | 80011 | 80000 | 80010 | 80000 | 240493 | 252022 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61016 | 160011 | 80011 | 80000 | 80010 | 80000 | 240493 | 252017 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010