Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (pre-index, 64-bit)

Test 1: uops

Code:

  ldp x0, x1, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

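retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
3005 | 1515 | 2057 | 1029 | 1028 | 1028 | 1000 | 13182 | 14528 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1069 | 2001 | 1001 | 1000 | 1000 | 1000 | 13265 | 15121 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1084 | 2001 | 1001 | 1000 | 1000 | 1000 | 13183 | 14145 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1105 | 2001 | 1001 | 1000 | 1000 | 1000 | 13528 | 14844 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1094 | 2001 | 1001 | 1000 | 1000 | 1000 | 13352 | 14536 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1066 | 2001 | 1001 | 1000 | 1000 | 1028 | 13782 | 14703 | 2056 | 1028 | 2057 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1062 | 2001 | 1001 | 1000 | 1000 | 1000 | 13441 | 14678 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1084 | 2001 | 1001 | 1000 | 1000 | 1000 | 13398 | 14797 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1083 | 2001 | 1001 | 1000 | 1000 | 1000 | 13763 | 15374 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000
3004 | 1112 | 2001 | 1001 | 1000 | 1000 | 1000 | 13737 | 14848 | 2000 | 1000 | 2000 | 1000 | 2000 | 1001 | 1000 | 2000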

Test 2: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp x0, x1, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 5.9003

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60209 | 126479 | 51504 | 41499 | 10005 | 40348 | 10003 | 2362869 | 700739 | 50209 | 40212 | 20008 | 70221 | 20008 | 41373 | 10000 | 50100
60204 | 89376 | 51474 | 41474 | 10000 | 40206 | 10003 | 2367230 | 702098 | 50209 | 40212 | 20008 | 70221 | 20008 | 41373 | 10000 | 50100
60204 | 89003 | 51473 | 41473 | 10000 | 40206 | 10003 | 2360129 | 699993 | 50209 | 40212 | 20008 | 70221 | 20008 | 41373 | 10000 | 50100
60204 | 89003 | 51473 | 41473 | 10000 | 40206 | 10003 | 2360129 | 699993 | 50209 | 40212 | 20008 | 70221 | 20008 | 41373 | 10000 | 50100
60204 | 89003 | 51473 | 41473 | 10000 | 40206 | 10003 | 2360129 | 699993 | 50209 | 40212 | 20008 | 70221 | 20008 | 41373 | 10000 | 50100
60204 | 89003 | 51473 | 41473 | 10000 | 40206 | 10003 | 2360129 | 699993 | 50209 | 40212 | 20008 | 70221 | 20008 | 41373 | 10000 | 50100
60204 | 89003 | 51473 | 41473 | 10000 | 40206 | 10003 | 2360129 | 699993 | 50209 | 40212 | 20008 | 70221 | 20008 | 41373 | 10000 | 50100
60204 | 89003 | 51473 | 41473 | 10000 | 40206 | 10003 | 2360129 | 699993 | 50209 | 40212 | 20008 | 70221 | 20008 | 41373 | 10000 | 50100
60204 | 89003 | 51473 | 41473 | 10000 | 40206 | 10003 | 2360129 | 699993 | 50209 | 40212 | 20008 | 70221 | 20008 | 41373 | 10000 | 50100
60204 | 89003 | 51473 | 41473 | 10000 | 40206 | 10003 | 2360129 | 699993 | 50209 | 40212 | 20008 | 70221 | 20008 | 41373 | 10000 | 50100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 5.9016

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60030 | 123857 | 51334 | 41327 | 10007 | 40199 | 10003 | 2360282 | 700795 | 50029 | 40032 | 20008 | 70041 | 20008 | 41284 | 10000 | 50010
60024 | 89016 | 51294 | 41294 | 10000 | 40020 | 10000 | 2360630 | 700876 | 50020 | 40020 | 20000 | 70020 | 20000 | 41284 | 10000 | 50010
60024 | 89016 | 51294 | 41294 | 10000 | 40020 | 10000 | 2365355 | 702279 | 50020 | 40020 | 20000 | 70020 | 20000 | 41284 | 10000 | 50010
60024 | 89171 | 51294 | 41294 | 10000 | 40020 | 10000 | 2366516 | 702620 | 50020 | 40020 | 20000 | 70020 | 20000 | 41284 | 10000 | 50010
60024 | 89151 | 51294 | 41294 | 10000 | 40020 | 10000 | 2364275 | 701958 | 50020 | 40020 | 20000 | 70020 | 20000 | 41284 | 10000 | 50010
60024 | 89016 | 51294 | 41294 | 10000 | 40020 | 10000 | 2360630 | 700876 | 50020 | 40020 | 20000 | 70020 | 20000 | 41284 | 10000 | 50010
60024 | 89016 | 51294 | 41294 | 10000 | 40020 | 10000 | 2360630 | 700876 | 50020 | 40020 | 20000 | 70020 | 20000 | 41284 | 10000 | 50010
60024 | 89016 | 51294 | 41294 | 10000 | 40020 | 10000 | 2360630 | 700876 | 50020 | 40020 | 20000 | 70020 | 20000 | 41284 | 10000 | 50010
60024 | 89016 | 51294 | 41294 | 10000 | 40020 | 10000 | 2360630 | 700876 | 50020 | 40020 | 20000 | 70020 | 20000 | 41284 | 10000 | 50010
60024 | 89016 | 51294 | 41294 | 10000 | 40020 | 10000 | 2360630 | 700876 | 50020 | 40020 | 20000 | 70020 | 20000 | 41284 | 10000 | 50010

Test 3: Latency 2->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp x0, x1, [x6, #8]!
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 5.8990

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
62287 | 140495 | 53303 | 42596 | 10707 | 41408 | 10003 | 2374170 | 704122 | 50209 | 40212 | 20008 | 70221 | 20008 | 41371 | 10000 | 50100
60204 | 88974 | 51471 | 41471 | 10000 | 40206 | 10012 | 2370130 | 703037 | 50246 | 40248 | 20024 | 70221 | 20008 | 41373 | 10000 | 50100
60204 | 89180 | 51472 | 41472 | 10000 | 40206 | 10003 | 2367473 | 702169 | 50209 | 40212 | 20008 | 70221 | 20008 | 41372 | 10000 | 50100
60204 | 89134 | 51472 | 41472 | 10000 | 40206 | 10003 | 2359778 | 699889 | 50209 | 40212 | 20008 | 70221 | 20008 | 41372 | 10000 | 50100
60204 | 88990 | 51472 | 41472 | 10000 | 40206 | 10003 | 2359778 | 699889 | 50209 | 40212 | 20008 | 70221 | 20008 | 41374 | 10000 | 50100
60204 | 89482 | 51472 | 41472 | 10000 | 40206 | 10013 | 2367334 | 702124 | 50253 | 40252 | 20028 | 70221 | 20008 | 41373 | 10000 | 50100
60204 | 88990 | 51472 | 41472 | 10000 | 40206 | 10003 | 2359778 | 699889 | 50209 | 40212 | 20008 | 70221 | 20008 | 41372 | 10000 | 50100
60204 | 88990 | 51472 | 41472 | 10000 | 40206 | 10003 | 2359778 | 699889 | 50209 | 40212 | 20008 | 70221 | 20008 | 41372 | 10000 | 50100
60204 | 88990 | 51472 | 41472 | 10000 | 40206 | 10003 | 2359778 | 699889 | 50209 | 40212 | 20008 | 70221 | 20008 | 41372 | 10000 | 50100
60204 | 88990 | 51472 | 41472 | 10000 | 40206 | 10003 | 2359778 | 699889 | 50209 | 40212 | 20008 | 70221 | 20008 | 41372 | 10000 | 50100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 5.8990

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60029 | 123902 | 51322 | 41317 | 10005 | 40166 | 10003 | 2359459 | 700520 | 50029 | 40032 | 20008 | 70020 | 20000 | 41279 | 10000 | 50010
60024 | 89112 | 51294 | 41294 | 10000 | 40026 | 10000 | 2360279 | 700772 | 50020 | 40020 | 20000 | 70020 | 20000 | 41282 | 10000 | 50010
60024 | 88990 | 51292 | 41292 | 10000 | 40020 | 10000 | 2359928 | 700668 | 50020 | 40020 | 20000 | 70020 | 20000 | 41282 | 10000 | 50010
60024 | 88990 | 51292 | 41292 | 10000 | 40020 | 10000 | 2359928 | 700668 | 50020 | 40020 | 20000 | 70020 | 20000 | 41282 | 10000 | 50010
60024 | 88990 | 51292 | 41292 | 10000 | 40020 | 10000 | 2359928 | 700668 | 50020 | 40020 | 20000 | 70020 | 20000 | 41282 | 10000 | 50010
60024 | 88990 | 51292 | 41292 | 10000 | 40020 | 10000 | 2359928 | 700668 | 50020 | 40020 | 20000 | 70020 | 20000 | 41282 | 10000 | 50010
60024 | 88990 | 51292 | 41292 | 10000 | 40020 | 10013 | 2360609 | 700902 | 50073 | 40072 | 20028 | 70020 | 20000 | 41283 | 10000 | 50010
60024 | 88990 | 51292 | 41292 | 10000 | 40020 | 10000 | 2359928 | 700668 | 50020 | 40020 | 20000 | 70020 | 20000 | 41282 | 10000 | 50010
60024 | 88990 | 51292 | 41292 | 10000 | 40020 | 10000 | 2359928 | 700668 | 50020 | 40020 | 20000 | 70020 | 20000 | 41282 | 10000 | 50010
60024 | 88990 | 51292 | 41292 | 10000 | 40020 | 10000 | 2359928 | 700668 | 50020 | 40020 | 20000 | 70020 | 20000 | 41282 | 10000 | 50010

Test 4: throughput

Count: 8

Code:

  ldp x0, x1, [x6, #8]!
  ldp x0, x1, [x7, #8]!
  ldp x0, x1, [x8, #8]!
  ldp x0, x1, [x9, #8]!
  ldp x0, x1, [x10, #8]!
  ldp x0, x1, [x11, #8]!
  ldp x0, x1, [x12, #8]!
  ldp x0, x1, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7633

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240209 | 62328 | 160369 | 80240 | 80129 | 80241 | 80008 | 240762 | 251863 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61064 | 160111 | 80106 | 80005 | 80108 | 80008 | 240760 | 251861 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61062 | 160111 | 80106 | 80005 | 80108 | 80008 | 240760 | 251873 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61059 | 160111 | 80106 | 80005 | 80108 | 80008 | 240762 | 251868 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61063 | 160111 | 80106 | 80005 | 80108 | 80008 | 240760 | 251903 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61444 | 160319 | 80210 | 80109 | 80212 | 80008 | 240770 | 251681 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61056 | 160111 | 80106 | 80005 | 80108 | 80062 | 240910 | 253508 | 160224 | 80262 | 160126 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61153 | 160167 | 80134 | 80033 | 80135 | 80008 | 240762 | 251858 | 160116 | 80208 | 160016 | 80236 | 160072 | 80034 | 80000 | 160100
240204 | 61059 | 160111 | 80106 | 80005 | 80108 | 80008 | 240762 | 251831 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100
240204 | 61155 | 160167 | 80134 | 80033 | 80135 | 80008 | 240762 | 251840 | 160116 | 80208 | 160016 | 80208 | 160016 | 80006 | 80000 | 160100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7627

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240029 | 62123 | 160275 | 80149 | 80126 | 80151 | 80008 | 240501 | 251921 | 160026 | 80028 | 160016 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61020 | 160011 | 80011 | 80000 | 80010 | 80000 | 240477 | 251879 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61017 | 160011 | 80011 | 80000 | 80010 | 80000 | 240481 | 251899 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61022 | 160011 | 80011 | 80000 | 80010 | 80000 | 240475 | 251899 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240025 | 61116 | 160077 | 80044 | 80033 | 80046 | 80000 | 240476 | 251885 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61013 | 160011 | 80011 | 80000 | 80010 | 80000 | 240481 | 251898 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61015 | 160011 | 80011 | 80000 | 80010 | 80000 | 240473 | 251898 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61015 | 160011 | 80011 | 80000 | 80010 | 80000 | 240479 | 251905 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61015 | 160011 | 80011 | 80000 | 80010 | 80000 | 240480 | 251874 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010
240024 | 61014 | 160011 | 80011 | 80000 | 80010 | 80000 | 240478 | 251889 | 160010 | 80020 | 160000 | 80020 | 160000 | 80001 | 80000 | 160010