Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (post-index, Q)

Test 1: uops

Code:

  ldp q0, q1, [x6], #0x10
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 3.000

Integer unit issues: 1.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
200518633057102720301026200035162564130002000200010012000
200415843001100120001000200035162486730002000200010012000
200415413001100120001000200035162485830002000200010012000
200415433001100120001000200035162475930002000200010012000
200415443001100120001000200035162491230002000200010012000
200415433001100120001000200035162471430002000200010012000
200415363001100120001000200035162497530002000200010012000
200415383001100120001000200035162486730002000200010012000
200415533001100120001000200035162518230002000200010012000
200415423001100120001000200035162536230002000200010012000

Test 2: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldp q0, q1, [x6], #0x10
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0186

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60223101943802865015710091200384042110097200062660747156856078937770113302092000810003602182000810003500012000040100
60204100094801065010110005200004010410003200062660697156845278933470113302092000810003602182000810003500012000040100
60204100094801065010110005200004010410003200242662345156949678987570171302392002410013603382004810023500172000040100
60204100111801065010110005200004010410003200062662070156944278979870113302092000810003602182000810003500012000040100
60204100130801065010110005200004010410003200242662642156973478997570173302392002810013602182000810003500012000040100
65466113925844365276510053216184259210045200062660747156856078937770113302092000810003602182000810003500012000040100
60204100094801065010110005200004010410003200062660747156856078937770113302092000810003602182000810003500012000040100
60204100109801065010110005200004010410003200062661341156895678956370113302092000810003602182000810003500012000040100
60204100102801065010110005200004010410003200062661071156877478947870113302092000810003602182000810003500012000040100
60204100097801065010110005200004010410003200062661017156874078946470113302092000810003602182000810003500012000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0114

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60043102461801985006610094200384033010098200062661319157046279028970023300292000810003600382000810003500012000040010
60024100106800165001110005200004001010000200002661031157029079020070010300202000010000600202000010000500012000040010
60024100104800165001110005200004001010000200002661031157029079020070010300202000010000600202000010000500012000040010
60024100104800165001110005200004001010000200002661031157029079020070010300202000010000600202000010000500012000040010
60024100106800165001110005200004001010000200002661328157048879029470010300202000010000600202000010000500012000040010
60024100104800165001110005200004001010000200002661031157029079020070010300202000010000600202000010000500012000040010
60024100104800165001110005200004001010000200002661031157029079020070010300202000010000600202000010000500012000040010
60024100104800165001110005200004001010000200002661031157029079020070010300202000010000600202000010000500012000040010
60024100104800165001110005200004001010000200002661058157030679020870010300202000010000600202000010000500012000040010
60024100104800165001110005200004001010000200002661031157029079020070010300202000010000600202000010000500012000040010

Test 3: Latency 2->3 roundtrip

Chain cycles: 3

Code:

  ldp q0, q1, [x6], #0x10
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0118

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60223102904802365011210104200204038510090200062661157156872678946770113302092000810003602182000810003500012000040100
60204100111801075010110006200004010410003200062661368156895678956670113302092000810003602182000810003500012000040100
60204100107801065010110005200004010410003200062661071156876678947070113302092000810003602182000810003500012000040100
60204100104801065010110005200004010410003200062661206156885678951170113302092000810003602182000810003500012000040100
60205100179801215010910010200024013610013200062662259156954478984470113302092000810003602182000810003500012000040100
60204100136801065010110005200004010410003200062661395156896878957270113302092000810003602182000810003500012000040100
60204100121801065010110005200004010410003200062661638156913078965170113302092000810003602182000810003500012000040100
60204100111801065010110005200004010410003200062662178156950078981370113302092000810003602782002810013500092000040100
60204100118801075010110006200004010410003200062662070156943278978470113302092000810003602182000810003500012000040100
60204100133801065010110005200004010410003200062661827156926078971370113302092000810003602182000810003500012000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0122

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60043102807801345002110093200204029410088200062660050156965678989170023300292000810003600202000010000500012000040010
60024100060800135001110002200004001010000200002659951156962478985770010300202000010000600202000010000500012000040010
60024100079800135001110002200004001010000200242664487157259079130670083300592002810013600982002410013500092000040010
60025100166800285001910007200024004610013200002660626157007479007070010300202000010000600202000010000500012000040010
60024100160800135001110002200004001010000200002660140156974878992270010300202000010000600202000010000500012000040010
60024100060800135001110002200004001010000200002659897156958878984170010300202000010000600202000010000500012000040010
60024100073800135001110002200004001010000200002659897156958878984170010300202000010000600982002810013500092000040010
62761115343829555189910016210404186710027200002662759157149679073870010300202000010000600202000010000500012000040010
60024100114800135001110002200004001010000200002660086156971278989570010300202000010000600202000010000500012000040010
60024100060800135001110002200004001010000200002659897156958878984170010300202000010000600202000010000500012000040010

Test 4: throughput

Count: 8

Code:

  ldp q0, q1, [x6], #0x10
  ldp q0, q1, [x7], #0x10
  ldp q0, q1, [x8], #0x10
  ldp q0, q1, [x9], #0x10
  ldp q0, q1, [x10], #0x10
  ldp q0, q1, [x11], #0x10
  ldp q0, q1, [x12], #0x10
  ldp q0, q1, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0795

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160223881142406768036716030980368160008240318139361224011420016001220016007080032160000100
160204863602401068010616000080107160009240318139283924011520016001220016001280005160000100
160204863442401058010516000080106160009240318139339124011520016001220016001280005160000100
160204864622401598012916003080130160009240318139339124011520016001220016001280005160000100
160204863442401058010516000080106160009240318139339124011520016001220016001280005160000100
160204866922402738018316009080184160009240318139339124011520016001220016001280005160000100
160204864562401618013116003080132160009240318139339124011520016001220016001280005160000100
160204863442401058010516000080106160057240396138888824018920016006820016001280005160000100
160204863442401058010516000080106160009240318139346324011520016001220016001280005160000100
160204863502401058010516000080106160057240396139427924018920016006820016001280005160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0793

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160043885552405938028416030980285160009240048139295524002520160012201600008000116000010
160024863622400118001116000080010160000240030139350824001020160000201600088000316000010
160024866232400118001116000080010160000240030139405024001020160000201600008000116000010
160024863492400118001116000080010160000240030139336424001020160000201600008000116000010
160024863422400118001116000080010160000240030139335524001020160000201600008000116000010
160024863422400118001116000080010160000240030139335524001020160000201600008000116000010
160024863422400118001116000080010160000240030139341424001020160000201600008000116000010
160024863492400118001116000080010160000240030139348924001020160000201600008000116000010
160024863432400118001116000080010160000240030139337324001020160000201600008000116000010
160024863422400118001116000080010160000240030139345424001020160000201600008000116000010