Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (pre-index, 32-bit)

Test 1: uops

Code:

  ldp w0, w1, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
3005130920351020101510281000133051460920001000200010002000100110002000
3004111120011001100010001000134421453820001000200010002000100110002000
3004110920011001100010001000135551463720001000200010002000100110002000
3004107320011001100010001000135781561820001000200010002000100110002000
3004108420011001100010001000136541533720001000200010002000100110002000
3004110520011001100010001000136671545120001000200010002000100110002000
3004107920011001100010001000134761491220001000200010002000100110002000
3004108320011001100010001000133871460520001000200010002000100110002000
3004106720011001100010001000136551464420001000200010002000100110002000
3004111320011001100010001000135121499220001000200010002000100110002000

Test 2: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp w0, w1, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0178

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6020971333502614025610005403481000318520945495655020940212200087022120008401041000050100
6020470139502044020410000402061000318513915493615020940212200087022120008401041000050100
6020470141502044020410000402061000318512295493075020940212200087022120008401041000050100
6020570190502174021510002402401001318531235498645025340252200287022120006401051000050100
6020470122502044020410000402061000318511505492245020940212200087022120008401041000050100
6020470117502044020410000402061001318521995495895025340252200287022120008401041000050100
6020470144502044020410000402061000318508515491855020940212200087022120008401041000050100
6020470170502044020410000402061000318512025493025020940212200087022120008401041000050100
6020470119502044020410000402061000318505815490955020940212200087022120008401041000050100
6020470119502044020410000402061000318505815490955020940212200087022120008401041000050100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0108

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6002971222500794007410005401661000318508965499505002940032200087004120008400141000050010
6002470119500244002410000400201000018508665499195002040020200007002020000400141000050010
6002470117500244002410000400201000018507315498745002040020200007002020000400141000050010
6002470118500244002410000400201000018509205499375002040020200007002020000400141000050010
6002470328500544004810006400921000018506775498565002040020200007002020000400141000050010
6002470117500244002410000400201000018508125499015002040020200007002020000400141000050010
6002470128500244002410000400201000018511095500005002040020200007004120008400151000050010
6002470105500244002410000400201000018504075497625002040020200007002020000400141000050010
6002470109500244002410000400201000018503535497445002040020200007002020000400141000050010
6002470105500244002410000400201000018505425498075002040020200007002020000400141000050010

Test 3: Latency 2->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp w0, w1, [x6, #8]!
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0117

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
602097121650257402521000540348100031850501549038050209402122000807022120008401041000050100
602047011950204402041000040206100031851121549275050209402122000807022120008401041000050100
602047011350204402041000040206100031850365549023050209402122000807022120008401041000050100
602047011350204402041000040206100031850419549041050209402122000807022120008401041000050100
602057019850217402151000240239100031851175549293050209402122000807036320050401271000050100
6020470131502044020410000402061127915295994689741488473713862219253147022120008401041000050100
602047015850204402041000040206100031850716549139050209402122000807022120008401041000050100
602047011750204402041000040206100031850527549077050209402122000807022120008401041000050100
602047011750204402041000040206100031850473549059050209402122000807022120008401041000050100
602047011750204402041000040206100031850473549059050209402122000807022120008401041000050100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0108

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6003071363500934008610007402001000318505965497635002940032200067002020000400141000050010
6002470135500244002410000400201000018519465502755002040020200007002020000400141000050010
6002470133500244002410000400201000018513525500775002040020200007002020000400141000050010
6002470134500244002410000400201000018511635500145002040020200007004120006400151000050010
6002470124500244002410000400261000318510855500135002940032200087004120008400141000050010
6002470116500244002410000400261001318526005504895007240072200287004120008400141000050010
6002470162500244002410000400261000318516795502115002940032200087004120008400141000050010
6002470133500244002410000400261000318506535498695002940032200087004120008400141000050010
6002470115500244002410000400261000318506535498695002940032200087004120008400141000050010
6002470115500244002410000400261000318506535498695002940032200087004120008400141000050010

Test 4: throughput

Count: 8

Code:

  ldp w0, w1, [x6, #8]!
  ldp w0, w1, [x7, #8]!
  ldp w0, w1, [x8, #8]!
  ldp w0, w1, [x9, #8]!
  ldp w0, w1, [x10, #8]!
  ldp w0, w1, [x11, #8]!
  ldp w0, w1, [x12, #8]!
  ldp w0, w1, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7520

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240209611971603348024080094802418000824046125124316011680208160016802351600738003380000160100
240204601271601118010680005801088000824046925125916011680208160016802081600168000680000160100
240204601281601118010680005801088000824046125122316011680208160016802081600168000680000160100
240204601271601118010680005801088000824046125122916011680208160016802351600708003380000160100
240204601211601118010680005801088000824046725134916011680208160016802081600168000680000160100
240204601301601118010680005801088000824045925119816011680208160016802081600168000680000160100
240204601261601118010680005801088000824046225120216011680208160016802081600168000680000160100
240204601281601118010680005801088000824046125120816011680208160016802081600168000680000160100
240204601281601118010680005801088000824046125115316011680208160016802081600168000680000160100
240204601251601118010680005801088000824046125117616011680208160016802081600168000680000160100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7511

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240029611931602398014980090801518000824020525132816002680028160016800201600008000180000160010
240024600971600118001180000800108000024017825132816001080020160000800201600008000180000160010
240024600871600118001180000800108003624051625220016008280056160072800201600008000180000160010
240024600821600118001180000800108000024018125130016001080020160000800201600008000180000160010
240025601771600638003780026800378000024018725119816001080020160000800201600008000180000160010
240024600841600118001180000800108000024018125135716001080020160000800201600008000180000160010
240024600851600118001180000800108000024018125132716001080020160000800201600008000180000160010
240024600841600118001180000800108000024018125132316001080020160000800201600008000180000160010
240024600871600118001180000800108000024017825135916001080020160000800201600008000180000160010
240024600841600118001180000800108000024018125134816001080020160000800201600008000180000160010