Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (post-index, D)

Test 1: uops

Code:

  ldp d0, d1, [x6], #0x10
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 3.000

Integer unit issues: 1.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
200518133057102720301026200035162504730002000200010012000
200415733001100120001000200035162467830002000200010012000
200415163001100120001000200035162468730002000200010012000
200415823001100120001000200035162697330002000200010012000
200415273001100120001000200035162477730002000200010012000
200415153001100120001000200035162468730002000205610272000
200415543001100120001000200035162471430002000200010012000
200415423001100120001000200035162464230002000200010012000
200415153001100120001000200035162455230002000200010012000
200415243001100120001000200035162461530002000200010012000

Test 2: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldp d0, d1, [x6], #0x10
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0084

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60214102425802325011210100200204038510089200062661044156879478947570113302092000810003602182000810003500012000040100
60204100140801045010110003200004010410003200242664505157098279053970173302392002810013602182000810003500012000040100
60204100074801045010110003200004010410003200062660577156848078932670113302092000810003602182000810003500012000040100
60204100083801045010110003200004010410003200062660423156838278928270113302092000810003602182000810003500012000040100
60204100086801045010110003200004010410003200062660369156834678926270113302092000810003602182000810003500012000040100
60204100084801045010110003200004010410003200062660342156832678925570113302092000810003602182000810003500012000040100
60204100098801045010110003200004010410003200062662367156967878988870113302092000810003602182000810003500012000040100
60204100083801045010110003200004010410003200062660288156829278923770113302092000810003602182000810003500012000040100
60204100078801045010110003200004010410003200062660207156823878921070113302092000810003602182000810003500012000040100
60204100086801045010110003200004010410003200062660288156829278923870113302092000810003602182000810003500012000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0066

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
600341022748013850021100972002040294100832002426636651571920791021700823005920026100136003820008100035000120000040010
600241000668001450011100032000040014100032000026600591569684789897700103002020000100006002020000100005000120000040010
600241000668001450011100032000040010100002000026600591569684789897700103002020000100006002020000100005000120000040010
600241000668001450011100032000040010100002000026600591569684789897700103002020000100006002020000100005000120000040010
600241000668001450011100032000040010100002000026600591569684789897700103002020000100006002020000100005000120000040010
600241000668001450011100032000040010100002000026600591569684789897700103002020000100006002020000100005000120000040010
600251001428002950019100082000240046100132000026600591569684789897700103002020000100006002020000100005000120000040010
600241000668001450011100032000040010100002000026600591569684789897700103002020000100006002020000100005000120000040010
600241000668001450011100032000040010100002000026600591569684789897700103002020000100006002020000100005000120000040010
600241000668001450011100032000040010100002000026600591569684789897700103002020000100006002020000100005000120000040010

Test 3: Latency 2->3 roundtrip

Chain cycles: 3

Code:

  ldp d0, d1, [x6], #0x10
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0089

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
602141024008022850112100962002040385100892000626605901568428789315701133020920008100036027820028100135000920000040100
602041000858010450101100032000040104100032000626604771568410789300701133020920008100036021820008100035000120000040100
602041000848010450101100032000040104100032000626607741568584789398701133020920008100036021820008100035000120000040100
602041000838010450101100032000040104100032000626607201568548789380701133020920008100036021820008100035000120000040100
602041000838010450101100032000040104100032000626603691568338789264701133020920008100036021820008100035000120000040100
602041000948010550101100042000040104100032000626607201568548789381701133020920008100036021820008100035000120000040100
602041000838010450101100032000040104100032002326625121560104794197701713023820026100136021820008100035000120000040100
602041000848010450101100032000040104100032000626606121568500789343701133020920008100036021820008100035000120000040100
602041002028012250112100062000440139100122000626603961568356789273701133020920008100036021820008100035000120000040100
602041000808010450101100032000040104100032000626604501568392789290701133020920008100036021820008100035000120000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0073

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
600341024218014050021100992002040294100842000626606631570058790079700223002920006100036002020000100005000120000040010
600241000808001450011100032000040010100002000026601941569780789938700103002020000100006002020000100005000120000040010
600241000718001350011100022000040010100002000026601941569780789938700103002020000100006002020000100005000120000040010
600251001448002850019100072000240046100132000026602481569816789954700103002020000100006002020000100005000120000040010
600241000748001350011100022000040010100002000026602751569830789964700103002020000100006002020000100005000120000040010
600241000928001350011100022000040010100002000026605991570050790065700103002020000100006002020000100005000120000040010
600241000858001350011100022000040010100002000026605181569996790040700103002020000100006002020000100005000120000040010
600251001588002850019100072000240046100132000026608691570228790156700103002020000100006002020000100005000120000040010
600241001198001350011100022000040010100002000026605181569996790037700103002020000100006002020000100005000120000040010
600241000718001350011100022000040010100002002426618411570810790478700833005920028100136002020000100005000120000040010

Test 4: throughput

Count: 8

Code:

  ldp d0, d1, [x6], #0x10
  ldp d0, d1, [x7], #0x10
  ldp d0, d1, [x8], #0x10
  ldp d0, d1, [x9], #0x10
  ldp d0, d1, [x10], #0x10
  ldp d0, d1, [x11], #0x10
  ldp d0, d1, [x12], #0x10
  ldp d0, d1, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0794

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160214886392406678036716030080368160009240318139376724011520016001220016001280005160000100
160204863632401058010516000080106160009240318139351124011520016001220016001280005160000100
160204863522401058010516000080106160009240318139351124011520016001220016001280005160000100
160204863522401058010516000080106160009240318139351124011520016001220016001280005160000100
160204863522401058010516000080106160009240318139351124011520016001220016001280005160000100
160204863522401058010516000080106160009240318139351124011520016001220016001280005160000100
160204863522401058010516000080106160009240318139351124011520016001220016001280005160000100
160204863562401058010516000080106160008240318139360624011420016001220016001280005160000100
160204863512401058010516000080106160008240318139349824011420016001220016001280005160000100
160205864172401618013116003080132160008240318139373624011420016001220016001280005160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0794

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
16003488299240585802851603008028616001124005113928530240028201600140201600008000116000010
16002486349240011800111600008001016000024003013934210240010201600000201600008000116000010
16002486348240011800111600008001016000024003013934210240010201600000201600008000116000010
16002486348240011800111600008001016000024003013934210240010201600000201600688003116000010
16002486348240011800111600008001016000024003013934210240010201600000201600008000116000010
16002486348240011800111600008001016000024003013934210240010201600000201600008000116000010
16002486348240011800111600008001016000024003013935940240010201600000201600688003116000010
16002486346240011800111600008001016000024003013935110240010201600000201600008000116000010
16002486348240011800111600008001016000024003013935610240010201600000201600008000116000010
16002486353240011800111600008001016000024003013934210240010201600000201600008000116000010