Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (pre-index, Q)

Test 1: uops

Code:

  ldp q0, q1, [x6, #0x10]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 3.000

Integer unit issues: 1.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
200518543057102720301026200035162525430002000200010012000
200415643001100120001000200035162475930002000200010012000
200415103001100120001000200035162502930002000200010012000
200415223001100120001000200035162475930002000200010012000
200415153001100120001000200035162454330002000200010012000
200415403001100120001000200035162459730002000200010012000
200415183001100120001000200035162449830002000200010012000
200415303001100120001000200035162458830002000200010012000
200415253001100120001000200035162448930002000200010012000
200415263001100120001000200035162478630002000200010012000

Test 2: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldp q0, q1, [x6, #0x10]!
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0076

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60223102390802895015710094200384042110096200062660828156864478939370113302092000810003602182000810003500012000040100
60204100087801045010110003200004010410003200062660477156841078928770113302092000810003602182000810003500012000040100
60204100076801045010110003200004010410003200062660261156827078922270113302092000810003602182000810003500012000040100
60204100076801045010110003200004010410003200062660315156830678923870113302092000810003602182000810003500012000040100
60204100076801045010110003200004010410003200062660261156827078922270113302092000810003602782002810013500092000040100
62981115847828365183710014209854174910025200422664302157070279047070233302692004810023602182000810003500012000040100
60204100111801045010110003200004010410003200062660450156839678928270113302092000810003602182000810003500012000040100
60204100076801045010110003200004010410003200062660261156827078922270113302092000810003602182000810003500012000040100
60204100076801045010110003200004010410003200062660261156827078922270113302092000810003602182000810003500012000040100
60204100076801045010110003200004010410003200062660261156827078922270113302092000810003602182000810003500012000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0075

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60043102125801955006610091200384033010100200062660513157003679005470023300292000810003600202000010000500012000040010
60024100085800145001110003200004001010000200002661004157031079018570010300202000010000600202000010000500012000040010
60024100121800145001110003200004001010000200002661355157054479029870010300202000010000600202000010000500012000040010
60024100111800145001110003200004001010000200002661490157063879034170010300202000010000600982002810013500092000040010
60024100138800205001110009200004001410003200002661378157038879025670010300202000010000600802002010010500122000040010
60024100109800195001110008200004001010000200002661193157034079022370010300202000010000600982002810013500122000040010
60024100307800155001110004200004001010000200002661112157038279022370010300202000010000600202000010000500012000040010
60024100080800145001110003200004001010000200202661925157086279045870075300502002010010600202000010000500012000040010
60024100073800145001110003200004001010000200002660221156978878993970010300202000010000600202000010000500012000040010
60024100077800145001110003200004001010000200442663840157055079155870147300892004610023600202000010000500012000040010

Test 3: Latency 2->3 roundtrip

Chain cycles: 3

Code:

  ldp q0, q1, [x6, #0x10]!
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0080

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
602231026368022850112100962002040385100892000626600451568126789162701133020920008100036021820008100035000120000040100
602041000688010450101100032000040104100032000626599951568018789119701133020920008100036021820008100035000120000040100
602041000688010450101100032000040104100032000626600991568162789178701133020920008100036021820008100035000120000040100
602041001048010450101100032000040104100032000626602341568252789223701133020920008100036021820008100035000120000040100
602041000688010450101100032000040104100032000626600451568126789162701133020920008100036021820008100035000120000040100
602041000688010450101100032000040104100032000626600451568126789162701133020920008100036021820008100035000120000040100
602041000688010450101100032000040104100032000626600451568126789162701133020920008100036021820008100035000120000040100
602041000688010450101100032000040104100032000626600451568126789162701133020920008100036027820025100135000920000040100
602041000688010450101100032000040104100032000626600451568126789162701133020920008100036021820008100035000120000040100
602041000708010450101100032000040104100032000626601531568198789197701133020920008100036021820008100035000120000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0075

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
600431026788013650021100952002040294100842000626604551569924790022700233002920008100036002020000100005000120000040010
600241000828001450011100032000040010100002000026604371569932790017700103002020000100006009820028100135000920000040010
600241000718001350011100022000040010100002000026605181569986790040700103002020000100006002020000100005000120000040010
600241001518001350011100022000040010100002000026621921571102790567700103002020000100006002020000100005000120000040010
600241001488001350011100022000040010100002000026627861571498790753700103002020000100005240423022743440453180573234069
600241000828001450011100032000040010100002000026605991570040790065700103002020000100006002020000100005000120000040010
600241001448002050011100092000040014100022000026615171570530790392700103002020000100006002020000100005000120000040010
600241001148001850011100072000040010100002000026625161571196790713700103002020000100006002020000100005000120000040010
600241001138001850011100072000040010100002002426673801574256792291700833005920028100136003820008100035000120000040010
600241001018001350011100022000040010100002002426626241571306790712700833005920028100136002020000100005000120000040010

Test 4: throughput

Count: 8

Code:

  ldp q0, q1, [x6, #0x10]!
  ldp q0, q1, [x7, #0x10]!
  ldp q0, q1, [x8, #0x10]!
  ldp q0, q1, [x9, #0x10]!
  ldp q0, q1, [x10, #0x10]!
  ldp q0, q1, [x11, #0x10]!
  ldp q0, q1, [x12, #0x10]!
  ldp q0, q1, [x13, #0x10]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0796

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160223881862406768036716030980368160009240318139391524011520016001220016001280005160000100
160204863642401058010516000080106160009240318139375324011520016001220016001280005160000100
160204863622401058010516000080106160009240318139375424011520016001220016001280005160000100
160204863682401058010516000080106160008240318139382924011420016001220016001280005160000100
160204864302401058010516000080106160008240318139406324011420016001220016001280005160000100
160205864392401618013116003080132160008240318139381124011420016001220016001280005160000100
160204863632401058010516000080106160009240318139375424011520016001220016001280005160000100
160204863622401058010516000080106160009240318139375424011520016001220016001280005160000100
160204863622401058010516000080106160009240318139386124011520016001220016001280005160000100
160204863622401058010516000080106160009240318139386124011520016001220016001280005160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0795

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
1600448848424064980310016033980311016000924004813934942400252016001220160012080005160000010
1600248637424001180011016000080010016000024003013935472400102016000020160000080001160000010
1600248635524001180011016000080010016000024003013935832400102016000020160000080001160000010
1600248635024001180011016000080010016000024003013935472400102016000020160000080001160000010
1600248635024001180011016000080010016000024003013935372400102016000020160000080001160000010
1600248634924001180011016000080010016000024003013935472400102016000020160000080001160000010
1600248635024001180011016000080010016000024003013935472400102016000020160000080001160000010
1600248635024001180011016000080010016000024003013935732400102016000020160000080001160000010
1600248635024001180011016000080010016000024003013935472400102016000020160000080001160000010
1600248635124001180011016000080010016000024003013935382400102016000020160000080001160000010