Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PRFM (register, PLDL2STRM)

Test 1: uops

Code:

  prfm pldl2strm, [x6]
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

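retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1004 | 2037 | 1001 | 1 | 1000 | 1000 | 34082 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2058 | 1001 | 1 | 1000 | 1000 | 34136 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2058 | 1001 | 1 | 1000 | 1000 | 34136 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2058 | 1001 | 1 | 1000 | 1000 | 34136 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2058 | 1001 | 1 | 1000 | 1000 | 34136 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2058 | 1001 | 1 | 1000 | 1000 | 34136 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2058 | 1001 | 1 | 1000 | 1000 | 34136 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2058 | 1001 | 1 | 1000 | 1000 | 34136 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2058 | 1001 | 1 | 1000 | 1000 | 34136 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2058 | 1001 | 1 | 1000 | 1000 | 34136 | 1000 | 1000 | 1000 | 1 | 1000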

Test 2: throughput

Code:

  prfm pldl2strm, [x6]
  add x6, x6, 64

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.9983

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20204 | 20314 | 20105 | 10105 | 10000 | 10106 | 10000 | 61224 | 351269 | 20101 | 10203 | 10003 | 10209 | 10009 | 10007 | 10000 | 10100
20204 | 20118 | 20103 | 10103 | 10000 | 10108 | 10000 | 61165 | 347595 | 20106 | 10208 | 10008 | 10213 | 10013 | 10006 | 10000 | 10100
20204 | 19992 | 20104 | 10104 | 10000 | 10107 | 10004 | 61911 | 345629 | 20113 | 10211 | 10011 | 10206 | 10006 | 10002 | 10000 | 10100
20204 | 19865 | 20101 | 10101 | 10000 | 10102 | 10000 | 61293 | 348863 | 20100 | 10202 | 10002 | 10207 | 10007 | 10003 | 10000 | 10100
20204 | 20043 | 20102 | 10102 | 10000 | 10106 | 10006 | 61299 | 351729 | 20118 | 10214 | 10014 | 10202 | 10002 | 10001 | 10000 | 10100
20204 | 20119 | 20105 | 10105 | 10000 | 10110 | 10000 | 61256 | 350429 | 20103 | 10205 | 10005 | 10203 | 10004 | 10001 | 10000 | 10100
20204 | 19971 | 20101 | 10101 | 10000 | 10102 | 10000 | 61544 | 346833 | 20106 | 10208 | 10008 | 10202 | 10002 | 10001 | 10000 | 10100
20204 | 20030 | 20101 | 10101 | 10000 | 10100 | 10002 | 61514 | 347503 | 20110 | 10210 | 10010 | 10210 | 10010 | 10003 | 10000 | 10100
20204 | 19948 | 20102 | 10102 | 10000 | 10106 | 10002 | 61942 | 345095 | 20110 | 10210 | 10010 | 10214 | 10014 | 10006 | 10000 | 10100
20204 | 19983 | 20106 | 10106 | 10000 | 10112 | 10006 | 61535 | 348125 | 20118 | 10214 | 10014 | 10214 | 10014 | 10006 | 10000 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0023

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20024 | 20924 | 20011 | 10011 | 10000 | 10013 | 10004 | 60440 | 345879 | 20023 | 10031 | 10012 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20023 | 20011 | 10011 | 10000 | 10010 | 10000 | 61108 | 348705 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20023 | 20011 | 10011 | 10000 | 10010 | 10000 | 61108 | 348705 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20023 | 20011 | 10011 | 10000 | 10010 | 10000 | 61108 | 348705 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20023 | 20011 | 10011 | 10000 | 10010 | 10000 | 61108 | 348705 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20023 | 20011 | 10011 | 10000 | 10010 | 10000 | 61108 | 348705 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20023 | 20011 | 10011 | 10000 | 10010 | 10000 | 61108 | 348705 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20023 | 20011 | 10011 | 10000 | 10010 | 10000 | 61108 | 348705 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 19860 | 20011 | 10011 | 10000 | 10010 | 10000 | 61491 | 346271 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 19976 | 20011 | 10011 | 10000 | 10010 | 10000 | 60894 | 349281 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010

Test 3: throughput

Code:

  prfm pldl2strm, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0503

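retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10204 | 20467 | 10101 | 101 | 10000 | 100 | 10002 | 300 | 357166 | 10102 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20408 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 357346 | 10100 | 200 | 10004 | 200 | 10004 | 1 | 10000 | 100
10204 | 20482 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100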

1000 unrolls and 10 iterations

Result (median cycles for code): 1.8718

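retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10024 | 19535 | 10011 | 11 | 10000 | 10 | 10006 | 30 | 325270 | 10016 | 20 | 10012 | 20 | 10014 | 1 | 10000 | 10
10024 | 18718 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 325246 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 18718 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 325246 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 18718 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 325246 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 18718 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 325246 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 18718 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 325246 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 18718 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 325246 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 18718 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 325246 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 18718 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 325246 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 18718 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 325246 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10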