Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PRFM (register, PLDL3KEEP)

Test 1: uops

Code:

  prfm pldl3keep, [x6]
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

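retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1004 | 2470 | 1001 | 1 | 1000 | 1000 | 34906 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2097 | 1001 | 1 | 1000 | 1000 | 34906 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2097 | 1001 | 1 | 1000 | 1000 | 34906 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2097 | 1001 | 1 | 1000 | 1000 | 34906 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2097 | 1001 | 1 | 1000 | 1000 | 34906 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2097 | 1001 | 1 | 1000 | 1000 | 34906 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2101 | 1001 | 1 | 1000 | 1000 | 34906 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2097 | 1001 | 1 | 1000 | 1000 | 34918 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2101 | 1001 | 1 | 1000 | 1000 | 34804 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2075 | 1001 | 1 | 1000 | 1000 | 34586 | 1000 | 1000 | 1000 | 1 | 1000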

Test 2: throughput

Code:

  prfm pldl3keep, [x6]
  add x6, x6, 64

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20204 | 21208 | 20102 | 10102 | 10000 | 10101 | 10002 | 61288 | 349629 | 20109 | 10209 | 10009 | 10207 | 10007 | 10006 | 10000 | 10100
20204 | 20894 | 20347 | 10227 | 10120 | 10230 | 10002 | 61239 | 349997 | 20110 | 10210 | 10010 | 10211 | 10011 | 10007 | 10000 | 10100
20204 | 20107 | 20105 | 10105 | 10000 | 10110 | 10004 | 61417 | 348737 | 20114 | 10212 | 10012 | 10208 | 10008 | 10002 | 10000 | 10100
20204 | 20117 | 20101 | 10101 | 10000 | 10100 | 10000 | 61161 | 350463 | 20102 | 10204 | 10004 | 10206 | 10006 | 10002 | 10000 | 10100
20204 | 20092 | 20103 | 10103 | 10000 | 10108 | 10000 | 61292 | 349761 | 20102 | 10204 | 10004 | 10202 | 10002 | 10001 | 10000 | 10100
20204 | 20015 | 20103 | 10103 | 10000 | 10102 | 10000 | 61318 | 351219 | 20106 | 10208 | 10008 | 10202 | 10002 | 10001 | 10000 | 10100
20204 | 20146 | 20103 | 10103 | 10000 | 10104 | 10000 | 61079 | 349039 | 20102 | 10204 | 10004 | 10202 | 10002 | 10001 | 10000 | 10100
20204 | 20036 | 20103 | 10103 | 10000 | 10105 | 10000 | 61646 | 347079 | 20105 | 10207 | 10007 | 10208 | 10008 | 10005 | 10000 | 10100
20204 | 20064 | 20102 | 10102 | 10000 | 10106 | 10000 | 61687 | 346695 | 20102 | 10204 | 10004 | 10202 | 10002 | 10001 | 10000 | 10100
20204 | 19972 | 20102 | 10102 | 10000 | 10106 | 10000 | 61593 | 347725 | 20106 | 10208 | 10008 | 10206 | 10006 | 10002 | 10000 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0065

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20024 | 21018 | 20011 | 10011 | 10000 | 10013 | 10000 | 61384 | 347907 | 20012 | 10024 | 10004 | 10022 | 10002 | 10001 | 10000 | 10010
20024 | 20035 | 20011 | 10011 | 10000 | 10010 | 10000 | 60910 | 348667 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 19974 | 20011 | 10011 | 10000 | 10010 | 10000 | 60562 | 348461 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 19840 | 20011 | 10011 | 10000 | 10010 | 10000 | 61026 | 349113 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 19966 | 20011 | 10011 | 10000 | 10010 | 10000 | 60739 | 348091 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 19990 | 20011 | 10011 | 10000 | 10010 | 10000 | 60980 | 347403 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 19808 | 20011 | 10011 | 10000 | 10010 | 10000 | 61267 | 346363 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 19964 | 20011 | 10011 | 10000 | 10010 | 10000 | 60875 | 347907 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20090 | 20011 | 10011 | 10000 | 10010 | 10000 | 61017 | 347727 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 19896 | 20011 | 10011 | 10000 | 10010 | 10000 | 60864 | 347979 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010

Test 3: throughput

Code:

  prfm pldl3keep, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0058

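retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 350066 | 10100 | 200 | 10006 | 200 | 10004 | 1 | 10000 | 100
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 349086 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 349160 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 349160 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 349160 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 349160 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 349160 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 349160 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 349160 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 349160 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100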

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0058

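retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10024 | 20055 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349136 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20058 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349136 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20058 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349136 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20058 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349136 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20058 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349136 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20058 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349136 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20058 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349136 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20058 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349136 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20058 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349136 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20058 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349136 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10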