Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PRFM (register, PLDL1KEEP)

Test 1: uops

Code:

  prfm pldl1keep, [x6]
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

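retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1004 | 2064 | 1001 | 1 | 1000 | 1000 | 34230 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 1993 | 1001 | 1 | 1000 | 1000 | 35410 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2113 | 1001 | 1 | 1000 | 1000 | 35590 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2113 | 1001 | 1 | 1000 | 1000 | 35634 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2143 | 1001 | 1 | 1000 | 1000 | 35642 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2114 | 1001 | 1 | 1000 | 1000 | 35686 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2143 | 1001 | 1 | 1000 | 1000 | 35656 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2145 | 1001 | 1 | 1000 | 1000 | 35538 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2146 | 1001 | 1 | 1000 | 1000 | 35686 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2122 | 1001 | 1 | 1000 | 1000 | 35686 | 1000 | 1000 | 1000 | 1 | 1000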

Test 2: throughput

Code:

  prfm pldl1keep, [x6]
  add x6, x6, 64

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0217

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20204 | 20796 | 20103 | 10103 | 10000 | 10108 | 10000 | 60863 | 350557 | 20103 | 10205 | 10005 | 10204 | 10004 | 10001 | 10000 | 10100
20204 | 19996 | 20106 | 10106 | 10000 | 10112 | 10000 | 61317 | 349929 | 20100 | 10202 | 10002 | 10204 | 10004 | 10003 | 10000 | 10100
20204 | 20141 | 20106 | 10106 | 10000 | 10112 | 10000 | 61212 | 350269 | 20104 | 10206 | 10006 | 10204 | 10004 | 10001 | 10000 | 10100
20204 | 20291 | 20102 | 10102 | 10000 | 10104 | 10000 | 61261 | 353369 | 20106 | 10208 | 10008 | 10208 | 10008 | 10002 | 10000 | 10100
20204 | 20098 | 20105 | 10105 | 10000 | 10110 | 10000 | 61417 | 350983 | 20102 | 10204 | 10004 | 10208 | 10008 | 10005 | 10000 | 10100
20204 | 20230 | 20102 | 10102 | 10000 | 10106 | 10000 | 61293 | 350869 | 20100 | 10202 | 10002 | 10210 | 10010 | 10003 | 10000 | 10100
20204 | 20068 | 20105 | 10105 | 10000 | 10110 | 10006 | 61735 | 346851 | 20118 | 10214 | 10014 | 10206 | 10006 | 10002 | 10000 | 10100
20204 | 20116 | 20102 | 10102 | 10000 | 10106 | 10031 | 61310 | 350253 | 20170 | 10241 | 10039 | 10203 | 10003 | 10002 | 10000 | 10100
20204 | 20195 | 20102 | 10102 | 10000 | 10104 | 10000 | 61169 | 352009 | 20102 | 10204 | 10004 | 10208 | 10008 | 10005 | 10000 | 10100
20204 | 20215 | 20102 | 10102 | 10000 | 10106 | 10000 | 60973 | 350189 | 20106 | 10208 | 10008 | 10202 | 10002 | 10001 | 10000 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0054

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20024 | 20949 | 20015 | 10015 | 10000 | 10020 | 10000 | 61289 | 349761 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20167 | 20011 | 10011 | 10000 | 10010 | 10000 | 61025 | 351353 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20167 | 20011 | 10011 | 10000 | 10010 | 10000 | 61305 | 349615 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20096 | 20011 | 10011 | 10000 | 10010 | 10000 | 61294 | 350279 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20153 | 20011 | 10011 | 10000 | 10010 | 10000 | 60782 | 350575 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20115 | 20011 | 10011 | 10000 | 10010 | 10000 | 61257 | 350677 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20175 | 20011 | 10011 | 10000 | 10010 | 10000 | 61300 | 350369 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20192 | 20011 | 10011 | 10000 | 10010 | 10000 | 61069 | 351109 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20195 | 20011 | 10011 | 10000 | 10010 | 10000 | 61250 | 352485 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20170 | 20011 | 10011 | 10000 | 10010 | 10000 | 61158 | 351709 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010

Test 3: throughput

Code:

  prfm pldl1keep, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0503

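retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10204 | 20512 | 10101 | 101 | 10000 | 100 | 10002 | 300 | 357076 | 10102 | 200 | 10010 | 200 | 10010 | 1 | 10000 | 100
10204 | 20511 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 356086 | 10100 | 200 | 10008 | 200 | 10012 | 1 | 10000 | 100
10205 | 20576 | 10131 | 101 | 10030 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20157 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 357308 | 10100 | 200 | 10004 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20503 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 357346 | 10106 | 200 | 10012 | 200 | 10004 | 1 | 10000 | 100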

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0864

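retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10024 | 21137 | 10011 | 11 | 10000 | 10 | 10006 | 30 | 365438 | 10016 | 20 | 10012 | 20 | 10000 | 1 | 10000 | 10
10025 | 20935 | 10041 | 11 | 10030 | 10 | 10000 | 30 | 363966 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20916 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 364494 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20869 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 364702 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20818 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 362654 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20765 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 363322 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20874 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 365374 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20960 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 365330 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20963 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 365330 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20960 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 365322 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10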