Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PRFM (register, PSTL1KEEP)

Test 1: uops

Code:

  prfm pstl1keep, [x6]
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

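retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1004 | 2093 | 1001 | 1 | 1000 | 1000 | 34900 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2104 | 1001 | 1 | 1000 | 1000 | 34930 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2076 | 1001 | 1 | 1000 | 1000 | 34816 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2096 | 1001 | 1 | 1000 | 1000 | 34928 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2101 | 1001 | 1 | 1000 | 1000 | 34928 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2076 | 1001 | 1 | 1000 | 1000 | 34922 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2104 | 1001 | 1 | 1000 | 1000 | 34684 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2087 | 1001 | 1 | 1000 | 1000 | 34976 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2080 | 1001 | 1 | 1000 | 1000 | 34618 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2097 | 1001 | 1 | 1000 | 1000 | 34668 | 1000 | 1000 | 1000 | 1 | 1000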

Test 2: throughput

Code:

  prfm pstl1keep, [x6]
  add x6, x6, 64

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.9976

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20204 | 20610 | 20105 | 10105 | 10000 | 10111 | 10002 | 61634 | 348925 | 20109 | 10209 | 10009 | 10207 | 10007 | 10003 | 10000 | 10100
20204 | 20100 | 20101 | 10101 | 10000 | 10101 | 10000 | 61540 | 346633 | 20100 | 10202 | 10002 | 10204 | 10004 | 10003 | 10000 | 10100
20204 | 20130 | 20101 | 10101 | 10000 | 10100 | 10000 | 61453 | 346647 | 20102 | 10204 | 10004 | 10202 | 10002 | 10001 | 10000 | 10100
20204 | 19968 | 20101 | 10101 | 10000 | 10100 | 10002 | 61483 | 349169 | 20110 | 10210 | 10010 | 10204 | 10004 | 10003 | 10000 | 10100
20204 | 20066 | 20105 | 10105 | 10000 | 10106 | 10000 | 61521 | 348769 | 20106 | 10208 | 10008 | 10210 | 10010 | 10003 | 10000 | 10100
20204 | 19970 | 20104 | 10104 | 10000 | 10103 | 10000 | 61526 | 348453 | 20102 | 10204 | 10004 | 10210 | 10010 | 10005 | 10000 | 10100
20204 | 19922 | 20102 | 10102 | 10000 | 10104 | 10000 | 61641 | 347663 | 20106 | 10208 | 10008 | 10204 | 10004 | 10001 | 10000 | 10100
20204 | 19985 | 20101 | 10101 | 10000 | 10100 | 10002 | 61496 | 349125 | 20110 | 10210 | 10010 | 10206 | 10006 | 10002 | 10000 | 10100
20204 | 19892 | 20106 | 10106 | 10000 | 10112 | 10000 | 61744 | 346941 | 20104 | 10206 | 10006 | 10204 | 10004 | 10001 | 10000 | 10100
20204 | 20117 | 20102 | 10102 | 10000 | 10106 | 10000 | 61717 | 345953 | 20102 | 10204 | 10004 | 10202 | 10002 | 10001 | 10000 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0176

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20024 | 24381 | 20012 | 10012 | 10000 | 10015 | 10000 | 60906 | 349463 | 20015 | 10027 | 10007 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20107 | 20011 | 10011 | 10000 | 10010 | 10000 | 60496 | 345513 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20124 | 20011 | 10011 | 10000 | 10010 | 10000 | 61166 | 348553 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20075 | 20011 | 10011 | 10000 | 10010 | 10000 | 61107 | 349567 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20065 | 20011 | 10011 | 10000 | 10010 | 10000 | 60999 | 349629 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 19995 | 20011 | 10011 | 10000 | 10010 | 10000 | 60857 | 347991 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20150 | 20011 | 10011 | 10000 | 10010 | 10000 | 61153 | 350243 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20149 | 20011 | 10011 | 10000 | 10010 | 10000 | 61181 | 349909 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20048 | 20011 | 10011 | 10000 | 10010 | 10000 | 60544 | 349045 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20077 | 20011 | 10011 | 10000 | 10010 | 10000 | 61029 | 350785 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010

Test 3: throughput

Code:

  prfm pstl1keep, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.9387

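retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10204 | 19479 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 339178 | 10100 | 200 | 10004 | 200 | 10008 | 1 | 10000 | 100
10204 | 19247 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 333056 | 10100 | 200 | 10004 | 200 | 10004 | 1 | 10000 | 100
10204 | 19207 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 335544 | 10100 | 200 | 10004 | 200 | 10004 | 1 | 10000 | 100
10204 | 19244 | 10101 | 101 | 10000 | 100 | 10002 | 300 | 335382 | 10102 | 200 | 10012 | 200 | 10004 | 1 | 10000 | 100
10204 | 19367 | 10101 | 101 | 10000 | 100 | 10004 | 300 | 337914 | 10104 | 200 | 10012 | 200 | 10008 | 1 | 10000 | 100
10204 | 19226 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 332656 | 10100 | 200 | 10008 | 200 | 10069 | 1 | 10000 | 100
10204 | 19375 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 334828 | 10100 | 200 | 10008 | 200 | 10012 | 1 | 10000 | 100
10204 | 19420 | 10101 | 101 | 10000 | 100 | 10002 | 300 | 336588 | 10102 | 200 | 10012 | 200 | 10004 | 1 | 10000 | 100
10204 | 19144 | 10101 | 101 | 10000 | 100 | 10002 | 300 | 337332 | 10102 | 200 | 10012 | 200 | 10008 | 1 | 10000 | 100
10204 | 19373 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 339204 | 10100 | 200 | 10008 | 200 | 10008 | 1 | 10000 | 100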

1000 unrolls and 10 iterations

Result (median cycles for code): 1.8718

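retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10024 | 20065 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349282 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20048 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349214 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20048 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349214 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20048 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349214 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20048 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349214 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20048 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349214 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20048 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349214 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20048 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349214 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20048 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349214 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20048 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349214 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10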