Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PRFM (register, PLIL3STRM)

Test 1: uops

Code:

  prfm plil3strm, [x6]
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

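retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1004 | 2135 | 1001 | 1 | 1000 | 1000 | 35686 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2143 | 1001 | 1 | 1000 | 1000 | 35686 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2143 | 1001 | 1 | 1000 | 1000 | 35686 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2155 | 1001 | 1 | 1000 | 1000 | 35686 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2155 | 1001 | 1 | 1000 | 1000 | 35686 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2146 | 1001 | 1 | 1000 | 1000 | 35358 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2115 | 1001 | 1 | 1000 | 1000 | 35606 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2117 | 1001 | 1 | 1000 | 1000 | 35326 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2144 | 1001 | 1 | 1000 | 1000 | 35638 | 1000 | 1000 | 1000 | 1 | 1000
1004 | 2128 | 1001 | 1 | 1000 | 1000 | 35642 | 1000 | 1000 | 1000 | 1 | 1000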

Test 2: throughput

Code:

  prfm plil3strm, [x6]
  add x6, x6, 64

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0176

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20204 | 21254 | 20103 | 10103 | 10000 | 10102 | 10004 | 61269 | 351167 | 20113 | 10211 | 10011 | 10212 | 10012 | 10005 | 10000 | 10100
20204 | 20176 | 20105 | 10105 | 10000 | 10110 | 10004 | 61289 | 351173 | 20114 | 10212 | 10012 | 10212 | 10012 | 10005 | 10000 | 10100
20204 | 20176 | 20105 | 10105 | 10000 | 10110 | 10004 | 61289 | 351173 | 20114 | 10212 | 10012 | 10212 | 10012 | 10005 | 10000 | 10100
20204 | 20176 | 20105 | 10105 | 10000 | 10110 | 10004 | 61289 | 351173 | 20114 | 10212 | 10012 | 10212 | 10012 | 10005 | 10000 | 10100
20204 | 20176 | 20105 | 10105 | 10000 | 10110 | 10004 | 61289 | 351173 | 20114 | 10212 | 10012 | 10212 | 10012 | 10005 | 10000 | 10100
20204 | 20176 | 20105 | 10105 | 10000 | 10110 | 10004 | 61289 | 351173 | 20114 | 10212 | 10012 | 10212 | 10012 | 10005 | 10000 | 10100
20204 | 20176 | 20105 | 10105 | 10000 | 10110 | 10004 | 61289 | 351173 | 20114 | 10212 | 10012 | 10212 | 10012 | 10005 | 10000 | 10100
20204 | 20176 | 20105 | 10105 | 10000 | 10110 | 10004 | 61289 | 351173 | 20114 | 10212 | 10012 | 10212 | 10012 | 10005 | 10000 | 10100
20204 | 20176 | 20105 | 10105 | 10000 | 10110 | 10004 | 61289 | 351173 | 20114 | 10212 | 10012 | 10212 | 10012 | 10005 | 10000 | 10100
20204 | 20176 | 20105 | 10105 | 10000 | 10110 | 10004 | 61289 | 351173 | 20114 | 10212 | 10012 | 10212 | 10012 | 10005 | 10000 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0145

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20024 | 21164 | 20012 | 10012 | 10000 | 10016 | 10000 | 61215 | 349043 | 20015 | 10027 | 10007 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20041 | 20011 | 10011 | 10000 | 10010 | 10000 | 61201 | 349043 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20041 | 20011 | 10011 | 10000 | 10010 | 10000 | 61201 | 349043 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20041 | 20011 | 10011 | 10000 | 10010 | 10000 | 61201 | 349043 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20041 | 20011 | 10011 | 10000 | 10010 | 10000 | 61201 | 349043 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20041 | 20011 | 10011 | 10000 | 10010 | 10000 | 60467 | 350971 | 20015 | 10027 | 10008 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20068 | 20011 | 10011 | 10000 | 10010 | 10000 | 61105 | 349673 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20068 | 20011 | 10011 | 10000 | 10010 | 10000 | 61105 | 349673 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20068 | 20011 | 10011 | 10000 | 10010 | 10000 | 61105 | 349673 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010
20024 | 20068 | 20011 | 10011 | 10000 | 10010 | 10000 | 61105 | 349673 | 20010 | 10020 | 10000 | 10020 | 10000 | 10001 | 10000 | 10010

Test 3: throughput

Code:

  prfm plil3strm, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0048

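retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10006 | 300 | 349012 | 10106 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 20058 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 347338 | 10100 | 200 | 10008 | 200 | 10012 | 1 | 10000 | 100
10204 | 19812 | 10101 | 101 | 10000 | 100 | 10004 | 300 | 345536 | 10104 | 200 | 10012 | 200 | 10012 | 1 | 10000 | 100
10204 | 19937 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 348582 | 10100 | 200 | 10004 | 200 | 10010 | 1 | 10000 | 100
10204 | 20160 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 350364 | 10100 | 200 | 10004 | 200 | 10008 | 1 | 10000 | 100
10204 | 20124 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 350566 | 10100 | 200 | 10008 | 200 | 10004 | 1 | 10000 | 100
10204 | 20098 | 10101 | 101 | 10000 | 100 | 10004 | 300 | 349494 | 10104 | 200 | 10012 | 200 | 10004 | 1 | 10000 | 100
10204 | 20120 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 349720 | 10100 | 200 | 10008 | 200 | 10012 | 1 | 10000 | 100
10204 | 20098 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 350696 | 10100 | 200 | 10008 | 200 | 10012 | 1 | 10000 | 100
10204 | 20043 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 350440 | 10100 | 200 | 10008 | 200 | 10008 | 1 | 10000 | 100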

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0070

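retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10024 | 19350 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 331478 | 10010 | 20 | 10000 | 20 | 10006 | 1 | 10000 | 10
10024 | 20890 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 362272 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20786 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 360070 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20725 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 362558 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20627 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 349918 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20089 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 350036 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20094 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 351062 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20095 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 350044 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20070 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 348004 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10
10024 | 20746 | 10131 | 11 | 10120 | 10 | 10000 | 30 | 348884 | 10010 | 20 | 10000 | 20 | 10000 | 1 | 10000 | 10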