Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STP (pre-index, D)

Test 1: uops

Code:

  stp d0, d1, [x6, #0x10]!
  nop ; nop ; nop ; nop ; nop ; nop ; nop

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 2.000

Issues: 3.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee)
90064349304310151014101410141014100030008768400030001000100020002000100110001000
90042400300110011000100010001000100030008757400030001000100020002000100110001000
90042306300110011000100010001000100030008759400030001000100020002000100110001000
90042316300110011000100010001000100030008760400030001000100020002000100110001000
90042306300110011000100010001000100030008759400030001000100020002000100110001000
90042585300110011000100010001000100030008763400030001000100020002000100110001000
90042313300110011000100010001000100030008762400030001000100020002000100110001000
90042306300110011000100010001000100030008761400030001000100020002000100110001000
90042306300110011000100010001000100030008761400030001000100020002000100110001000
90042369300110011000100010001000100030008762400030001000100020002000100110001000

Test 2: Latency 3->3

Code:

  stp d0, d1, [x6, #0x10]!

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.1419

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
20224146833090210413103091018010414103111000036511199431400223011020010005100052002001420014100061000010000100
20204115043010910105100041000010106100061000235696200002400263011420010006100062002001220012100051000010000100
20204115603010910105100041000010106100061000035692197844400183010820010006100062002001220012100051000010000100
20204116053010510103100021000010102100021000135699199395400263011320010006100062002001220012100051000010000100
20204118133010910105100041000010106100061000135695192882400263011320010006100062002001220012100051000010000100
20204116713010910105100041000010106100061000035677190068400183010820010004100042002001220012100051000010000100
20204115313010910105100041000010106100061000035658194658400183010820010006100062002001220012100051000010000100
20204114553010910105100041000010106100061000235682199197400263011420010006100062002001220012100051000010000100
20204116343010910105100041000010106100061000135655198758400263011320010006100062002001220012100051000010000100
20204116903010910105100041000010106100061000235687199467400263011420010006100062002001220012100051000010000100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.1379

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
20044155213081310322103111018010321103111000136700194729400263002320100061000620200002000010001100001000010
20024113913001110011100001000010010100001000035625194580400003001020100001000020200002000010001100001000010
20024113793001110011100001000010010100001000035625194301400003001020100001000020200002000010001100001000010
20024114183001110011100001000010010100001000035623193391400003001020100001000020200002000010001100001000010
20024113493001110011100001000010010100001000035628193337400003001020100001000020200002000010001100001000010
20024113523001110011100001000010010100001000035625194058400003001020100001000020200002000010001100001000010
20024114133001110011100001000010010100001000035623193320400003001020100001000020200002000010001100001000010
20024113373001110011100001000010010100001000035622194273400003001020100001000020200002000010001100001000010
20024113683001110011100001000010010100001000035625194508400003001020100001000020200002000010001100001000010
20024113803001110011100001000010010100001000035622194509400003001020100001000020200002000010001100001000010

Test 3: throughput

Count: 8

Code:

  stp d0, d1, [x6, #0x10]!
  stp d0, d1, [x7, #0x10]!
  stp d0, d1, [x8, #0x10]!
  stp d0, d1, [x9, #0x10]!
  stp d0, d1, [x10, #0x10]!
  stp d0, d1, [x11, #0x10]!
  stp d0, d1, [x12, #0x10]!
  stp d0, d1, [x13, #0x10]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
1602248257424089380407803068018080408803088000224031813602653200262401142008000680006200160012160012800058000080000100
1602048005624010980105800048000080106800068000224031813601173200262401142008000680006200160012160012800058000080000100
1602048004824010980105800048000080106800068000224031813601173200262401142008000680006200160012160012800058000080000100
1602048004824010980105800048000080106800068000224031813601173200262401142008000680006200160072160072800358000080000100
1602048005624010980105800048000080106800068000224031813601173200262401142008000680006200160012160012800058000080000100
1602048004824010980105800048000080106800068000224031813601173200262401142008000680006200160012160012800058000080000100
1602048004924010980105800048000080106800068000224031813602113200262401142008000680006200160012160012800058000080000100
1602048004524010980105800048000080106800068000224031813600633200262401142008000680006200160012160012800058000080000100
1602048004524010980105800048000080106800068000224031813600633200262401142008000680006200160012160012800058000080000100
1602048004524010980105800048000080106800068000224031813600633200262401142008000680006200160012160012800058000080000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0012

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
1600448215924081780324803138018080325803158000324005113602983200302400272080007800072016000016000080001800008000010
1600248005324001180011800008000080010800008000024003013602053200002400102080000800002016000016000080001800008000010
1600258022224009780045800348001880046800368000024003013612313200002400102080000800002016000016000080001800008000010
1600248010024001180011800008000080010800008000024003013612313200002400102080000800002016000016000080001800008000010
1600248009824001180011800008000080010800008000024003013609973200002400102080000800002016007216007280035800008000010
1600248009824001180011800008000080010800008000024003013610873200002400102080000800002016000016000080001800008000010
1600248009824001180011800008000080010800008000024003013610153200002400102080000800002016000016000080001800008000010
1600248009824001180011800008000080010800008000024003013610153200002400102080000800002016000016000080001800008000010
1600248009824001180011800008000080010800008000024003013610153200002400102080000800002016000016000080001800008000010
1600248009824001180011800008000080010800008000024003013610153200002400102080000800002016000016000080001800008000010