Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STP (pre-index, S)

Test 1: uops

Code:

  stp s0, s1, [x6, #0x10]!
  nop ; nop ; nop ; nop ; nop ; nop ; nop

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 2.000

Issues: 3.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

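retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee)
9006 | 4238 | 3043 | 1015 | 1014 | 1014 | 1014 | 1014 | 1000 | 3000 | 8773 | 4000 | 3000 | 1000 | 1000 | 2000 | 2000 | 1001 | 1000 | 1000
9004 | 2431 | 3001 | 1001 | 1000 | 1000 | 1000 | 1000 | 1000 | 3000 | 8762 | 4000 | 3000 | 1000 | 1000 | 2000 | 2000 | 1001 | 1000 | 1000
9004 | 2325 | 3001 | 1001 | 1000 | 1000 | 1000 | 1000 | 1000 | 3000 | 8770 | 4000 | 3000 | 1000 | 1000 | 2000 | 2000 | 1001 | 1000 | 1000
9004 | 2306 | 3001 | 1001 | 1000 | 1000 | 1000 | 1000 | 1000 | 3000 | 8816 | 4000 | 3000 | 1000 | 1000 | 2000 | 2000 | 1001 | 1000 | 1000
9004 | 2306 | 3001 | 1001 | 1000 | 1000 | 1000 | 1000 | 1000 | 3000 | 8775 | 4000 | 3000 | 1000 | 1000 | 2000 | 2000 | 1001 | 1000 | 1000
9004 | 2306 | 3001 | 1001 | 1000 | 1000 | 1000 | 1000 | 1000 | 3000 | 8762 | 4000 | 3000 | 1000 | 1000 | 2000 | 2000 | 1001 | 1000 | 1000
9004 | 2306 | 3001 | 1001 | 1000 | 1000 | 1000 | 1000 | 1000 | 3000 | 8806 | 4000 | 3000 | 1000 | 1000 | 2000 | 2000 | 1001 | 1000 | 1000
9004 | 2306 | 3001 | 1001 | 1000 | 1000 | 1000 | 1000 | 1000 | 3000 | 8762 | 4000 | 3000 | 1000 | 1000 | 2000 | 2000 | 1001 | 1000 | 1000
9004 | 2306 | 3001 | 1001 | 1000 | 1000 | 1000 | 1000 | 1000 | 3000 | 8798 | 4000 | 3000 | 1000 | 1000 | 2000 | 2000 | 1001 | 1000 | 1000
9004 | 2306 | 3001 | 1001 | 1000 | 1000 | 1000 | 1000 | 1000 | 3000 | 8762 | 4000 | 3000 | 1000 | 1000 | 2000 | 2000 | 1001 | 1000 | 1000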

Test 2: Latency 3->3

Code:

  stp s0, s1, [x6, #0x10]!

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.1379

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
20224 | 15038 | 30900 | 10412 | 10308 | 10180 | 10411 | 10308 | 10002 | 38276 | 194183 | 40030 | 30116 | 200 | 10007 | 10007 | 200 | 20014 | 20014 | 10006 | 10000 | 10000 | 100
20204 | 11372 | 30109 | 10105 | 10004 | 10000 | 10106 | 10006 | 10001 | 35749 | 193796 | 40026 | 30113 | 200 | 10006 | 10006 | 200 | 20012 | 20012 | 10005 | 10000 | 10000 | 100
20204 | 11371 | 30109 | 10105 | 10004 | 10000 | 10106 | 10006 | 10001 | 35746 | 194410 | 40026 | 30113 | 200 | 10006 | 10006 | 200 | 20012 | 20012 | 10005 | 10000 | 10000 | 100
20204 | 11367 | 30109 | 10105 | 10004 | 10000 | 10106 | 10006 | 10001 | 35745 | 194428 | 40026 | 30113 | 200 | 10006 | 10006 | 200 | 20012 | 20012 | 10005 | 10000 | 10000 | 100
20204 | 11412 | 30109 | 10105 | 10004 | 10000 | 10106 | 10006 | 10001 | 35748 | 193600 | 40026 | 30113 | 200 | 10006 | 10006 | 200 | 20012 | 20012 | 10005 | 10000 | 10000 | 100
20204 | 11376 | 30109 | 10105 | 10004 | 10000 | 10106 | 10006 | 10001 | 35746 | 194032 | 40026 | 30113 | 200 | 10006 | 10006 | 200 | 20012 | 20012 | 10005 | 10000 | 10000 | 100
20204 | 11399 | 30109 | 10105 | 10004 | 10000 | 10106 | 10006 | 10001 | 35746 | 193798 | 40026 | 30113 | 200 | 10006 | 10006 | 200 | 20012 | 20012 | 10005 | 10000 | 10000 | 100
20204 | 11379 | 30109 | 10105 | 10004 | 10000 | 10106 | 10006 | 10001 | 35747 | 194824 | 40026 | 30113 | 200 | 10006 | 10006 | 200 | 20012 | 20012 | 10005 | 10000 | 10000 | 100
20204 | 11391 | 30109 | 10105 | 10004 | 10000 | 10106 | 10006 | 10001 | 35745 | 194086 | 40026 | 30113 | 200 | 10006 | 10006 | 200 | 20012 | 20012 | 10005 | 10000 | 10000 | 100
20204 | 11390 | 30109 | 10105 | 10004 | 10000 | 10106 | 10006 | 10001 | 35748 | 194662 | 40026 | 30113 | 200 | 10006 | 10006 | 200 | 20012 | 20012 | 10005 | 10000 | 10000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.1384

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
20044 | 14977 | 30813 | 10322 | 10311 | 10180 | 10323 | 10313 | 10001 | 35287 | 194310 | 40026 | 30023 | 20 | 10006 | 10006 | 20 | 20000 | 20000 | 10001 | 10000 | 10000 | 10
20024 | 11384 | 30011 | 10011 | 10000 | 10000 | 10010 | 10000 | 10000 | 35623 | 193590 | 40000 | 30010 | 20 | 10000 | 10000 | 20 | 20000 | 20000 | 10001 | 10000 | 10000 | 10
20024 | 11360 | 30011 | 10011 | 10000 | 10000 | 10010 | 10000 | 10000 | 35623 | 193842 | 40000 | 30010 | 20 | 10000 | 10000 | 20 | 20000 | 20000 | 10001 | 10000 | 10000 | 10
20024 | 11386 | 30011 | 10011 | 10000 | 10000 | 10010 | 10000 | 10000 | 35624 | 194022 | 40000 | 30010 | 20 | 10000 | 10000 | 20 | 20000 | 20000 | 10001 | 10000 | 10000 | 10
20024 | 11380 | 30011 | 10011 | 10000 | 10000 | 10010 | 10000 | 10000 | 35616 | 195255 | 40000 | 30010 | 20 | 10000 | 10000 | 20 | 20000 | 20000 | 10001 | 10000 | 10000 | 10
20024 | 11478 | 30011 | 10011 | 10000 | 10000 | 10010 | 10000 | 10000 | 35628 | 194328 | 40000 | 30010 | 20 | 10000 | 10000 | 20 | 20000 | 20000 | 10001 | 10000 | 10000 | 10
20024 | 11632 | 30011 | 10011 | 10000 | 10000 | 10010 | 10000 | 10000 | 35623 | 194617 | 40000 | 30010 | 20 | 10000 | 10000 | 20 | 20000 | 20000 | 10001 | 10000 | 10000 | 10
20024 | 11367 | 30011 | 10011 | 10000 | 10000 | 10010 | 10000 | 10000 | 35590 | 193841 | 40000 | 30010 | 20 | 10000 | 10000 | 20 | 20000 | 20000 | 10001 | 10000 | 10000 | 10
20024 | 11445 | 30011 | 10011 | 10000 | 10000 | 10010 | 10000 | 10000 | 35624 | 193860 | 40000 | 30010 | 20 | 10000 | 10000 | 20 | 20000 | 20000 | 10001 | 10000 | 10000 | 10
20024 | 11366 | 30011 | 10011 | 10000 | 10000 | 10010 | 10000 | 10000 | 35625 | 194112 | 40000 | 30010 | 20 | 10000 | 10000 | 20 | 20000 | 20000 | 10001 | 10000 | 10000 | 10

Test 3: throughput

Count: 8

Code:

  stp s0, s1, [x6, #0x10]!
  stp s0, s1, [x7, #0x10]!
  stp s0, s1, [x8, #0x10]!
  stp s0, s1, [x9, #0x10]!
  stp s0, s1, [x10, #0x10]!
  stp s0, s1, [x11, #0x10]!
  stp s0, s1, [x12, #0x10]!
  stp s0, s1, [x13, #0x10]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160224 | 82071 | 240893 | 80407 | 80306 | 80180 | 80408 | 80308 | 80002 | 240318 | 1360265 | 320026 | 240114 | 200 | 80006 | 80006 | 200 | 160012 | 160012 | 80005 | 80000 | 80000 | 100
160204 | 80056 | 240109 | 80105 | 80004 | 80000 | 80106 | 80006 | 80002 | 240318 | 1360265 | 320026 | 240114 | 200 | 80006 | 80006 | 200 | 160012 | 160012 | 80005 | 80000 | 80000 | 100
160204 | 80056 | 240109 | 80105 | 80004 | 80000 | 80106 | 80006 | 80032 | 240408 | 1361076 | 320146 | 240204 | 200 | 80036 | 80036 | 200 | 160012 | 160012 | 80005 | 80000 | 80000 | 100
160204 | 80056 | 240109 | 80105 | 80004 | 80000 | 80106 | 80006 | 80002 | 240318 | 1360265 | 320026 | 240114 | 200 | 80006 | 80006 | 200 | 160012 | 160012 | 80005 | 80000 | 80000 | 100
160204 | 80056 | 240109 | 80105 | 80004 | 80000 | 80106 | 80006 | 80002 | 240318 | 1360265 | 320026 | 240114 | 200 | 80006 | 80006 | 200 | 160012 | 160012 | 80005 | 80000 | 80000 | 100
160204 | 80056 | 240109 | 80105 | 80004 | 80000 | 80106 | 80006 | 80002 | 240318 | 1360265 | 320026 | 240114 | 200 | 80006 | 80006 | 200 | 160072 | 160072 | 80035 | 80000 | 80000 | 100
160204 | 80058 | 240109 | 80105 | 80004 | 80000 | 80106 | 80006 | 80003 | 240321 | 1360519 | 320030 | 240117 | 200 | 80007 | 80007 | 200 | 160012 | 160012 | 80005 | 80000 | 80000 | 100
160204 | 80056 | 240109 | 80105 | 80004 | 80000 | 80106 | 80006 | 80002 | 240318 | 1360216 | 320026 | 240114 | 200 | 80006 | 80006 | 200 | 160012 | 160012 | 80005 | 80000 | 80000 | 100
160204 | 80056 | 240109 | 80105 | 80004 | 80000 | 80106 | 80006 | 80002 | 240318 | 1360265 | 320026 | 240114 | 200 | 80006 | 80006 | 200 | 160012 | 160012 | 80005 | 80000 | 80000 | 100
160204 | 80056 | 240109 | 80105 | 80004 | 80000 | 80106 | 80006 | 80002 | 240318 | 1360543 | 320026 | 240114 | 200 | 80006 | 80006 | 200 | 160012 | 160012 | 80005 | 80000 | 80000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160044 | 82053 | 240817 | 80324 | 80313 | 80180 | 80325 | 80315 | 80002 | 240048 | 1360211 | 320026 | 240024 | 20 | 80006 | 80006 | 20 | 160012 | 160012 | 80005 | 80000 | 80000 | 10
160024 | 80045 | 240011 | 80011 | 80000 | 80000 | 80010 | 80000 | 80000 | 240030 | 1360057 | 320000 | 240010 | 20 | 80000 | 80000 | 20 | 160000 | 160000 | 80001 | 80000 | 80000 | 10
160024 | 80045 | 240011 | 80011 | 80000 | 80000 | 80010 | 80000 | 80000 | 240030 | 1360057 | 320000 | 240010 | 20 | 80000 | 80000 | 20 | 160000 | 160000 | 80001 | 80000 | 80000 | 10
160024 | 80045 | 240011 | 80011 | 80000 | 80000 | 80010 | 80000 | 80000 | 240030 | 1360057 | 320000 | 240010 | 20 | 80000 | 80000 | 20 | 160000 | 160000 | 80001 | 80000 | 80000 | 10
160024 | 80045 | 240011 | 80011 | 80000 | 80000 | 80010 | 80000 | 80032 | 240138 | 1361039 | 320146 | 240114 | 20 | 80036 | 80036 | 20 | 160000 | 160000 | 80001 | 80000 | 80000 | 10
160024 | 80045 | 240011 | 80011 | 80000 | 80000 | 80010 | 80000 | 80000 | 240030 | 1360057 | 320000 | 240010 | 20 | 80000 | 80000 | 20 | 160000 | 160000 | 80001 | 80000 | 80000 | 10
160024 | 80045 | 240011 | 80011 | 80000 | 80000 | 80010 | 80000 | 80000 | 240030 | 1360057 | 320000 | 240010 | 20 | 80000 | 80000 | 20 | 160000 | 160000 | 80001 | 80000 | 80000 | 10
160025 | 80154 | 240097 | 80045 | 80034 | 80018 | 80046 | 80036 | 80000 | 240030 | 1360057 | 320000 | 240010 | 20 | 80000 | 80000 | 20 | 160000 | 160000 | 80001 | 80000 | 80000 | 10
160024 | 80045 | 240011 | 80011 | 80000 | 80000 | 80010 | 80000 | 80000 | 240030 | 1360057 | 320000 | 240010 | 20 | 80000 | 80000 | 20 | 160000 | 160000 | 80001 | 80000 | 80000 | 10
160024 | 80045 | 240011 | 80011 | 80000 | 80000 | 80010 | 80000 | 80000 | 240030 | 1360057 | 320000 | 240010 | 20 | 80000 | 80000 | 20 | 160000 | 160000 | 80001 | 80000 | 80000 | 10