Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPA (32-bit)

Test 1: uops

Code:

  swpa w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

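retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
72006 | 34634 | 2011 | 1 | 2010 | 2000 | 11786 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34160 | 2001 | 1 | 2000 | 2000 | 11786 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34133 | 2001 | 1 | 2000 | 2000 | 11786 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34113 | 2001 | 1 | 2000 | 2002 | 11796 | 2002 | 2002 | 4000 | 1 | 2000
72004 | 34130 | 2001 | 1 | 2000 | 2000 | 11786 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34133 | 2001 | 1 | 2000 | 2000 | 11786 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34254 | 2001 | 1 | 2000 | 2000 | 11786 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34131 | 2001 | 1 | 2000 | 2000 | 11786 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34130 | 2001 | 1 | 2000 | 2000 | 11786 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34130 | 2001 | 1 | 2000 | 2000 | 11786 | 2000 | 2000 | 4000 | 1 | 2000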

Test 2: throughput

Code:

  swpa w0, w1, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0012

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30208607053021510138200771013920004352521256743010610202200041020640025100062000010100
30204600193010110101200001010120002352491257243010310201200031020140005100012000010100
30204600123010110101200001010120002352491257073010310201200031020140005100012000010100
30204600123010110101200001010120002352491257203010310201200031020140005100012000010100
30204600123010110101200001010120014352701259133012010206200151020140005100012000010100
30204600123010110101200001010120002352491257293010310201200031020140005100012000010100
30204600123010110101200001010120002352491257023010310201200031020140005100012000010100
30204600123010110101200001010120002352491257313010310201200031020140005100012000010100
30204600123010110101200001010120002352491257233010310201200031020140005100012000010100
30204600123010110101200001010120002352491257213010310201200031020140005100012000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0019

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3002760373300801003320047100332000235069125983300131002120003100204000001000120000010010
3002460019300111001120000100102000035066125962300101002020000100204000001000120000010010
3002560026300251001520010100162000035066126008300101002020000100204000001000120000010010
3002460019300111001120000100102000035066125960300101002020000100204000001000120000010010
3002460032300111001120000100102000035066125935300101002020000100204000001000120000010010
3002460019300111001120000100102000035066125980300101002020000100204000001000120000010010
3002460019300111001120000100102000035066125974300101002020000100204000001000120000010010
3002460019300111001120000100102000035066125991300101002020000100204000001000120000010010
3002460019300111001120000100102000035066125977300101002020000100204000001000120000010010
3002460019300111001120000100102000035066125960300101002020000100204000001000120000010010

Test 3: throughput

Code:

  swpa w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 22.0046

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
20205220148201241012002310020046300217686720146200200462004000801200000100
20204220046201051012000410020004300217554120104200200042004000801200000100
20204220051201051012000410020004300217551920104200200042004000801200000100
20204220046201051012000410020004300217554120104200200042004000801200000100
20204220046201051012000410020024300217570720124200200242004000801200000100
20204220046201051012000410020004300217641620104200200042004000801200000100
20204220046201051012000410020004300217554120104200200042004000801200000100
20204220046201051012000410020004300217554120104200200042004000801200000100
20204220046201051012000410020004300217554120104200200042004000801200000100
20204220046201051012000410020004300217558020104200200042004000801200000100

1000 unrolls and 10 iterations

Result (median cycles for code): 22.0044

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
200252201442003411200231020004302172801200142020004204000012000010
200242200442001111200001020000302172783200102020000204004812000010
200242200442001111200001020000302172783200102020000204000012000010
200242200442001111200001020000302172783200102020000204000012000010
200242200442001111200001020000302172783200102020000204004812000010
200242200442001111200001020000302172783200102020000204000012000010
200242200442001111200001020000302172783200102020000204000012000010
200242200442001111200001020000302172803200102020000204005212000010
200242200442001111200001020000302172796200102020000204000012000010
200242200442001111200001020000302172783200102020000204000012000010