Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPAH

Test 1: uops

Code:

  swpah w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
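72005 | 34371 | 2005 | 1 | 2004 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34118 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34144 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34861 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34450 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 35195 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34110 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34187 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34285 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34270 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000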

Test 2: throughput

Code:

  swpah w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0019

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
3020760584301851012802005710131020004352541254343010610202200041020240008100012000010100
16606331621656455580110065560020004352521254933010610202200041020140005100012000010100
3020460028301031010102000210102020016352851260013012310207200171020140005100012000010100
1422823084116837057046266613020004352511254353010610202200041020140005100012000010100
3020560026301151010502001010106020002352501254923010310201200031020140005100012000010100
3020460019301011010102000010101020002352501254843010310201200031020140005100012000010100
3020460019301011010102000010101020002352491254983010310201200031020140005100012000010100
3020460019301011010102000010101020002352491255153010310201200031020640025100062000010100
3020460019301011010102000010101020002352491254703010310201200031020140005100012000010100
3020460019301011010102000010101020002352501255433010310201200031020140005100012000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0015

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30026602823006010027200331002720002350691256363001310021200031002040000100012000010010
30024600223001110011200001001020000350501255973001010020200001002040000100012000010010
30024600153001110011200001001020000350501255693001010020200001002040000100012000010010
30024600153001110011200001001020000350501255633001010020200001002040000100012000010010
30024600153001110011200001001020000350501255533001010020200001002040000100012000010010
30024600153001110011200001001020000350501255223001010020200001002040000100012000010010
30024600153001110011200001001020000350501255413001010020200001002040000100012000010010
30024600153001110011200001001020000350501255163001010020200001002040000100012000010010
30024600153001110011200001001020000350501255273001010020200001002040000100012000010010
30024600153001110011200001001020000350501255243001010020200001002040000100012000010010

Test 3: throughput

Code:

  swpah w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 22.0044

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
202072203132017010320067102200043002175606201042002000420040008120000100
202052200822012710120026100200043002175590201042002000420040008120000100
202042200442010510120004100200043002175606201042002000420040008120000100
202042200442010510120004100200043002175606201042002000420040008120000100
202042200442010510120004100200043002175606201042002000420040008120000100
202052200712012510120024100200043002175606201042002000420040008120000100
202052201182012510120024100200043002175606201042002000420040008120000100
202042200462010510120004100200043002175638201042002000420040008120000100
202042200462010510120004100200043002175646201042002000420040008120000100
202042200462010510120004100200043002175646201042002000420040008120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 22.0046

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
200252203782003511200241020004302172819200142020004204000012000010
200242200462001111200001020000302172823200102020000204000012000010
200242200462001111200001020024302173267200342020024204000012000010
200242200462001111200001020000302172823200102020000204000012000010
200242200462001111200001020000302172823200102020000204000012000010
200252200732003511200241020000302172823200102020000204000012000010
200242200462001111200001020000302172823200102020000204000012000010
200242200462001111200001020000302172823200102020000204005212000010
200242200392001111200001020000302172823200102020000204004812000010
200242200462001111200001020000302172823200102020000204000012000010