Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPAB

Test 1: uops

Code:

  swpab w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

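| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 72005 | 34549 | 2011 | 1 | 2010 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000 |
| 72004 | 34206 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000 |
| 72004 | 34161 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000 |
| 72004 | 34166 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000 |
| 72004 | 34191 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000 |
| 72004 | 34210 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000 |
| 72004 | 34169 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000 |
| 72004 | 34208 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000 |
| 72004 | 34218 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000 |
| 72004 | 34173 | 2001 | 1 | 2000 | 2000 | 11782 | 2000 | 2000 | 4000 | 1 | 2000 |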

Test 2: throughput

Code:

  swpab w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0012

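| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30206 | 60300 | 30148 | 10116 | 20032 | 10119 | 20004 | 35236 | 125236 | 30106 | 10202 | 20004 | 10201 | 40005 | 10001 | 20000 | 10100 |
| 30204 | 60018 | 30103 | 10101 | 20002 | 10102 | 20002 | 35233 | 125208 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100 |
| 30204 | 60012 | 30101 | 10101 | 20000 | 10101 | 20002 | 35233 | 125227 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100 |
| 30204 | 60012 | 30101 | 10101 | 20000 | 10101 | 20002 | 35233 | 125241 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100 |
| 30204 | 60012 | 30101 | 10101 | 20000 | 10101 | 20002 | 35233 | 125232 | 30103 | 10201 | 20003 | 10206 | 40025 | 10006 | 20000 | 10100 |
| 30204 | 60012 | 30101 | 10101 | 20000 | 10101 | 20010 | 35261 | 125367 | 30115 | 10205 | 20011 | 10201 | 40005 | 10001 | 20000 | 10100 |
| 30205 | 60022 | 30113 | 10105 | 20008 | 10105 | 20002 | 35233 | 125240 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100 |
| 30205 | 60016 | 30116 | 10106 | 20010 | 10106 | 20002 | 35233 | 125134 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100 |
| 30204 | 60012 | 30101 | 10101 | 20000 | 10101 | 20002 | 35233 | 125152 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100 |
| 30204 | 60012 | 30101 | 10101 | 20000 | 10101 | 20002 | 35233 | 125225 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100 |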

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0019

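| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30026 | 60272 | 30062 | 10027 | 20035 | 10029 | 20002 | 35069 | 125618 | 30013 | 10021 | 20003 | 10020 | 40000 | 10001 | 20000 | 10010 |
| 30024 | 60019 | 30011 | 10011 | 20000 | 10010 | 20000 | 35066 | 125567 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010 |
| 30024 | 60019 | 30011 | 10011 | 20000 | 10010 | 20000 | 35066 | 125570 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010 |
| 30024 | 60019 | 30011 | 10011 | 20000 | 10010 | 20000 | 35066 | 125584 | 30010 | 10020 | 20000 | 10025 | 40021 | 10005 | 20000 | 10010 |
| 30024 | 60019 | 30011 | 10011 | 20000 | 10010 | 20000 | 35066 | 125557 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010 |
| 30024 | 60019 | 30011 | 10011 | 20000 | 10010 | 20000 | 35066 | 125527 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010 |
| 30024 | 60019 | 30011 | 10011 | 20000 | 10010 | 20000 | 35066 | 125558 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010 |
| 30024 | 60019 | 30011 | 10011 | 20000 | 10010 | 20000 | 35066 | 125637 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010 |
| 30024 | 60019 | 30011 | 10011 | 20000 | 10010 | 20000 | 35066 | 125593 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010 |
| 30024 | 60019 | 30011 | 10011 | 20000 | 10010 | 20000 | 35066 | 125567 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010 |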

Test 3: throughput

Code:

  swpab w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 22.0046

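| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20205 | 220142 | 20124 | 101 | 20023 | 100 | 20004 | 300 | 2175584 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100 |
| 20204 | 220044 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2175606 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100 |
| 20204 | 220044 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2175606 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100 |
| 20204 | 220044 | 20105 | 101 | 20004 | 100 | 20024 | 300 | 2175772 | 20124 | 200 | 20024 | 200 | 40008 | 1 | 20000 | 100 |
| 20204 | 220044 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2175609 | 20104 | 200 | 20004 | 200 | 40016 | 1 | 20000 | 100 |
| 20204 | 220044 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2175606 | 20104 | 200 | 20004 | 200 | 40048 | 1 | 20000 | 100 |
| 20204 | 220049 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2175646 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100 |
| 20204 | 220046 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2175646 | 20104 | 200 | 20004 | 200 | 40048 | 1 | 20000 | 100 |
| 20204 | 220046 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2175694 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100 |
| 20204 | 220046 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2175646 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100 |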

1000 unrolls and 10 iterations

Result (median cycles for code): 22.0039

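| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20026 | 220240 | 20057 | 11 | 20046 | 10 | 20004 | 30 | 2172819 | 20014 | 20 | 20004 | 20 | 40000 | 1 | 20000 | 10 |
| 20024 | 220046 | 20011 | 11 | 20000 | 10 | 20026 | 30 | 2173039 | 20036 | 20 | 20026 | 20 | 40048 | 1 | 20000 | 10 |
| 20024 | 220039 | 20011 | 11 | 20000 | 10 | 20000 | 30 | 2172823 | 20010 | 20 | 20000 | 20 | 40000 | 1 | 20000 | 10 |
| 20024 | 220039 | 20011 | 11 | 20000 | 10 | 20000 | 30 | 2172843 | 20010 | 20 | 20000 | 20 | 40000 | 1 | 20000 | 10 |
| 20025 | 220073 | 20035 | 11 | 20024 | 10 | 20024 | 30 | 2173014 | 20034 | 20 | 20024 | 20 | 40000 | 1 | 20000 | 10 |
| 20024 | 220039 | 20011 | 11 | 20000 | 10 | 20000 | 30 | 2172823 | 20010 | 20 | 20000 | 20 | 40000 | 1 | 20000 | 10 |
| 20024 | 220039 | 20011 | 11 | 20000 | 10 | 20000 | 30 | 2172823 | 20010 | 20 | 20000 | 20 | 40000 | 1 | 20000 | 10 |
| 20024 | 220039 | 20011 | 11 | 20000 | 10 | 20024 | 30 | 2173054 | 20034 | 20 | 20024 | 20 | 40008 | 1 | 20000 | 10 |
| 20024 | 220039 | 20011 | 11 | 20000 | 10 | 20000 | 30 | 2172831 | 20010 | 20 | 20000 | 20 | 40000 | 1 | 20000 | 10 |
| 20024 | 220039 | 20011 | 11 | 20000 | 10 | 20000 | 30 | 2172823 | 20010 | 20 | 20000 | 20 | 40048 | 1 | 20000 | 10 |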