Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPA (64-bit)

Test 1: uops

Code:

  swpa x0, x1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
720053437420051200420001177220002000400012000
720043412920011200020001177220002000400012000
720043412720011200020001177220002000400012000
720043412820011200020001177220002000400012000
720043440020011200020001177220002000400012000
720043413120011200020001179820002000400012000
720043472720011200020001177220002000400012000
720043424720011200020001177220002000400012000
720043412720011200020001177220002000400012000
720043417020011200020001177220002000400012000

Test 2: throughput

Code:

  swpa x0, x1, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0022

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
3020960589302191013802008110139020004352521267443010610202200041020140005100012000010100
3020460025301031010102000210102020004352521268923010610202200041020140005100012000010100
3020560029301131010502000810105020002352491268193010310201200031020140005100012000010100
3020460019301011010102000010101020002352491269063010310201200031020140005100012000010100
3020460019301011010102000010101020002352491269183010310201200031020140005100012000010100
3020460019301011010102000010101020002352491269063010310201200031020140005100012000010100
3020460019301011010102000010101020002352491268983010310201200031020140005100012000010100
3020460019301011010102000010101020002352491268823010310201200031020140005100012000010100
3020460019301011010102000010101020010353171273243011510205200111020140005100012000010100
3020460019301011010102000010101020002352491269063010310201200031020140005100012000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0015

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30029605813012010045200751004520002350691271453001310021200031002640024100052000010010
30024600283001110011200001001020000350501268153001010020200001002040000100012000010010
30024600153001110011200001001020000350501267693001010020200001002040000100012000010010
30024600153001110011200001001020000350501268253001010020200001002040000100012000010010
30024600153001110011200001001020000350501268833001010020200001002040000100012000010010
30024600153001110011200001001020000350501267643001010020200001002040000100012000010010
30024600223001110011200001001020000350501270203001010020200001002040000100012000010010
30024600153001110011200001001020010350801273773002510025200111002540021100052000010010
30025600323003110017200141001720002350691274953001310021200031002040000100012000010010
30024600193001110011200001001020002350691271943001310021200031002140005100012000010010

Test 3: throughput

Code:

  swpa x0, x1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 22.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20205220150201271012002610020004300217560202010420020004020040008120000100
20204220046201051012000410020004300217554102010420020004020040048120000100
20204220046201051012000410020004300217554102010420020004020040008120000100
20204220039201051012000410020004300217554102010420020004020040008120000100
20204220039201051012000410020024300217570802012420020024020040008120000100
20204220039201051012000410020004300217554102010420020004020040008120000100
20204220039201051012000410020004300217558102010420020004020040008120000100
20204220044201051012000410020004300217636002010420020004020040008120000100
20204220037201051012000410020004300217552502010420020004020040008120000100
20205220065201251012002410020004300217558402010420020004020040008120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 22.0044

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
200262201732005411200431020004302172801200142020004204000012000010
200242200442001111200001020000302172783200102020000204000012000010
200242200442001111200001020024302173530200342020024204000012000010
200242200442001111200001020000302172783200102020000204000012000010
200242200442001111200001020000302172783200102020000204000012000010
200242200452001111200001020026302172934200362020026204000012000010
200242200442001111200001020000302172783200102020000204000012000010
200242200442001111200001020000302172783200102020000204000012000010
200252201112003711200261020000302172678200102020000204000012000010
200242200442001111200001020000302172783200102020000204000012000010