Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPAL (32-bit)

Test 1: uops

Code:

  swpal w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
720053437720051200420001178820002000400012000
720043413120011200020001178820002000400012000
720043417320011200020001179420002000401212000
720043421020011200020001178820002000400012000
720043411220011200020001178820002000400012000
720043411220011200020001178820002000400012000
720043410320011200020001178820002000400012000
720043408220011200020001178820002000400012000
720043416420011200020021179920022002400812000
720043462720011200020001178820002000400012000

Test 2: throughput

Code:

  swpal w0, w1, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0057

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30207604563018110126200551012720004352521256083010610202200041020240008100012000010100
30204600543010310101200021010220002352331255273010310201200031020140005100012000010100
30204600543010110101200001010120002352331255773010310201200031020140005100012000010100
30204600543010110101200001010120026352931258453013910213200271020140005100012000010100
30204600553010110101200001010120051353341266513017710227200511020140005100012000010100
30204600543010110101200001010120002352331255053010310201200031020140005100012000010100
30204600543010110101200001010120002352331255083010310201200031020140005100012000010100
30204600543010110101200001010120002352331254933010310201200031020140005100012000010100
30204600543010110101200001010120002352331255013010310201200031020140005100012000010100
30205601003013710113200241011320002352491256243010310201200031020140005100012000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0064

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30027604413009910039200601003920000350661260193001010020200001002040000100012000010010
30024600643001110011200001001020000350661259883001010020200001002040000100012000010010
30024600643001110011200001001020000350661259763001010020200001002040000100012000010010
30024600643001110011200001001020000350661259993001010020200001002040000100012000010010
30024600643001110011200001001020000350661259783001010020200001002040000100012000010010
30024600643001110011200001001020000350661259953001010020200001002040000100012000010010
30024600643001110011200001001020000350661259913001010020200001002040000100012000010010
30024600643001110011200001001020000350661259873001010020200001003340053100132000010010
30024600643001110011200001001020000350661259263001010020200001002040000100012000010010
30024600643001110011200001001020000350661259803001010020200001002040000100012000010010

Test 3: throughput

Code:

  swpal w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 24.0046

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20205240150201251010200241000200043002365831201042002000420040048120000100
20204240046201051010200041000200043002365842201042002000420040008120000100
20204240046201051010200041000200043002365842201042002000420040008120000100
20204240046201051010200041000200043002365842201042002000420040008120000100
20204240046201051010200041000200043002365842201042002000420040008120000100
20204240046201051010200041000200043002365851201042002000420040008120000100
20204240046201051010200041000200043002365842201042002000420040008120000100
20204240047201051010200041000200043002365842201042002000420040008120000100
20204240046201051010200041000200043002365842201042002000420040008120000100
20205240114201251010200241000200043002365842201042002000420040008120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 24.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
200252401532003511020024100200243023635442003420200242040052120000010
200242400402001111020000100200003023626442001020200002040000120000010
200242400392001111020000100200203023631322003020200202040048120000010
200242400392001111020000100200003023626442001020200002040000120000010
200242400392001111020000100200003023626442001020200002040000120000010
200242400392001111020000100200243023628242003420200242040000120000010
200242400392001111020000100200003023626442001020200002040000120000010
200242400392001111020000100200003023626442001020200002040000120000010
200262400932005511020044100200003023626442001020200002040000120000010
200242400392001111020000100200003023626442001020200002040000120000010