Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPALB

Test 1: uops

Code:

  swpalb w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
720053468820051200420001181220002000400012000
720043408020011200020001178820002000400012000
720043407820011200020001178820002000400012000
720043407720011200020001178820002000400012000
720043408020011200020001178820002000400012000
720043434520011200020001178820002000400012000
720043418720011200020001178820002000400012000
720043407720011200020001178820002000400012000
720043407820011200020001178820002000400012000
720043407720011200020001178820002000400112000

Test 2: throughput

Code:

  swpalb w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0061

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30206603233015710119200381012020004352521253673010610202200041020240008100012000010100
30204600613010110101200001010120002352491252893010310201200031020140005100012000010100
30204600613010110101200001010120002352491253233010310201200031020140005100012000010100
30204600643010110101200001010120002352491252883010310201200031020140005100012000010100
30204600613010110101200001010120002352491253003010310201200031020140005100012000010100
30204600613010110101200001010120002352491253193010310201200031020140005100012000010100
30204600613010110101200001010120002352491253693010310201200031021340053100132000010100
30204600613010110101200001010120002352491253113010310201200031020140005100012000010100
30204600613010110101200001010120002352491253223010310201200031020140005100012000010100
30204600613010110101200001010120002352491253383010310201200031021140049100112000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0064

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30026603263006110027200341002720002350691256223001310021200031002040000100012000010010
30024600613001110011200001001020000350661255553001010020200001002040000100012000010010
30024600613001110011200001001020002350691261033001310021200031002040000100012000010010
30024600643001110011200001001020000350661258363001010020200001002040000100012000010010
30024600643001110011200001001020000350661258033001010020200001002040000100012000010010
30024600643001110011200001001020000350661258013001010020200001002040000100012000010010
30024600643001110011200001001020000350661258083001010020200001002040000100012000010010
30024600643001110011200001001020000350661256563001010020200001002040000100012000010010
30024600643001110011200001001020000350661256503001010020200001002040000100012000010010
30024600643001110011200001001020000350661256363001010020200001002040000100012000010010

Test 3: throughput

Code:

  swpalb w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 24.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
202052403852012710120026100200043002365820201042002000420040008120000100
202042400462010510120004100200043002365842201042002000420040008120000100
202042400392010510120004100200043002365842201042002000420040048120000100
202042400392010510120004100200043002365842201042002000420040008120000100
202042400392010510120004100200043002365842201042002000420040008120000100
202042400392010510120004100200043002365842201042002000420040008120000100
202042400392010510120004100200043002365842201042002000420040008120000100
202042400412010510120004100200243002366004201242002002420040008120000100
202042400392010510120004100200043002365842201042002000420040008120000100
202042400392010510120004100200043002365842201042002000420040008120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 24.0037

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
200252401502003511020024100200243023627742003420200242040048120000010
200242400372001111020000100200563023636122006620200562040052120000010
297682576562877451093223633476930221165299623994242547348632240989243464441121779045411
200252400982003611020025100200003023626162001020200002040000120000010
200242400372001111020000100200003023626042001020200002040040120000010
200242400372001111020000100200003023626042001020200002040000120000010
200242400372001111020000100200003023626042001020200002040000120000010
200252400642003711020026100200003023626342001020200002040000120000010
200242400372001111020000100200003023626342001020200002040000120000010
200242400372001111020000100200003023626042001020200002040048120000010