Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPALH

Test 1: uops

Code:

  swpalh w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
720053569820051200420001178820002000400012000
720043465420011200020001180220002000400012000
720043480720011200020001179220002000400012000
720043423620011200020001178420002000400012000
720043412420011200020001178420002000400012000
720043412520011200020001178420002000400012000
720043412720011200020001178420002000400012000
720043412620011200020001178420002000400012000
720043412620011200020001178420002000400112000
720043415920011200020001178220002000400012000

Test 2: throughput

Code:

  swpalh w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0057

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30206605083015710118200391011920024352881257913013410210200241020240008100012000010100
30204600643010110101200001010120002352491253773010310201200031020140005100012000010100
30204600573010110101200001010120002352491253453010310201200031020140005100012000010100
30204600573010110101200001010120002352491253303010310201200031020140005100012000010100
30204600573010110101200001010120002352491253483010310201200031020140005100012000010100
30204600573010110101200001010120002352491253283010310201200031020140005100012000010100
30204600573010110101200001010120002352491253603010310201200031020140005100012000010100
30204600573010110101200001010120002352491253523010310201200031020140005100012000010100
30204600573010110101200001010120002352491253553010310201200031020140005100012000010100
30205601033014010114200261011420002352491253853010310201200031020140005100012000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0057

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
3002660335300611002720034100272000235069125565030013100212000301002040000100012000010010
300246005430011100112000010010341123051382894401063637793569235699671002140005100012000010010
3002460461300111001120000100102000035050125498030010100202000001002040000100012000010010
3002460057300111001120000100102000035050125519030010100202000001002040000100012000010010
3002460057300111001120000100102000035050125516030010100202000001002040000100012000010010
3002460057300111001120000100102000035050125522030010100202000001002040000100012000010010
3002460057300111001120000100102000035050125519030010100202000001002040000100012000010010
3002460057300111001120000100102002635108125662030049100332002701002040000100012000010010
3002460067300111001120000100102000035050125520030010100202000001002040000100012000010010
3002460057300111001120000100102000035050125508030010100202000001002040000100012000010010

Test 3: throughput

Code:

  swpalh w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 24.0044

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
202052401652012510120024100200043002365791201042002000420040008120000100
202042400442010510120004100200043002365697201042002000420040008120000100
202042400372010510120004100200243002371529201242002002420040008120000100
202042400372010510120004100200043002365697201042002000420040008120000100
202042400372010510120004100200043002365697201042002000420040008120000100
202052400952012510120024100200043002365802201042002000420040008120000100
202042400372010510120004100200043002365697201042002000420040008120000100
202042400372010510120004100200243002365859201242002002420040008120000100
202042400372010510120004100200243002366312201242002002420040008120000100
202042400372010510120004100200043002365697201042002000420040008120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 24.0046

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
2002524015020035110200241002000430236275620014202000420400000120000010
2002424004620011110200001002000030236274920010202000020400000120000010
2002424004620015110200041002000030236274920010202000020400000120000010
29240267814278945082422808460742000030236278120010202000020400480120000010
2002524008020035110200241002000030236274920010202000020400000120000010
2002424004620011110200001002000030236274920010202000020400000120000010
2002424004620011110200001002002630236306120036202002620400000120000010
2002424004620011110200001002000030236274920010202000020400000120000010
2002424004620011110200001002000030236274920010202000020400480120000010
2002524008720035110200241002000030236274920010202000020400000120000010