Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPH

Test 1: uops

Code:

  swph w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

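retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
72005 | 34428 | 2005 | 1 | 2004 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34206 | 2001 | 1 | 2000 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34169 | 2001 | 1 | 2000 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34158 | 2001 | 1 | 2000 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34344 | 2001 | 1 | 2000 | 2000 | 11841 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34593 | 2001 | 1 | 2000 | 2000 | 11770 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34509 | 2001 | 1 | 2000 | 2000 | 11770 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 35043 | 2001 | 1 | 2000 | 2000 | 11770 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 35013 | 2001 | 1 | 2000 | 2000 | 11770 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34561 | 2001 | 1 | 2000 | 2000 | 11770 | 2000 | 2000 | 4000 | 1 | 2000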

Test 2: throughput

Code:

  swph w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0062

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30206 | 30405 | 30195 | 10137 | 0 | 20058 | 10137 | 0 | 20006 | 32902 | 125763 | 30109 | 10203 | 20007 | 10203 | 40013 | 10003 | 20000 | 10100
30204 | 30065 | 30105 | 10102 | 0 | 20003 | 10102 | 0 | 20005 | 32877 | 125719 | 30107 | 10202 | 20006 | 10202 | 40012 | 10002 | 20000 | 10100
30204 | 30058 | 30105 | 10102 | 0 | 20003 | 10102 | 0 | 20005 | 32877 | 125743 | 30107 | 10202 | 20006 | 10202 | 40012 | 10002 | 20000 | 10100
30204 | 30065 | 30105 | 10102 | 0 | 20003 | 10102 | 0 | 20005 | 32877 | 125657 | 30107 | 10202 | 20006 | 10202 | 40012 | 10002 | 20000 | 10100
30204 | 30665 | 30105 | 10102 | 0 | 20003 | 10102 | 0 | 20005 | 32903 | 125881 | 30107 | 10202 | 20006 | 10202 | 40012 | 10002 | 20000 | 10100
30204 | 30062 | 30105 | 10102 | 0 | 20003 | 10102 | 0 | 20005 | 32906 | 125838 | 30107 | 10202 | 20006 | 10202 | 40012 | 10002 | 20000 | 10100
30204 | 30062 | 30105 | 10102 | 0 | 20003 | 10102 | 0 | 20005 | 32906 | 125998 | 30107 | 10202 | 20006 | 10202 | 40012 | 10002 | 20000 | 10100
30204 | 30062 | 30105 | 10102 | 0 | 20003 | 10102 | 0 | 20005 | 32906 | 125952 | 30107 | 10202 | 20006 | 10202 | 40012 | 10002 | 20000 | 10100
30204 | 30062 | 30105 | 10102 | 0 | 20003 | 10102 | 0 | 20005 | 32906 | 125898 | 30107 | 10202 | 20006 | 10202 | 40012 | 10002 | 20000 | 10100
30204 | 30062 | 30105 | 10102 | 0 | 20003 | 10102 | 0 | 20005 | 32906 | 125942 | 30107 | 10202 | 20006 | 10202 | 40012 | 10002 | 20000 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30026 | 30369 | 30098 | 10045 | 20053 | 10045 | 20005 | 32625 | 127284 | 30017 | 10022 | 20006 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 30065 | 30011 | 10011 | 20000 | 10010 | 20000 | 32601 | 126991 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 30058 | 30011 | 10011 | 20000 | 10010 | 20000 | 32601 | 126343 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 30058 | 30011 | 10011 | 20000 | 10010 | 20000 | 32601 | 126771 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 30058 | 30011 | 10011 | 20000 | 10010 | 20000 | 32594 | 126288 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 30058 | 30011 | 10011 | 20000 | 10010 | 20000 | 32601 | 126573 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 30058 | 30011 | 10011 | 20000 | 10010 | 20040 | 32735 | 127120 | 30070 | 10040 | 20040 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 30058 | 30011 | 10011 | 20000 | 10010 | 20000 | 32601 | 126979 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 30058 | 30011 | 10011 | 20000 | 10010 | 20000 | 32594 | 126448 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 30058 | 30011 | 10011 | 20000 | 10010 | 20000 | 32601 | 126453 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010

Test 3: throughput

Code:

  swph w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.4944

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20206 | 102013 | 20226 | 101 | 20125 | 100 | 20072 | 499 | 1879656 | 20172 | 200 | 20198 | 200 | 40152 | 1 | 20000 | 100
20205 | 107651 | 20200 | 101 | 20099 | 100 | 20009 | 500 | 1948291 | 20109 | 200 | 20028 | 200 | 40040 | 1 | 20000 | 100
20204 | 100057 | 20109 | 101 | 20008 | 100 | 20009 | 500 | 1778625 | 20109 | 200 | 20020 | 200 | 40040 | 1 | 20000 | 100
20204 | 103796 | 20172 | 101 | 20071 | 100 | 20037 | 500 | 1866111 | 20137 | 200 | 20100 | 204 | 40156 | 1 | 20000 | 100
20204 | 104308 | 20101 | 101 | 20000 | 100 | 20032 | 404 | 1857090 | 20132 | 200 | 20092 | 200 | 40008 | 1 | 20000 | 100
20204 | 105066 | 20235 | 101 | 20134 | 100 | 20019 | 400 | 1876779 | 20119 | 200 | 20044 | 200 | 40136 | 1 | 20000 | 100
20204 | 106494 | 20154 | 101 | 20053 | 100 | 20075 | 514 | 1967675 | 20177 | 202 | 20130 | 200 | 40248 | 1 | 20000 | 100
20204 | 105338 | 20116 | 101 | 20015 | 100 | 20209 | 500 | 1847557 | 20309 | 200 | 20552 | 200 | 40360 | 1 | 20000 | 100
20204 | 104001 | 20125 | 101 | 20024 | 100 | 20020 | 417 | 1839194 | 20120 | 200 | 20052 | 204 | 40196 | 1 | 20000 | 100
20204 | 102651 | 20122 | 101 | 20021 | 100 | 20044 | 503 | 1892650 | 20145 | 202 | 20096 | 200 | 40040 | 1 | 20000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0701

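retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
20025 | 100179 | 20045 | 11 | 20034 | 10 | 20009 | 50 | 1778625 | 20019 | 20 | 20020 | 20 | 40032 | 1 | 20000 | 0 | 10
20024 | 100061 | 20019 | 11 | 20008 | 10 | 20013 | 49 | 1781990 | 20023 | 20 | 20026 | 20 | 40044 | 1 | 20000 | 0 | 10
20024 | 100057 | 20019 | 11 | 20008 | 10 | 20009 | 50 | 1778499 | 20019 | 20 | 20016 | 20 | 40032 | 1 | 20000 | 0 | 10
20024 | 100060 | 20019 | 11 | 20008 | 10 | 20013 | 49 | 1782008 | 20023 | 20 | 20026 | 20 | 40052 | 1 | 20000 | 0 | 10
20024 | 100410 | 20017 | 11 | 20006 | 10 | 20029 | 49 | 1784376 | 20039 | 20 | 20070 | 20 | 40032 | 1 | 20000 | 0 | 10
20024 | 100322 | 20019 | 11 | 20008 | 10 | 20091 | 50 | 1788953 | 20101 | 20 | 20152 | 20 | 40324 | 1 | 20000 | 0 | 10
20024 | 100702 | 20075 | 11 | 20064 | 10 | 20024 | 49 | 1790658 | 20034 | 20 | 20066 | 20 | 40188 | 1 | 20000 | 0 | 10
20024 | 100482 | 20031 | 11 | 20020 | 10 | 20053 | 49 | 1782360 | 20063 | 20 | 20134 | 20 | 40224 | 1 | 20000 | 0 | 10
20024 | 100069 | 20019 | 11 | 20008 | 10 | 20009 | 50 | 1778499 | 20019 | 20 | 20016 | 20 | 40032 | 1 | 20000 | 0 | 10
20024 | 100185 | 20031 | 11 | 20020 | 10 | 20012 | 50 | 1782706 | 20022 | 20 | 20032 | 20 | 40072 | 1 | 20000 | 0 | 10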