Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPB

Test 1: uops

Code:

  swpb w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
720063486220191201820001177020002000400012000
720043409720011200020001177020002000400012000
720043406920011200020001177020002000400012000
720043407220011200020001177020002000400012000
720043407220011200020021178220022002400012000
720043421220011200020001177020002000400012000
720043406820011200020001177020002000400012000
720043407120011200020001177020002000400012000
720043407520011200020001177020002000400012000
720043406820011200020001177020002000400012000

Test 2: throughput

Code:

  swpb w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0062

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3020630342301951013720058101372000632901125769301091020320007102034001301000320000010100
3020430065301071010320004101032000532870125788301071020220006102024001201000220000010100
3020430058301051010220003101022000532870125708301071020220006102024001201000220000010100
3020430058301051010220003101022000532895125715301071020220006102024001201000220000010100
3020430058301051010220003101022000532870125744301071020220006102024001201000220000010100
3020430058301051010220003101022000532895125743301071020220006102024001201000220000010100
3020430065301051010220003101022000532870125754301071020220006102024001201000220000010100
3020430065301051010220003101022000532870125668301071020220006102024001201000220000010100
3020430065301051010220003101022000532870125686301071020220006102024001201000220000010100
3020430058301051010220003101022000532870125762301071020220006102024001201000220000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0065

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30027304983013810062200761006220005326251264563001710022200061002040000100012000010010
30024300653001110011200001001020000326281272003001010020200001002040000100012000010010
30024300653001110011200001001020000326281265603001010020200001002040000100012000010010
30024300653001110011200001001020000326191269413001010020200001002040000100012000010010
30024300653001110011200001001020000326281263603001010020200001002040000100012000010010
30024300653001110011200001001020000326281266953001010020200001002040000100012000010010
30024300653001110011200001001020000326281263743001010020200001002040000100012000010010
30024300653001110011200001001020000326281265963001010020200001002040000100012000010010
30024300653001110011200001001020000326281266503001010020200001002040000100012000010010
30024300653001110011200001001020000326281271483001010020200001002040000100012000010010

Test 3: throughput

Code:

  swpb w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.5995

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
202061026172017810120077100200005001803287201002002000420040008120000100
202041017922011210120011100200115001809866201112002002220240316120000100
202041059952013010120029100200163501867080201162002003420040560120000100
202041096232012010120019100200245001829620201242002006020040496120000100
202041055812012210120021100200473641857617201492042011420040712120000100
202051071972020410320101102200084471913497201082002002820040096120000100
202041098232012010120019100200414341946942201412002010620240216120000100
202041102962010410120003100200005001985314201002002000420040152120000100
202041062862015810120057100200165001932411201162002004820040304120000100
202041063072014410120043100200155001915486201152002004020040144120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0672

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
2002510015720046112003510200095017786252001920200162040052120000010
2002410025120017112000610200125017829942002220200322040032120000010
2002410032220019112000810200255017826532003520200562040228120000010
2002410047920070112005910200544917850442006420201422040124120000010
2002410014020045112003410200095017833952001920200162040248120000010
2002410069320034112002310200494917842612005920201222040204120000010
2002410067320031112002010200434917895532005320201062040296120000010
2002410041920099112008810200634917858642007320201662040180120000010
2002410058220045112003410201094917971922011920202182040260120000010
2002410088520048112003710200495017925672005920201242040188120000010