Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPL (64-bit)

Test 1: uops

Code:

  swpl x0, x1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
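72005 | 34781 | 2005 | 1 | 2004 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34850 | 2001 | 1 | 2000 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34247 | 2001 | 1 | 2000 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34420 | 2001 | 1 | 2000 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72005 | 34678 | 2003 | 1 | 2002 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34548 | 2001 | 1 | 2000 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34673 | 2001 | 1 | 2000 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34319 | 2001 | 1 | 2000 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34582 | 2001 | 1 | 2000 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 35593 | 2001 | 1 | 2000 | 2000 | 11767 | 2000 | 2000 | 4000 | 1 | 2000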

Test 2: throughput

Code:

  swpl x0, x1, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0057

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
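30209 | 60706 | 30258 | 10154 | 20104 | 10155 | 20004 | 32906 | 133778 | 30106 | 10202 | 20004 | 10202 | 40008 | 10001 | 20000 | 10100
30205 | 60100 | 30162 | 10122 | 20040 | 10122 | 20002 | 32870 | 133659 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100
30204 | 60054 | 30101 | 10101 | 20000 | 10101 | 20002 | 32868 | 133809 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100
30204 | 60054 | 30101 | 10101 | 20000 | 10101 | 20002 | 32871 | 133543 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100
30204 | 60054 | 30101 | 10101 | 20000 | 10101 | 20002 | 32871 | 133631 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100
30204 | 60054 | 30101 | 10101 | 20000 | 10101 | 20002 | 32869 | 133773 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100
30204 | 60054 | 30101 | 10101 | 20000 | 10101 | 20002 | 32871 | 133742 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100
30204 | 60054 | 30101 | 10101 | 20000 | 10101 | 20002 | 32870 | 133710 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100
30204 | 60054 | 30101 | 10101 | 20000 | 10101 | 20002 | 32871 | 133677 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100
30204 | 60054 | 30101 | 10101 | 20000 | 10101 | 20002 | 32871 | 133741 | 30103 | 10201 | 20003 | 10201 | 40005 | 10001 | 20000 | 10100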

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0064

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
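30029 | 60568 | 30151 | 10061 | 20090 | 10061 | 20002 | 32633 | 134047 | 30013 | 10021 | 20003 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60061 | 30011 | 10011 | 20000 | 10010 | 20000 | 32630 | 134050 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60061 | 30011 | 10011 | 20000 | 10010 | 20000 | 32630 | 134060 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60061 | 30011 | 10011 | 20000 | 10010 | 20000 | 32630 | 134036 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60061 | 30011 | 10011 | 20000 | 10010 | 20000 | 32630 | 134062 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60061 | 30011 | 10011 | 20000 | 10010 | 20000 | 32630 | 134046 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60086 | 30013 | 10011 | 20002 | 10012 | 20000 | 32626 | 134232 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60064 | 30011 | 10011 | 20000 | 10010 | 20000 | 32626 | 134089 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60064 | 30011 | 10011 | 20000 | 10010 | 20000 | 32626 | 134103 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60064 | 30011 | 10011 | 20000 | 10010 | 20000 | 32626 | 134067 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010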

Test 3: throughput

Code:

  swpl x0, x1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 9.8242

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
202051007062112810221026101211845041768022212852142374420649396120000100
20204993982071610120615100203175001741853204172002109029249948120000100
202041003682138613821248137212116911767167213513122386233448020120000100
20204997452071711720600116201505051731137202512042037029842736120000100
20204995712057312120452120200465001740424201462002013020241544120000100
20205990902057310120472100202674161734901203672002092020041872120000100
20204986482039710120296100206125001770022207122002202020041344120000100
20204985962025610120155100200375001731736201372002011429442024120000100
20204981532112010121019100207974481728838208972002240220040300120000100
20204962662010210120001100200004501719855201002002000420240184120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0303

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
200251002562151012214981122026511776196220372226114224975612000010
200241001902090114208871321740521774706217512225274224962812000010
200241002392115213211391221547491777843215582424952224868012000010
200241000642066712206551121512531774724215232224688245182012000010
200241003582127012212581121602561775812216152824898224766812000010
200241002132114811211371021265521781128212762223836285200012000010
200241010232167317216561621843511783948218562625708304945612000010
200241007352169513216821221558551780053215713024834305014012000010
200241003062133312213211121639771777942216584025136405044012000010
200241005702154411215331021803551776451218163025544345139212000010