Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWP (64-bit)

Test 1: uops

Code:

  swp x0, x1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
720063452920131201220001177020002000400012000
720043415520011200020001177020002000400012000
720043508920011200020001177220002000400012000
720043426820011200020001177020002000400012000
720043410820011200020001177020002000400012000
720043411720011200020001177020002000400012000
720043412820011200020001177020002000400012000
720043411920011200020001177020002000400012000
720043412220011200020001177020002000400012000
720043410820011200020001177020002000400012000

Test 2: throughput

Code:

  swp x0, x1, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0062

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30209309103033310188201451018820006329181281523010910203200071020240012100022000010100
30204300623010510102200031010220005329111282403010710202200061020240012100022000010100
30204300623010510102200031010220005329111282983010710202200061020240012100022000010100
30204300643010510102200031010220005329111282523010710202200061020240012100022000010100
30204300623010510102200031010220005329111281883010710202200061020240012100022000010100
30204300623010510102200031010220005329111282903010710202200061020240012100022000010100
30204300623010510102200031010220005329111282903010710202200061020240012100022000010100
30204300623010510102200031010220005329081279903010710202200061020240012100022000010100
30204300623010410102200021010220005329111281523010710202200061020240012100022000010100
30204300623010510102200031010220005329111282243010710202200061020240012100022000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0055

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30029307153021910089201301008920005326421291513001710022200061002040000100012000010010
30024300623001110011200001001020000326021291523001010020200001002040000100012000010010
30024300563001110011200001001020000326021290423001010020200001002040000100012000010010
30024300553001110011200001001020000326021290983001010020200001002040000100012000010010
30024300553001110011200001001020000326021290183001010020200001002040000100012000010010
30024300563001110011200001001020000326021291433001010020200001002340013100032000010010
30024300583001410012200021001220000326021291533001010020200001002040000100012000010010
30024300583001110011200001001020000326021291773001010020200001002040000100012000010010
30024300583001110011200001001020000326021290523001010020200001002040000100012000010010
30024300583001110011200001001020000326021292573001010020200001002040000100012000010010

Test 3: throughput

Code:

  swp x0, x1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.4969

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
202061014662017310120072100200104991803853201102002002220040040120000100
202041037642014010120039100200245001867084201242002007220040064120000100
202051048412016810420064104200415001855249201412002011220041184120000100
202041080092015010120049100200565001848888201562002016820040352120000100
202041126402012810120027100200305001939091201302002008820040220120000100
202041041552011310120012100200325011856492201322002009620040752120000100
202041045812014310120042100200195001867192201192002006020040008120000100
202041036342010110120000100200655001852952201652002017220240844220000100
202041044352010410120003100200384101851827201402042009820040104120000100
202041052432011510120014100200444331878740201472062011420040144120000100

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0384

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
2002510070020062112005110200005017791292001020200042040008120000010
2002410008020011112000010200005017790032001020200002040264120000010
2002410041720017112000610200045017857662001420200122040132120000010
2002410042920027112001610200285017840192003820200642040168120000010
2002410044620023112001210200005017790032001020200002040000120000010
2002410007320011112000010200005017790032001020200002040000120000010
2002410007320011112000010200005017790032001020200002040000120000010
2002410007320011112000010200745017812782008420201202040072120000010
2002410024620020112000910200165017841452002620200422040104120000010
2002410015520018112000710200095017786252001920200162040032120000010