Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPAL (64-bit)

Test 1: uops

Code:

  swpal x0, x1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

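retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
72005 | 37745 | 2005 | 1 | 2004 | 2002 | 11802 | 2002 | 2002 | 4000 | 1 | 2000
72004 | 34378 | 2001 | 1 | 2000 | 2000 | 11772 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34457 | 2001 | 1 | 2000 | 2000 | 11772 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34266 | 2001 | 1 | 2000 | 2000 | 11772 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34613 | 2001 | 1 | 2000 | 2000 | 11772 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34148 | 2001 | 1 | 2000 | 2000 | 11772 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34153 | 2001 | 1 | 2000 | 2000 | 11772 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34168 | 2001 | 1 | 2000 | 2000 | 11772 | 2000 | 2000 | 4000 | 1 | 2000
72004 | 34146 | 2001 | 1 | 2000 | 2000 | 11772 | 2000 | 2000 | 4004 | 1 | 2000
72004 | 34308 | 2001 | 1 | 2000 | 2000 | 11772 | 2000 | 2000 | 4000 | 1 | 2000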

Test 2: throughput

Code:

  swpal x0, x1, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0061

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
30209 | 60700 | 30222 | 10139 | 20083 | 10140 | 20004 | 35252 | 126821 | 30106 | 10202 | 20004 | 10202 | 40008 | 0 | 10001 | 20000 | 0 | 10100
30204 | 60061 | 30101 | 10101 | 20000 | 10101 | 20002 | 35249 | 126953 | 30103 | 10201 | 20003 | 10201 | 40005 | 0 | 10001 | 20000 | 0 | 10100
30204 | 60061 | 30103 | 10101 | 20002 | 10102 | 20002 | 35249 | 126938 | 30103 | 10201 | 20003 | 10201 | 40005 | 0 | 10001 | 20000 | 0 | 10100
30204 | 60061 | 30101 | 10101 | 20000 | 10101 | 20002 | 35249 | 126922 | 30103 | 10201 | 20003 | 10201 | 40005 | 0 | 10001 | 20000 | 0 | 10100
30204 | 60061 | 30101 | 10101 | 20000 | 10101 | 20002 | 35249 | 126944 | 30103 | 10201 | 20003 | 10201 | 40005 | 0 | 10001 | 20000 | 0 | 10100
30204 | 60061 | 30101 | 10101 | 20000 | 10101 | 20002 | 35249 | 126829 | 30103 | 10201 | 20003 | 10201 | 40005 | 0 | 10001 | 20000 | 0 | 10100
30204 | 60061 | 30101 | 10101 | 20000 | 10101 | 20002 | 35249 | 126915 | 30103 | 10201 | 20003 | 10201 | 40005 | 0 | 10001 | 20000 | 0 | 10100
30204 | 60061 | 30101 | 10101 | 20000 | 10101 | 20002 | 35249 | 126835 | 30103 | 10201 | 20003 | 10201 | 40005 | 0 | 10001 | 20000 | 0 | 10100
30204 | 60064 | 30101 | 10101 | 20000 | 10101 | 20002 | 35249 | 126878 | 30103 | 10201 | 20003 | 10201 | 40005 | 0 | 10001 | 20000 | 0 | 10100
30204 | 60061 | 30101 | 10101 | 20000 | 10101 | 20002 | 35249 | 126817 | 30103 | 10201 | 20003 | 10201 | 40005 | 0 | 10001 | 20000 | 0 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0057

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30029 | 60746 | 30139 | 10052 | 20087 | 10052 | 20002 | 35069 | 127006 | 30013 | 10021 | 20003 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60057 | 30011 | 10011 | 20000 | 10010 | 20000 | 35050 | 127004 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60057 | 30011 | 10011 | 20000 | 10010 | 20000 | 35050 | 126996 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60057 | 30011 | 10011 | 20000 | 10010 | 20000 | 35050 | 127016 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60057 | 30011 | 10011 | 20000 | 10010 | 20000 | 35050 | 127008 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30025 | 60103 | 30047 | 10023 | 20024 | 10023 | 20000 | 35050 | 127004 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60064 | 30011 | 10011 | 20000 | 10010 | 20000 | 35050 | 126833 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60057 | 30011 | 10011 | 20000 | 10010 | 20000 | 35050 | 127000 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60057 | 30011 | 10011 | 20000 | 10010 | 20000 | 35050 | 127000 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010
30024 | 60057 | 30011 | 10011 | 20000 | 10010 | 20000 | 35050 | 126959 | 30010 | 10020 | 20000 | 10020 | 40000 | 10001 | 20000 | 10010

Test 3: throughput

Code:

  swpal x0, x1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 24.0046

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20205 | 240147 | 20125 | 101 | 20024 | 100 | 20004 | 300 | 2365780 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100
20205 | 240071 | 20125 | 101 | 20024 | 100 | 20004 | 300 | 2365802 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100
20204 | 240045 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2365802 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100
20204 | 240050 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2365802 | 20104 | 200 | 20004 | 200 | 40048 | 1 | 20000 | 100
20204 | 240044 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2365802 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100
20204 | 240044 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2365802 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100
20204 | 240044 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2365881 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100
20204 | 240044 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2365802 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100
20204 | 240044 | 20105 | 101 | 20004 | 100 | 20024 | 300 | 2366309 | 20124 | 200 | 20024 | 200 | 40048 | 1 | 20000 | 100
20204 | 240044 | 20105 | 101 | 20004 | 100 | 20004 | 300 | 2365822 | 20104 | 200 | 20004 | 200 | 40008 | 1 | 20000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 24.0039

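retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20025 | 240144 | 20034 | 11 | 20023 | 10 | 20000 | 30 | 2362738 | 20010 | 20 | 20000 | 20 | 40000 | 1 | 20000 | 10
20024 | 240046 | 20011 | 11 | 20000 | 10 | 20000 | 30 | 2362644 | 20010 | 20 | 20000 | 20 | 40000 | 1 | 20000 | 10
20024 | 240039 | 20011 | 11 | 20000 | 10 | 20044 | 30 | 2363450 | 20054 | 20 | 20044 | 20 | 40000 | 1 | 20000 | 10
20024 | 240039 | 20011 | 11 | 20000 | 10 | 20000 | 30 | 2362644 | 20010 | 20 | 20000 | 20 | 40000 | 1 | 20000 | 10
20024 | 240039 | 20011 | 11 | 20000 | 10 | 20000 | 30 | 2362707 | 20010 | 20 | 20000 | 20 | 40052 | 1 | 20000 | 10
20024 | 240041 | 20011 | 11 | 20000 | 10 | 20000 | 30 | 2362644 | 20010 | 20 | 20000 | 20 | 40000 | 1 | 20000 | 10
20024 | 240039 | 20011 | 11 | 20000 | 10 | 20024 | 30 | 2363169 | 20034 | 20 | 20024 | 20 | 40000 | 1 | 20000 | 10
20024 | 240039 | 20011 | 11 | 20000 | 10 | 20044 | 30 | 2363219 | 20054 | 20 | 20044 | 20 | 40000 | 1 | 20000 | 10
20025 | 240088 | 20035 | 11 | 20024 | 10 | 20000 | 30 | 2362644 | 20010 | 20 | 20000 | 20 | 40000 | 1 | 20000 | 10
20024 | 240039 | 20011 | 11 | 20000 | 10 | 20000 | 30 | 2362644 | 20010 | 20 | 20000 | 20 | 40000 | 1 | 20000 | 10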