Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STXR (64-bit)

Test 1: uops

Code:

  stxr w0, x1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
71005342061003110021000400010001000200011000
71004338661001110001000400010001000200011000
71004338601001110001000400010001000200011000
71004338581001110001000400010001000200011000
71004338601001110001000400010001000200011000
71004338581001110001000400010001000200011000
71004338611001110001000400010001000200011000
71004338581001110001000400010001000200011000
71004338771001110001000400010001000200011000
71004338591001110001000400010001000200011000

Test 2: throughput

Code:

  stxr w0, x1, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.3395

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20209240982038710279101081028010005354932518602011010205100051020520010100041000010100
20204233982010410104100001010510004354732523312010810204100041020420008100031000010100
20204233992010310103100001010410004354732521752010810204100041020420008100031000010100
20204233942010310103100001010410004354732521362010810204100041020420008100031000010100
20204234052010310103100001010410004354732520672010810204100041020420008100031000010100
20204234062010310103100001010410004354732520582010810204100041020420008100031000010100
20204233862010310103100001010410004354732522062010810204100041020420008100031000010100
20204234182010310103100001010410004354732523012010810204100041020420008100031000010100
20204233102010310103100001010410004354732516202010810204100041020420008100031000010100
20204234152010310103100001010410034358092531982016810234100341020420008100031000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.3899

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20029255822024110152100891015710004352482584952001810024100041002520010100041000010010
20024238892001110011100001001010000352352587242001010020100001002020000100011000010010
20024239372001110011100001001010000352352586122001010020100001002020000100011000010010
20024238312001110011100001001010000352352580402001010020100001002020000100011000010010
20024238552001110011100001001010000352352576442001010020100001002020000100011000010010
20024237842001110011100001001010000352352586222001010020100001002020000100011000010010
20024239092001110011100001001010000352352585952001010020100001002020000100011000010010
20024239242001110011100001001010000352352587832001010020100001002020000100011000010010
20024239042001110011100001001010000352352584672001010020100001002020000100011000010010
20024238992001110011100001001010000352352584222001010020100001002020000100011000010010

Test 3: throughput

Code:

  stxr w0, x1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0040

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
1020520519101191011001810010000300528855101002001000420020008110000100
1020430040101011011000010010000300528713101002001000420020008110000100
1020430040101011011000010010000300528713101002001000420020008110000100
1020430040101011011000010010000300528713101002001000420020008110000100
1020430040101011011000010010000300528713101002001000420020008110000100
1020430040101011011000010010000300528713101002001000420020008110000100
1020430048101011011000010010000300528857101002001000420020008110000100
1020430040101011011000010010000300528713101002001000420020008110000100
1020430040101011011000010010000300528713101002001000420020008110000100
1020430040101011011000010010000300528713101002001000420020008110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0047

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
1002520160100291110018101000030528855100102010000202000011000010
1002430047100111110000101000030528855100102010000202000011000010
1002430047100111110000101000030528855100102010000202000011000010
1002430047100111110000101000030528855100102010000202000011000010
1002430047100111110000101000030528855100102010000202000011000010
1002430047100111110000101000030528855100102010000202000011000010
1002430047100111110000101000030528855100102010000202000011000010
1002430047100111110000101000030528855100102010000202000011000010
1002430047100111110000101000030528855100102010000202000011000010
1002530094100291110018101000030528855100102010000202000011000010