Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSW (pre-index)

Test 1: uops

Code:

  ldrsw x0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005124520311018101310361000207121738920001000100010001000100110001000
2004107620011001100010001000205821784020001000100010001000100110001000
2004107320011001100010001000211391771620001000100010001000100110001000
2004108320011001100010001000208771793420001000100010001000100110001000
2004108620011001100010001000209991751520001000100010001000100110001000
2004108620011001100010001000210651763420001000100010001000100110001000
2004108120011001100010001000208891798220001000100010001000100110001000
2004108220011001100010001000211081758920001000100010001000100110001000
2004107620011001100010001000210561762320001000100010001000100110001000
2004108420011001100010001000212261758820001000100010001000100110001000

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0276

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020971263501614015610005402471000318517755351605010940212100047022110004400061000040100
5020470153501064010610000401061000318515765351695010940212100047022110004400061000040100
5020470201501064010610000401061000318526295355175010940212100047022110004400061000040100
5020470160501064010610000401061000318526835355385010940212100047022110004400061000040100
5020470178501064010610000401061000318515765351695010940212100047022110004400061000040100
5020470151501064010610000401061001218533735356295015240251100137022110004400061000040100
5020470151501064010610000401061000318515765351695010940212100047022110004400061000040100
5020470151501064010610000401061001318561485366285015340253100157098810118401421000040100
5020470134501064010610000401061000318518465352595010940212100047022110004400061000040100
5020470151501064010610000401061000318515765351695010940212100047022110004400061000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0094

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5003071484500834007610007401901000318529355359615001940032100047004110004400041000040010
5002470119500144001410000400101000018500395349985001040020100007002010000400031000040010
5002470092500134001310000400101000018500395349985001040020100007002010000400031000040010
5002470092500134001310000400101000018500395349985001040020100007010910013400141000040010
5002470132500134001310000400101000018514975354835001040020100007002010000400031000040010
5002470186500134001310000400101000018500395349985001040020100007002010000400031000040010
5002470092500134001310000400101000018500395349985001040020100007002010000400031000040010
5002470092500134001310000400101000018500395349985001040020100007011210015400141000040010
5002470092500134001310000400101000018500395349985001040020100007002010000400031000040010
5002470096500134001310000400101000018506335351965001040020100007002010000400031000040010

Test 3: throughput

Count: 8

Code:

  ldrsw x0, [x6, #8]!
  ldrsw x0, [x7, #8]!
  ldrsw x0, [x8, #8]!
  ldrsw x0, [x9, #8]!
  ldrsw x0, [x10, #8]!
  ldrsw x0, [x11, #8]!
  ldrsw x0, [x12, #8]!
  ldrsw x0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5403

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160204432561601098010980000801128001124048564242016012380212800128021280012800098000080100
160204432241601098010980000801128000924048564259516012180212800128021280012800098000080100
160205435361601828015180031801548001224057864213516012480212800128021280012800098000080100
160204432141601098010980000801128001224048564291716012480212800128021280012800098000080100
160204432171601098010980000801128001124048564233316012380212800128021280012800098000080100
160204432161601078010780000801088001024048564104516012280212800128021280012800098000080100
160204432171601108010980001801128000924048564119816012180212800128021280012800098000080100
160205434761601868015180035801548001224060164424716012480212800128021280012800098000080100
160204432181601098010980000801128000824047364295516011680208800088021280012800098000080100
160204432161601098010980000801128000824047364240616011680208800088029280092800898000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16002944382160326802208010680223800082402156442231600308003280012800208000008000180000080010
16002443216160011800118000080010800002401796450481600108002080000800208000008000180000080010
16002443215160011800118000080010800002401796447371600108002080000800208000008000180000080010
16002543284160092800618003180064800002402726459271600108002080000800208000008000180000080010
16002443215160011800118000080010800002401796445851600108002080000800208000008000180000080010
16002443217160011800118000080010800002401796406141600108002080000800208000008000180000080010
16002443215160011800118000080010800002401796392111600108002080000800208000008000180000080010
16002443215160011800118000080010800002401796423591600108002080000800208000008000180000080010
16002443215160011800118000080010800002401796398941600108002080000800208000008000180000080010
16002543304160094800618003380064800002402726417771600108002080000800208000008000180000080010