Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (post-index, 32-bit)

Test 1: uops

Code:

  ldrsh w0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005138220391020101910421000210931783820001000100010001000100110001000
2004107120011001100010001000208821825420001000100010001000100110001000
2004106820011001100010001000213731775520001000100010001000100110001000
2004108120011001100010001000213391778620001000100010001000100110001000
2004108420011001100010001000213921768220001000100010001000100110001000
2004108220011001100010001000212721771920001000100010001000100110001000
2004108020011001100010001000213471771520001000100010001000100110001000
2004108020011001100010001000213471773320001000100010001000100110001000
2004108320011001100010001000216401766820001000100010001000100110001000
2004108320011001100010001000213991772020001000100010001000100110001000

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0115

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020971088501584015310005402471000318516085351025010940212100047022110004400041000040100
5020470117501044010410000401061000318505285347665010940212100047022110004400041000040100
5020470115501044010410000401061000318506045348225010940212100047022110004400041000040100
5020470115501044010410000401061000318506045348225010940212100047022110004400041000040100
5020470115501044010410000401061000318506045348225010940212100047022110004400041000040100
5020570194501174011510002401401000318511175349875010940212100047022110004400041000040100
5020470115501044010410000401061000318505505348065010940212100047022110004400041000040100
5020470115501044010410000401061000318506045348225010940212100047022110004400041000040100
5020470115501044010410000401061000318510635349635010940212100047022110004400041000040100
5020470115501044010410000401061000318506045348225010940212100047022110004400041000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0104

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5002971631500694006410005401561000318506725351015001940032100047002010000400041000040010
5002470115500144001410000400101001318519145355425006140073100157002010000400041000040010
5002470104500134001310000400101000018503635350705001040020100007002010000400031000040010
5002470104500134001310000400101000018503635350705001040020100007002010000400031000040010
5002470104500134001310000400101000018503635350705001040020100007002010000400031000040010
5002470104500134001310000400101000018503635350705001040020100007002010000400031000040010
5002470104500134001310000400101000018503635350705001040020100007002010000400031000040010
5002470104500134001310000400101000018503635350705001040020100007002010000400031000040010
5002470104500134001310000400101000018503635350705001040020100007002010000400031000040010
5002570220500264002410002400501000018511735353175001040020100007002010000400031000040010

Test 3: throughput

Count: 8

Code:

  ldrsh w0, [x6], #8
  ldrsh w0, [x7], #8
  ldrsh w0, [x8], #8
  ldrsh w0, [x9], #8
  ldrsh w0, [x10], #8
  ldrsh w0, [x11], #8
  ldrsh w0, [x12], #8
  ldrsh w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5409

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209444021604118031380098803168001224057863964416012480212800128021280012800098000080100
160204432291601128010980003801128000924048562879716012180212800128021280012800098000080100
160204432201601098010980000801128001024048564247016012280212800128021280012800098000080100
160204432171601098010980000801128000824047363609216011680208800088021280012800098000080100
160204432181601098010980000801128001224048564338316012480212800128021280012800098000080100
160204432171601098010980000801128001224048564212216012480212800128021280012800098000080100
160205433251601798014980030801508000924048564103616012180212800128021280012800098000080100
160204432171601098010980000801128000924048563528516012180212800128025480054800518000080100
160204432161601098010980000801128001224048564422416012480212800128020880008800078000080100
160204432171601098010980000801128001024048564580216012280212800128021280012800098000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5405

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029443371603328022180111802228005424092762119316011880074800548003280012800098000080010
160024432451600198001980000800228001024053064297216003280032800128002080000800018000080010
160024432441600118001180000800108000024050964200616001080020800008002080000800018000080010
160024432441600118001180000800108000024047964323516001080020800008002080000800018000080010
160024432441600118001180000800108000024051764097316001080020800008002080000800018000080010
160024432461600118001180000800108000024051464260416001080020800008002080000800018000080010
160024432551600118001180000800108000024051864205616001080020800008002080000800018000080010
160024432441600118001180000800108000024049064347516001080020800008002080000800018000080010
160024432441600118001180000800108000024050464217516001080020800008002080000800018000080010
160024432441600118001180000800108000024050464309016001080020800008002080000800018000080010