Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRB (post-index)

Test 1: uops

Code:

  ldrb w0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

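retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005 | 1295 | 2035 | 1020 | 1015 | 1038 | 1000 | 20690 | 17398 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1079 | 2001 | 1001 | 1000 | 1000 | 1000 | 20723 | 17675 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1076 | 2001 | 1001 | 1000 | 1000 | 1000 | 21097 | 17396 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1062 | 2001 | 1001 | 1000 | 1000 | 1000 | 20987 | 17674 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1075 | 2001 | 1001 | 1000 | 1000 | 1000 | 21160 | 17701 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1081 | 2001 | 1001 | 1000 | 1000 | 1000 | 21097 | 17631 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1078 | 2001 | 1001 | 1000 | 1000 | 1000 | 21270 | 17396 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1086 | 2001 | 1001 | 1000 | 1000 | 1000 | 21410 | 17637 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1078 | 2001 | 1001 | 1000 | 1000 | 1000 | 21408 | 17613 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1096 | 2001 | 1001 | 1000 | 1000 | 1000 | 21193 | 17402 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000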

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrb w0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0281

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50209 | 71240 | 50160 | 40155 | 10005 | 40247 | 10002 | 1851905 | 535135 | 50108 | 40211 | 10003 | 70221 | 10004 | 40007 | 10000 | 40100
50204 | 70160 | 50107 | 40107 | 10000 | 40106 | 10003 | 1851576 | 535121 | 50109 | 40212 | 10004 | 70221 | 10004 | 40006 | 10000 | 40100
50205 | 70264 | 50122 | 40120 | 10002 | 40140 | 10003 | 1852521 | 535410 | 50109 | 40212 | 10004 | 70221 | 10004 | 40007 | 10000 | 40100
50204 | 70158 | 50107 | 40107 | 10000 | 40106 | 10003 | 1851765 | 535170 | 50109 | 40212 | 10004 | 70221 | 10004 | 40007 | 10000 | 40100
50204 | 70158 | 50107 | 40107 | 10000 | 40106 | 10003 | 1851765 | 535170 | 50109 | 40212 | 10004 | 70221 | 10004 | 40007 | 10000 | 40100
50204 | 70158 | 50107 | 40107 | 10000 | 40106 | 10003 | 1851765 | 535170 | 50109 | 40212 | 10004 | 70221 | 10004 | 40007 | 10000 | 40100
50204 | 70158 | 50107 | 40107 | 10000 | 40106 | 10012 | 1854101 | 535800 | 50152 | 40251 | 10013 | 70221 | 10004 | 40007 | 10000 | 40100
50204 | 70172 | 50107 | 40107 | 10000 | 40106 | 10003 | 1851765 | 535170 | 50109 | 40212 | 10004 | 70221 | 10004 | 40007 | 10000 | 40100
50204 | 70160 | 50107 | 40107 | 10000 | 40106 | 10003 | 1851765 | 535170 | 50109 | 40212 | 10004 | 70221 | 10004 | 40007 | 10000 | 40100
50204 | 70158 | 50107 | 40107 | 10000 | 40106 | 10003 | 1851765 | 535170 | 50109 | 40212 | 10004 | 70292 | 10015 | 40018 | 10000 | 40100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0117

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
50029 | 71271 | 50070 | 40065 | 10005 | 40156 | 10003 | 1850748 | 535202 | 50019 | 40032 | 10004 | 70020 | 10000 | 40004 | 10000 | 0 | 40010
50024 | 70117 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850606 | 535137 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 0 | 40010
50024 | 70113 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850606 | 535137 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 0 | 40010
50025 | 70196 | 50027 | 40025 | 10002 | 40048 | 10000 | 1851308 | 535353 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 0 | 40010
50024 | 70113 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850606 | 535137 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 0 | 40010
50024 | 70113 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850606 | 535137 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 0 | 40010
50024 | 70113 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850687 | 535164 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 0 | 40010
50024 | 70132 | 50014 | 40014 | 10000 | 40010 | 10000 | 1852307 | 535679 | 50010 | 40020 | 10000 | 71708 | 11182 | 40708 | 10596 | 3 | 40940
50024 | 70158 | 50014 | 40014 | 10000 | 40010 | 10000 | 1852010 | 535576 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 0 | 40010
50024 | 70117 | 50014 | 40014 | 10000 | 40010 | 10000 | 1850714 | 535165 | 50010 | 40020 | 10000 | 70020 | 10000 | 40004 | 10000 | 0 | 40010

Test 3: throughput

Count: 8

Code:

  ldrb w0, [x6], #8
  ldrb w0, [x7], #8
  ldrb w0, [x8], #8
  ldrb w0, [x9], #8
  ldrb w0, [x10], #8
  ldrb w0, [x11], #8
  ldrb w0, [x12], #8
  ldrb w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5406

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209 | 44208 | 160419 | 80311 | 80108 | 80312 | 80011 | 240818 | 637424 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43248 | 160109 | 80109 | 80000 | 80112 | 80008 | 240791 | 644157 | 160116 | 80208 | 80008 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43246 | 160109 | 80109 | 80000 | 80112 | 80010 | 240848 | 638999 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43248 | 160109 | 80109 | 80000 | 80112 | 80012 | 240805 | 644341 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43246 | 160109 | 80109 | 80000 | 80112 | 80012 | 240796 | 642886 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160205 | 43324 | 160182 | 80151 | 80031 | 80154 | 80010 | 240780 | 645939 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43247 | 160109 | 80109 | 80000 | 80112 | 80011 | 240823 | 636376 | 160123 | 80212 | 80012 | 80254 | 80054 | 80051 | 80000 | 80100
160204 | 43246 | 160109 | 80109 | 80000 | 80112 | 80012 | 240844 | 641852 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43250 | 160109 | 80109 | 80000 | 80112 | 80012 | 240808 | 645151 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43247 | 160109 | 80109 | 80000 | 80112 | 80012 | 240809 | 636939 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5404

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029 | 44476 | 160334 | 80223 | 80111 | 80226 | 80011 | 240343 | 644448 | 160034 | 80033 | 80013 | 80032 | 80012 | 80009 | 80000 | 80010
160024 | 43228 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 644729 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43354 | 160022 | 80019 | 80003 | 80022 | 80010 | 240353 | 644232 | 160032 | 80032 | 80012 | 80032 | 80012 | 80009 | 80000 | 80010
160024 | 43233 | 160019 | 80019 | 80000 | 80022 | 80010 | 240340 | 644287 | 160032 | 80032 | 80012 | 80032 | 80012 | 80009 | 80000 | 80010
160024 | 43230 | 160019 | 80019 | 80000 | 80022 | 80010 | 240383 | 646098 | 160032 | 80032 | 80012 | 80032 | 80012 | 80009 | 80000 | 80010
160024 | 43232 | 160019 | 80019 | 80000 | 80022 | 80000 | 240304 | 641329 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 645105 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 645847 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 644358 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 641728 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010