Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (post-index, 32-bit)

Test 1: uops

Code:

  ldrsb w0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005123120311018101310361000207611753020001000100010001000100110001000
2004106420011001100010001000208941779720001000100010001000100110001000
2004107020011001100010001000210111748620001000100010001000100110001000
2004112120011001100010001000211011753420001000100010001000100110001000
2004107320011001100010001000210601780520001000100010001000100110001000
2004106820011001100010001000211851766020001000100010001000100110001000
2004111620011001100010001000210741744820001000100010001000100110001000
2004112220011001100010001000210081768120001000100010001000100110001000
2004112720011001100010001000211111849420001000100010001000100110001000
2004110020011001100010001000210991737620001000100010001000100110001000

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb w0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0147

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020971254501614015610005402471000318500745345825010940212100047022110004400051000040100
5020470101501044010410000401061000318501185346605010940212100047022110004400041000040100
5020470097501044010410000401061000318501185346605010940212100047022110004400041000040100
5020470097501044010410000401061000318501185346605010940212100047022110004400041000040100
5020470180501044010410000401061000318502805347095010940212100047022110004400041000040100
5020470097501044010410000401061000318501185346605010940212100047022110004400041000040100
5020470097501044010410000401061000318501185346605010940212100047028910013400151000040100
5020470097501044010410000401061000318501185346605010940212100047022110004400041000040100
5020470097501044010410000401061000318501185346605010940212100047022110004400041000040100
5020470097501044010410000401061000318501185346605010940212100047022110004400041000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0110

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5002971260500704006510005401561000318508025352205001940032100047002010000400041000040010
5002570197500274002510002400501000018512545353575001040020100007002010000400031000040010
5002470110500134001310000400101000018505255351245001040020100007002010000400031000040010
5002470110500134001310000400101000018505255351245001040020100007002010000400031000040010
5002470110500134001310000400101000018505255351245001040020100007002010000400031000040010
5002470110500134001310000400101000018505255351245001040020100007002010000400031000040010
5002470110500134001310000400101000018505255351245001040020100007002010000400031000040010
5002470110500134001310000400101000018505255351245001040020100007002010000400031000040010
5002470110500134001310000400101000018505255351245001040020100007010910013400141000040010
5002470110500134001310000400101000018505255351245001040020100007002010000400031000040010

Test 3: throughput

Count: 8

Code:

  ldrsb w0, [x6], #8
  ldrsb w0, [x7], #8
  ldrsb w0, [x8], #8
  ldrsb w0, [x9], #8
  ldrsb w0, [x10], #8
  ldrsb w0, [x11], #8
  ldrsb w0, [x12], #8
  ldrsb w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5404

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209442651604228031280110803158001024061063733616012280212800128021280012800098000080100
160204432351601098010980000801128001124061064380316012380212800128021280012800098000080100
160204432321601088010780001801088001124061064272216012380212800128021280012800098000080100
160204432311601098010980000801128001224061064705016012480212800128021280012800098000080100
160204432311601098010980000801128001124061064051716012380212800128021280012800098000080100
160204432301601078010780000801108001124061064490716012380212800128021280012800098000080100
160204432301601078010780000801108001124061063943516012380212800128021280012800098000080100
160204432311601098010980000801128001124061064320616012380212800128021280012800098000080100
160205433011601818015180030801548001024071462787416012280212800128021280012800098000080100
160204432371601098010980000801128001124061064663116012380212800128021280012800098000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5403

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029447361603448022280122802258001224021564602716003480032800128002080000800018000080010
160024432161600118001180000800108000024017964742416001080020800008007480054800518000080010
160024432151600118001180000800108000024017964419116001080020800008002080000800018000080010
160024432151600118001180000800108000024017964230016001080020800008002080000800018000080010
160024432151600118001180000800108000024017964068616001080020800008002080000800018000080010
160024432151600118001180000800108000024017964359216001080020800008002080000800018000080010
160024432151600118001180000800108000024017964359516001080020800008002080000800018000080010
160024432191600118001180000800108000024017964144516001080020800008002080000800018000080010
160024432151600118001180000800108000024017964549416001080020800008002080000800018000080010
160024432151600118001180000800108000024017964394516001080020800008002080000800018000080010