Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRH (post-index)

Test 1: uops

Code:

  ldrh w0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

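| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2005 | 1239 | 2040 | 1021 | 1019 | 1042 | 1000 | 20945 | 17520 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1073 | 2001 | 1001 | 1000 | 1000 | 1000 | 21231 | 17625 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1082 | 2001 | 1001 | 1000 | 1000 | 1000 | 21290 | 17983 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1078 | 2001 | 1001 | 1000 | 1000 | 1000 | 21444 | 17625 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1073 | 2001 | 1001 | 1000 | 1000 | 1000 | 21319 | 17857 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1079 | 2001 | 1001 | 1000 | 1000 | 1000 | 20935 | 17986 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1100 | 2001 | 1001 | 1000 | 1000 | 1000 | 21423 | 17626 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1097 | 2001 | 1001 | 1000 | 1000 | 1000 | 21455 | 17626 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1082 | 2001 | 1001 | 1000 | 1000 | 1000 | 21421 | 17605 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |
| 2004 | 1078 | 2001 | 1001 | 1000 | 1000 | 1000 | 20807 | 17828 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000 |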

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrh w0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0109

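| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 50209 | 71643 | 50161 | 40156 | 10005 | 40247 | 10003 | 1850177 | 534646 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100 |
| 50204 | 70104 | 50103 | 40103 | 10000 | 40106 | 10003 | 1849956 | 534620 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100 |
| 50204 | 70092 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850010 | 534635 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100 |
| 50204 | 70090 | 50103 | 40103 | 10000 | 40106 | 10003 | 1849929 | 534611 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100 |
| 50204 | 70094 | 50103 | 40103 | 10000 | 40106 | 10003 | 1849956 | 534620 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100 |
| 50204 | 70094 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850037 | 534647 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100 |
| 50204 | 70093 | 50103 | 40103 | 10000 | 40106 | 10003 | 1849983 | 534628 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100 |
| 50204 | 70091 | 50103 | 40103 | 10000 | 40106 | 10003 | 1849929 | 534611 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100 |
| 50204 | 70091 | 50103 | 40103 | 10000 | 40106 | 10003 | 1849983 | 534627 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100 |
| 50204 | 70119 | 50103 | 40103 | 10000 | 40106 | 10003 | 1850037 | 534646 | 50109 | 40212 | 10004 | 70221 | 10004 | 40003 | 10000 | 40100 |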

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0246

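| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 50029 | 71242 | 50069 | 40064 | 10005 | 40156 | 10003 | 1851887 | 535428 | 50019 | 40032 | 10004 | 70020 | 10000 | 40007 | 10000 | 0 | 40010 |
| 50024 | 70164 | 50017 | 40017 | 10000 | 40010 | 10000 | 1851929 | 535498 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 0 | 40010 |
| 50024 | 70162 | 50017 | 40017 | 10000 | 40010 | 10012 | 1853872 | 536036 | 50062 | 40071 | 10014 | 70020 | 10000 | 40007 | 10000 | 0 | 40010 |
| 50025 | 70248 | 50030 | 40028 | 10002 | 40050 | 10012 | 1856027 | 536734 | 50061 | 40071 | 10013 | 70020 | 10000 | 40007 | 10000 | 0 | 40010 |
| 50024 | 70162 | 50017 | 40017 | 10000 | 40010 | 10000 | 1851929 | 535498 | 50010 | 40020 | 10000 | 70109 | 10013 | 40020 | 10000 | 0 | 40010 |
| 50024 | 70185 | 50017 | 40017 | 10000 | 40010 | 10000 | 1852577 | 535703 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 0 | 40010 |
| 50024 | 70175 | 50017 | 40017 | 10000 | 40010 | 10000 | 1852199 | 535584 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 0 | 40010 |
| 50024 | 70185 | 50017 | 40017 | 10000 | 40010 | 10000 | 1852496 | 535676 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 0 | 40010 |
| 50024 | 70169 | 50017 | 40017 | 10000 | 40010 | 10000 | 1852280 | 535610 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 0 | 40010 |
| 50024 | 70188 | 50017 | 40017 | 10000 | 40010 | 10000 | 1852550 | 535690 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 0 | 40010 |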

Test 3: throughput

Count: 8

Code:

  ldrh w0, [x6], #8
  ldrh w0, [x7], #8
  ldrh w0, [x8], #8
  ldrh w0, [x9], #8
  ldrh w0, [x10], #8
  ldrh w0, [x11], #8
  ldrh w0, [x12], #8
  ldrh w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5402

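| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 160209 | 44740 | 160433 | 80313 | 80120 | 80316 | 80011 | 240578 | 645823 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43224 | 160109 | 80109 | 80000 | 80112 | 80012 | 240485 | 639297 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43215 | 160109 | 80109 | 80000 | 80112 | 80008 | 240485 | 643103 | 160120 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43218 | 160109 | 80109 | 80000 | 80112 | 80012 | 240485 | 646959 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43218 | 160109 | 80109 | 80000 | 80112 | 80012 | 240485 | 643008 | 160124 | 80212 | 80012 | 80254 | 80054 | 80051 | 80000 | 80100 |
| 160204 | 43225 | 160110 | 80109 | 80001 | 80112 | 80010 | 240485 | 645939 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43216 | 160109 | 80109 | 80000 | 80112 | 80010 | 240485 | 638801 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43217 | 160109 | 80109 | 80000 | 80112 | 80012 | 240485 | 636377 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |
| 160204 | 43217 | 160109 | 80109 | 80000 | 80112 | 80010 | 240485 | 642149 | 160122 | 80212 | 80012 | 80208 | 80008 | 80007 | 80000 | 80100 |
| 160204 | 43216 | 160109 | 80109 | 80000 | 80112 | 80012 | 240485 | 642848 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100 |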

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5407

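| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 160030 | 44494 | 160408 | 80261 | 80147 | 80264 | 80012 | 240516 | 644538 | 160034 | 80032 | 80012 | 80032 | 80012 | 80009 | 80000 | 80010 |
| 160024 | 43254 | 160011 | 80011 | 80000 | 80010 | 80000 | 240503 | 645315 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43253 | 160011 | 80011 | 80000 | 80010 | 80000 | 240500 | 641473 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43253 | 160011 | 80011 | 80000 | 80010 | 80000 | 240469 | 642330 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43253 | 160011 | 80011 | 80000 | 80010 | 80000 | 240500 | 643611 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43253 | 160011 | 80011 | 80000 | 80010 | 80000 | 240502 | 646150 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43253 | 160011 | 80011 | 80000 | 80010 | 80000 | 240495 | 640490 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43253 | 160011 | 80011 | 80000 | 80010 | 80000 | 240500 | 641567 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43253 | 160011 | 80011 | 80000 | 80010 | 80000 | 240496 | 643421 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |
| 160024 | 43253 | 160011 | 80011 | 80000 | 80010 | 80000 | 240480 | 641547 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010 |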