Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDPSW (post-index)

Test 1: uops

Code:

  ldpsw x0, x1, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
3005130020361020101610281000131461448520001000200010002000100110002000
3004108120011001100010001000131751454320001000200010002000100110002000
3004109120011001100010001000132071434220001000200010002000100110002000
3004107420011001100010001000134691463420001000200010002000100110002000
3004108920011001100010001000133081455420001000200010002000100110002000
3004108920011001100010001000132941473620001000200010002000100110002000
3004107520011001100010001000132241455020001000200010002000100110002000
3004108620011001100010001000133011449620001000200010002000100110002000
3004110520011001100010001000133371450520001000200010002000100110002000
3004111020011001100010001000131781445720001000200010002000100110002000

Test 2: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldpsw x0, x1, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0106

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6020971243502624025710005403481000318505015490395020940212200067022120008401031000050100
6020470115502044020410000402061000318500965489105020940212200087022120008401031000050100
6020470106502034020310000402061000318502305489855020940212200087022120008401031000050100
6020470106502034020310000402061000318530655499205020940212200087022120008401031000050100
6020470106502034020310000402061000318501765489675020940212200087022120008401031000050100
6020470106502034020310000402061000318501765489675020940212200087022120008401031000050100
6020470106502034020310000402061000318501765489675020940212200087022120008401031000050100
6020470106502034020310000402061000318501765489675020940212200087022120008401031000050100
6020470106502034020310000402061000318501765489675020940212200087022120008401031000050100
6020470106502034020310000402061000318501765489675020940212200087022120008401031000050100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0140

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6002971384500804007510005401661000318504105497845002940032200087002020000400141000050010
6002470104500244002410000400201000018500565496525002040020200007002020000400131000050010
6002470095500234002310000400201000018500565496525002040020200007002020000400131000050010
6002470100500234002310000400201000018500565496525002040020200007002020000400131000050010
6002470092500234002310000400201000018499755496255002040020200007002020000400131000050010
6002470114500234002310000400201001318517645501995007340072200287002020000400141000050010
6002470090500234002310000400201000018499755496255002040020200007002020000400131000050010
6002470091500234002310000400201000018500025496345002040020200007002020000400131000050010
6002470091500234002310000400201000018499755496255002040020200007002020000400131000050010
6002470092500234002310000400201000018499485496165002040020200007002020000400131000050010

Test 3: Latency 2->3 (with chain penalty)

Chain cycles: 3

Code:

  ldpsw x0, x1, [x6], #8
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0107

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6020971235502624025710005403481000318500435488525020940212200067022120008401041000050100
6020470105502044020410000402061000318501235489085020940212200087022120008401041000050100
6020470105502044020410000402061000318502035489655020940212200087022120008401041000050100
6020470105502044020410000402061000318502035489655020940212200087022120008401041000050100
6020470105502044020410000402061000318502035489655020940212200087022120008401041000050100
6020470105502044020410000402061000318502035489655020940212200087029120028401151000050100
6020470105502044020410000402061000318502035489655020940212200087022120008401041000050100
6020470105502044020410000402061000318502035489655020940212200087022120008401041000050100
6020470105502044020410000402061000318502035489655020940212200087022120008401041000050100
6020470105502044020410000402061000318505005490645020940212200087022120008401041000050100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0115

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6002971243500804007510005401661000318506535498695002940032200087002020000400141000050010
6002470113500244002410000400201000018505695498205002040020200007002020000400141000050010
6002470113500244002410000400201000018505695498205002040020200007002020000400141000050010
6002470115500244002410000400201000018523245503885002040020200007011120028400251000050010
6002470139500244002410000400201000018505695498205002040020200007002020000400141000050010
6002470113500244002410000400201000018505695498205002040020200007002020000400141000050010
6002470113500244002410000400201000018505695498205002040020200007011120028400251000050010
6002470115500244002410000400201000018505695498205002040020200007002020000400141000050010
6002470113500244002410000400201000018505695498205002040020200007002020000400141000050010
6002470113500244002410000400201000018505695498205002040020200007002020000400141000050010

Test 4: throughput

Count: 8

Code:

  ldpsw x0, x1, [x6], #8
  ldpsw x0, x1, [x7], #8
  ldpsw x0, x1, [x8], #8
  ldpsw x0, x1, [x9], #8
  ldpsw x0, x1, [x10], #8
  ldpsw x0, x1, [x11], #8
  ldpsw x0, x1, [x12], #8
  ldpsw x0, x1, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7520

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240209611961603308024080090802418000924048725133416011780208160018802351600708003380000160100
240204601281601118010680005801088000824047525118616011680208160016802081600168000680000160100
240204601311601118010680005801088000824047525132816011680208160016802081600168000680000160100
240205602131601658013380032801358000824047625144916011680208160016802361600728003480000160100
240204601301601118010680005801088006124088025429116022280261160123802081600168000680000160100
240204601891601118010680005801088000824066625166216011680208160016802081600168000680000160100
240204601651601118010680005801088000824066425166816011680208160016802081600168000680000160100
240204601631601118010680005801088000824068625171316011680208160016802081600168000680000160100
240204601581601118010680005801088000824066525166016011680208160016802081600168000680000160100
240204601641601118010680005801088000824066425169416011680208160016802351600738003380000160100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7511

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240029611521602418014980092801518000824019125133516002680028160016800201600008000180000160010
240024600811600118001180000800108000024016725127816001080020160000800201600008000180000160010
240024600891600118001180000800108000024016725125216001080020160000800201600008000180000160010
240024600831600118001180000800108000024018625124716001080020160000800201600008000180000160010
240024600821600118001180000800108000024016725127816001080020160000800551600728003480000160010
240024600811600118001180000800108003624052125414416008180055160073800201600008000180000160010
240024600821600118001180000800108000024016525126916001080020160000800201600008000180000160010
240024600841600118001180000800108000024016525125416001080020160000800201600008000180000160010
240024600801600118001180000800108000024016725127116001080020160000800201600008000180000160010
240024600831600118001180000800108000024016725125416001080020160000800201600008000180000160010