Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (pre-index, S)

Test 1: uops

Code:

  ldp s0, s1, [x6, #0x10]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 3.000

Integer unit issues: 1.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
2005 | 1820 | 3057 | 1027 | 2030 | 1026 | 2000 | 3516 | 24957 | 3000 | 2000 | 2000 | 1001 | 2000
2004 | 1546 | 3001 | 1001 | 2000 | 1000 | 2000 | 3516 | 24696 | 3000 | 2000 | 2000 | 1001 | 2000
2004 | 1517 | 3001 | 1001 | 2000 | 1000 | 2000 | 3516 | 24372 | 3000 | 2000 | 2000 | 1001 | 2000
2004 | 1510 | 3001 | 1001 | 2000 | 1000 | 2000 | 3516 | 24660 | 3000 | 2000 | 2000 | 1001 | 2000
2004 | 1534 | 3001 | 1001 | 2000 | 1000 | 2000 | 3516 | 24525 | 3000 | 2000 | 2000 | 1001 | 2000
2004 | 1534 | 3001 | 1001 | 2000 | 1000 | 2000 | 3516 | 24534 | 3000 | 2000 | 2000 | 1001 | 2000
2004 | 1528 | 3001 | 1001 | 2000 | 1000 | 2000 | 3516 | 24390 | 3000 | 2000 | 2000 | 1001 | 2000
2004 | 1507 | 3001 | 1001 | 2000 | 1000 | 2000 | 3516 | 24480 | 3000 | 2000 | 2000 | 1001 | 2000
2004 | 1521 | 3001 | 1001 | 2000 | 1000 | 2000 | 3516 | 24336 | 3000 | 2000 | 2000 | 1001 | 2000
2004 | 1532 | 3001 | 1001 | 2000 | 1000 | 2000 | 3516 | 24723 | 3000 | 2000 | 2000 | 1001 | 2000

Test 2: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldp s0, s1, [x6, #0x10]!
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0118

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60214 | 102616 | 80236 | 50112 | 10104 | 20020 | 40385 | 10089 | 20006 | 2660968 | 1568604 | 789429 | 70113 | 30209 | 20008 | 10003 | 60278 | 20025 | 10013 | 50009 | 20000 | 40100
60204 | 100104 | 80107 | 50101 | 10006 | 20000 | 40104 | 10003 | 20006 | 2661017 | 1568720 | 789476 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100104 | 80107 | 50101 | 10006 | 20000 | 40104 | 10003 | 20006 | 2660967 | 1568612 | 789433 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100104 | 80107 | 50101 | 10006 | 20000 | 40104 | 10003 | 20006 | 2661017 | 1568720 | 789476 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100104 | 80107 | 50101 | 10006 | 20000 | 40104 | 10003 | 20006 | 2661017 | 1568720 | 789476 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100109 | 80107 | 50101 | 10006 | 20000 | 40104 | 10003 | 20006 | 2661017 | 1568720 | 789476 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100104 | 80107 | 50101 | 10006 | 20000 | 40104 | 10003 | 20006 | 2661017 | 1568720 | 789476 | 70113 | 30209 | 20008 | 10003 | 60278 | 20028 | 10013 | 50009 | 20000 | 40100
60204 | 100108 | 80107 | 50101 | 10006 | 20000 | 40104 | 10002 | 20006 | 2661125 | 1568792 | 789511 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100122 | 80107 | 50101 | 10006 | 20000 | 40104 | 10003 | 20024 | 2663533 | 1570282 | 790248 | 70173 | 30239 | 20028 | 10013 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100188 | 80107 | 50101 | 10006 | 20000 | 40104 | 10003 | 20006 | 2661017 | 1568720 | 789476 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0119

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60034 | 102465 | 80143 | 50021 | 10102 | 20020 | 40294 | 10089 | 20024 | 2662305 | 1571030 | 790603 | 70083 | 30059 | 20028 | 10013 | 60038 | 20008 | 10003 | 50001 | 20000 | 40010
60024 | 100086 | 80014 | 50011 | 10003 | 20000 | 40010 | 10000 | 20000 | 2660194 | 1569780 | 789935 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 40010
60024 | 100081 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2660194 | 1569780 | 789935 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 40010
60024 | 100071 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2660194 | 1569780 | 789935 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 40010
60024 | 100071 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2660194 | 1569780 | 789935 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 40010
60024 | 100071 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2660194 | 1569780 | 789935 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 40010
60024 | 100071 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2660194 | 1569780 | 789935 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 40010
60024 | 100071 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2660194 | 1569780 | 789935 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 40010
60025 | 100146 | 80028 | 50019 | 10007 | 20002 | 40046 | 10013 | 20000 | 2660248 | 1569816 | 789953 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 40010
60024 | 100071 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2660194 | 1569780 | 789935 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 40010

Test 3: Latency 2->3 roundtrip

Chain cycles: 3

Code:

  ldp s0, s1, [x6, #0x10]!
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0080

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60214 | 102243 | 80230 | 50112 | 10098 | 20020 | 40385 | 10089 | 20006 | 2660531 | 1568446 | 789313 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100080 | 80104 | 50101 | 10003 | 20000 | 40104 | 10003 | 20006 | 2660126 | 1568190 | 789182 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100071 | 80103 | 50101 | 10002 | 20000 | 40104 | 10003 | 20006 | 2660126 | 1568190 | 789182 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100071 | 80103 | 50101 | 10002 | 20000 | 40104 | 10003 | 20006 | 2660126 | 1568190 | 789182 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60205 | 100199 | 80118 | 50109 | 10007 | 20002 | 40136 | 10013 | 20006 | 2660693 | 1568568 | 789362 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100071 | 80103 | 50101 | 10002 | 20000 | 40104 | 10003 | 20006 | 2660126 | 1568190 | 789182 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100071 | 80103 | 50101 | 10002 | 20000 | 40104 | 10003 | 20006 | 2660126 | 1568190 | 789182 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100071 | 80103 | 50101 | 10002 | 20000 | 40104 | 10003 | 20006 | 2660126 | 1568190 | 789182 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100071 | 80103 | 50101 | 10002 | 20000 | 40104 | 10003 | 20006 | 2660126 | 1568190 | 789182 | 70113 | 30209 | 20008 | 10003 | 60218 | 20008 | 10003 | 50001 | 20000 | 40100
60204 | 100071 | 80103 | 50101 | 10002 | 20000 | 40104 | 10003 | 20006 | 2660126 | 1568190 | 789182 | 70113 | 30209 | 20008 | 10003 | 60278 | 20028 | 10013 | 50009 | 20000 | 40100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0079

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60034 | 102453 | 80141 | 50021 | 10100 | 20020 | 40294 | 10089 | 20006 | 2660081 | 1569752 | 789927 | 70023 | 30029 | 20008 | 10003 | 60020 | 20000 | 10000 | 50001 | 20000 | 0 | 40010
60025 | 100130 | 80028 | 50019 | 10007 | 20002 | 40046 | 10012 | 20000 | 2659870 | 1569572 | 789833 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 0 | 40010
60024 | 100059 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2659870 | 1569572 | 789833 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 0 | 40010
60024 | 100059 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2659870 | 1569572 | 789833 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 0 | 40010
60024 | 100059 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2659870 | 1569572 | 789833 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 0 | 40010
60024 | 100059 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2659870 | 1569572 | 789833 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 0 | 40010
60024 | 100059 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2659870 | 1569572 | 789833 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 0 | 40010
60024 | 100059 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2659870 | 1569572 | 789833 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 0 | 40010
60024 | 100059 | 80013 | 50011 | 10002 | 20000 | 40010 | 10000 | 20000 | 2659870 | 1569572 | 789833 | 70010 | 30020 | 20000 | 10000 | 60098 | 20028 | 10013 | 50009 | 20000 | 0 | 40010
60024 | 100066 | 80014 | 50011 | 10003 | 20000 | 40010 | 10000 | 20000 | 2659870 | 1569572 | 789833 | 70010 | 30020 | 20000 | 10000 | 60020 | 20000 | 10000 | 50001 | 20000 | 0 | 40010

Test 4: throughput

Count: 8

Code:

  ldp s0, s1, [x6, #0x10]!
  ldp s0, s1, [x7, #0x10]!
  ldp s0, s1, [x8, #0x10]!
  ldp s0, s1, [x9, #0x10]!
  ldp s0, s1, [x10, #0x10]!
  ldp s0, s1, [x11, #0x10]!
  ldp s0, s1, [x12, #0x10]!
  ldp s0, s1, [x13, #0x10]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0794

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160214 | 88538 | 240665 | 80365 | 160300 | 80366 | 160005 | 240312 | 1392530 | 240109 | 200 | 160008 | 200 | 160008 | 80003 | 160000 | 100
160204 | 86362 | 240103 | 80103 | 160000 | 80104 | 160004 | 240312 | 1393500 | 240108 | 200 | 160008 | 200 | 160008 | 80003 | 160000 | 100
160204 | 86377 | 240103 | 80103 | 160000 | 80104 | 160004 | 240312 | 1392903 | 240108 | 200 | 160008 | 200 | 160008 | 80003 | 160000 | 100
160204 | 86351 | 240103 | 80103 | 160000 | 80104 | 160004 | 240312 | 1393446 | 240108 | 200 | 160008 | 200 | 160008 | 80003 | 160000 | 100
160205 | 86372 | 240103 | 80103 | 160000 | 80104 | 160005 | 240312 | 1393697 | 240109 | 200 | 160008 | 200 | 160008 | 80003 | 160000 | 100
160204 | 86348 | 240103 | 80103 | 160000 | 80104 | 160004 | 240312 | 1393446 | 240108 | 200 | 160008 | 200 | 160008 | 80003 | 160000 | 100
160204 | 86354 | 240103 | 80103 | 160000 | 80104 | 160005 | 240312 | 1393459 | 240109 | 200 | 160008 | 200 | 160008 | 80003 | 160000 | 100
160204 | 86348 | 240103 | 80103 | 160000 | 80104 | 160004 | 240312 | 1393446 | 240108 | 200 | 160008 | 200 | 160008 | 80003 | 160000 | 100
160204 | 86348 | 240103 | 80103 | 160000 | 80104 | 160004 | 240312 | 1393446 | 240108 | 200 | 160008 | 200 | 160008 | 80003 | 160000 | 100
160204 | 86348 | 240103 | 80103 | 160000 | 80104 | 160004 | 240312 | 1393446 | 240108 | 200 | 160008 | 200 | 160008 | 80003 | 160000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0793

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160034 | 88709 | 240585 | 80285 | 160300 | 80286 | 160010 | 240051 | 1393210 | 240027 | 20 | 160014 | 20 | 160000 | 80001 | 160000 | 10
160024 | 86357 | 240011 | 80011 | 160000 | 80010 | 160000 | 240030 | 1393443 | 240010 | 20 | 160000 | 20 | 160000 | 80001 | 160000 | 10
160024 | 86358 | 240011 | 80011 | 160000 | 80010 | 160000 | 240030 | 1393425 | 240010 | 20 | 160000 | 20 | 160000 | 80001 | 160000 | 10
160024 | 86806 | 240235 | 80115 | 160120 | 80114 | 160000 | 240030 | 1393470 | 240010 | 20 | 160000 | 20 | 160000 | 80001 | 160000 | 10
160024 | 86347 | 240011 | 80011 | 160000 | 80010 | 160000 | 240030 | 1393425 | 240010 | 20 | 160000 | 20 | 160000 | 80001 | 160000 | 10
160024 | 86347 | 240011 | 80011 | 160000 | 80010 | 160000 | 240030 | 1393425 | 240010 | 20 | 160000 | 20 | 160000 | 80001 | 160000 | 10
160024 | 86347 | 240011 | 80011 | 160000 | 80010 | 160000 | 240030 | 1393425 | 240010 | 20 | 160000 | 20 | 160000 | 80001 | 160000 | 10
160024 | 86347 | 240011 | 80011 | 160000 | 80010 | 160000 | 240030 | 1393425 | 240010 | 20 | 160000 | 20 | 160000 | 80001 | 160000 | 10
160024 | 86347 | 240011 | 80011 | 160000 | 80010 | 160000 | 240030 | 1393425 | 240010 | 20 | 160000 | 20 | 160000 | 80001 | 160000 | 10
160024 | 86347 | 240011 | 80011 | 160000 | 80010 | 160000 | 240030 | 1393425 | 240010 | 20 | 160000 | 20 | 160000 | 80001 | 160000 | 10