Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRB (pre-index)

Test 1: uops

Code:

  ldrb w0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005129120401021101910421000206921757120001000100010001000100110001000
2004109320011001100010001000208401753520001000100010001000100110001000
2004107020011001100010001000206471781120001000100010001000100110001000
2004108120011001100010001000209681747820001000100010001000100110001000
2004106820011001100010001000210081743420001000100010001000100110001000
2004108120011001100010001000213231770520001000100010001000100110001000
2004112120011001100010001000213361766520001000100010001000100110001000
2004108320011001100010001000211081744120001000100010001000100110001000
2004109920011001100010001000212841771420001000100010001000100110001000
2004110820011001100010001000204171803120001000100010001000100110001000

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrb w0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0145

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020971225501614015610005402471001218523305368645015240251100137022110003400101000040100
5020470165501094010910000401061000318516035351535010940212100047022110004400081000040100
5020470161501094010910000401061000318516575351705010940212100047022110004400081000040100
5020470162501094010910000401061000318516575351715010940212100047022110004400081000040100
5020470156501094010910000401061000318519815352775010940212100047022110004400081000040100
5020470179501094010910000401061000318516305351605010940212100047022110004400081000040100
5020470145501094010910000401061000318512255350275010940212100047022110004400081000040100
5020470145501094010910000401061001318543395359525015540254100147022110004400081000040100
5020470145501094010910000401061000318512255350275010940212100047022110004400081000040100
5020470145501094010910000401061000318512255350275010940212100047022110004400081000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0105

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5002971253500684006310005401561000318515365353565001940032100047004110004400041000040010
5002470105500144001410000400101000018503905351015001040020100007002010000400041000040010
5002470105500144001410000400101000018503905351015001040020100007002010000400041000040010
5002470105500144001410000400101000018503905351015001040020100007002010000400041000040010
5002470105500144001410000400101000018503905351015001040020100007002010000400041000040010
5002470189500144001410000400101000018509845352995001040020100007002010000400041000040010
5002470105500144001410000400101000018503905351015001040020100007002010000400041000040010
5002470105500144001410000400101000018503905351015001040020100007011210015400171000040010
5002470107500144001410000400101000018503905351015001040020100007002010000400041000040010
5002470105500144001410000400101000018503905351015001040020100007002010000400041000040010

Test 3: throughput

Count: 8

Code:

  ldrb w0, [x6, #8]!
  ldrb w0, [x7, #8]!
  ldrb w0, [x8, #8]!
  ldrb w0, [x9, #8]!
  ldrb w0, [x10, #8]!
  ldrb w0, [x11, #8]!
  ldrb w0, [x12, #8]!
  ldrb w0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5403

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209444771604258031280113803158001024064264256616012280212800128021280012800098000080100
160204432251601098010980000801128001124052964347316012380212800128025380053800508000080100
160204432431601118011080001801138000824070263452216011680208800088021280012800098000080100
160204432291601098010980000801128000924052964366616012180212800128021280012800098000080100
160204432241601098010980000801128005424109862063016020880254800548021080010800078000080100
160204432251601098010980000801128001124056264347716012380212800128021280012800098000080100
160204432261601118010980002801128000924052364394916011980210800108021280012800098000080100
160204432251601098010980000801128001124061164326616012380212800128021280012800098000080100
160204432241601098010980000801128001024052963878116012280212800128021280012800098000080100
160204432241601098010980000801128001124052964590616012380212800128021280012800098000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5404

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029444591603308022180109802248001124034064270116003380032800128002080000800018000080010
160024432291600118001180000800108000024030464574716001080020800008002080000800018000080010
160024432301600118001180000800108000024030464010316001080020800008002080000800018000080010
160024432301600118001180000800108000024030464249516001080020800008002080000800018000080010
160024432301600118001180000800108000024030464302816001080020800008007380053800508000080010
160024432301600118001180000800108000024030464196816001080020800008002080000800018000080010
160024432301600118001180000800108000024030464657416001080020800008002080000800018000080010
160024432311600118001180000800108000024030464622716001080020800008002080000800018000080010
160024432301600118001180000800108000024030464526016001080020800008002080000800018000080010
160024432301600118001180000800108000024030464335916001080020800008002080000800018000080010