Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, lsl, Q)

Test 1: uops

Code:

  ldr q0, [x6, x7, lsl #4]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2006704206310331030103210003000418020001000100010002000100110001000
2004555200110011000100010003000418020001000100010002000100110001000
2004555200110011000100010003000418020001000100010002000100110001000
2004555200110011000100010003000418020001000100010002000100110001000
2004555200110011000100010003000418020001000100010002000100110001000
2004555200110011000100010003000418020001000100010002000100110001000
2004555200110011000100010003000418020001000100010002000100110001000
2004555200110011000100010003000418020001000100010002000100110001000
2004555200110011000100010003000418020001000100010002000100110001000
2004555200110011000100010003000418020001000100010002000100110001000

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6, x7, lsl #4]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0043

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6020710019670137501261000910002401791002310003265919597863798389560109402121000310003702212000610003500021000050100
6020410004870104501021000210000401041000210003265904397862198386860110402101000310003702172000610003500021000050100
6020410004170103501021000110000401041000310003265904397862198386860110402101000310003702172000610003500021000050100
6020410004170103501021000110000401041000310003265877097852198376060110402101000310003702172000610003500021000050100
6020410004170103501021000110000401041000310003265904397862198386860110402101000310003702172000610003500021000050100
6020410004170103501021000110000401041000310003265904397862198386860110402101000310003702932002610014500171000050100
6020410004170103501021000110000401041000310003265904397862198386860110402101000310003702172000610003500021000050100
6020410004170103501021000110000401041000310003265904397862198386860110402101000310003702172000610003500021000050100
6020410004170103501021000110000401041000310003265912497865498390160110402101000310003702172000610003500021000050100
6020410004170103501021000110000401041000310003265904397862198386860110402101000310003702172000610003500021000050100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0043

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6002610017170029500211000710001400501001310000266092698017698523760010400201000010000700202000010000500011000050010
6002410005270013500111000210000400101000010000265911797944898451760010400201000010000700202000010000500011000050010
6002410004170012500111000110000400101000010000265911797944898451760010400201000010000700202000010000500011000050010
6002410004170012500111000110000400101000010000265911797944898451760010400201000010000700202000010000500011000050010
6002410004170012500111000110000400101000010000265911797944898451760010400201000010000700202000010000500011000050010
6002410004170012500111000110000400101000010000265911797944898451760010400201000010000700202000010000500011000050010
6002410004170012500111000110000400101000010000265911797944898451760010400201000010000701072002610013500171000050010
6002410004570012500111000110000400101000010013265949097958598465660080400731001310014700202000010000500011000050010
6002410004670012500111000110000400101000010000265911797944898451760010400201000010000700202000010000500011000050010
6002410004170012500111000110000400101000010000265911797944898451760010400201000010000700202000010000500011000050010

Test 3: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6, x7, lsl #4]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 8.0049

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6020611016470110501021000710001401321001210003292930187364487375860109402121000310003702222000810003500011000050100
6020411004770103501011000210000401041000310012292977987385887396260160402531001410013702922002810013500071000050100
6020411006070103501011000210000401041000210012293166987448287456760160402531001410013702222000810003500011000050100
6020411004770103501011000210000401041000310003292939487375387385260110402131000410003702222000810003500011000050100
6020411004770103501011000210000401041000310003292939487375387385260110402131000410003702222000810003500011000050100
6020411004970103501011000210000401041000310003293101487429287435960110402131000410003702902002410013500081000050100
6020411005170103501011000210000401041000310003292935587366087377660109402121000310003702222000810003500011000050100
6020411004970103501011000210000401041000310003292944887376987387060110402131000410003702222000810003500011000050100
6020411004970103501011000210000401041000310003292944887376987387060110402131000410003702222000810003500011000050100
6020411004970103501011000210000401041000310003292944887376987387060110402131000410003702222000810003500011000050100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 8.0042

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6002611016970020500121000710001400421001210000292927387458087463160010400201000010000701122002810013500071000050010
6002411004070012500111000110000400101000010000292927387458087463160010400201000010000700202000010000500011000050010
6002411010370012500111000110000400101000010000292973287472987477860010400201000010000700202000010000500011000050010
6002411004070012500111000110000400101000010000292927387458087463160010400201000010000700202000010000500011000050010
6002411004070012500111000110000400101000010000292927387458087463160010400201000010000700202000010000500011000050010
6002411004070012500111000110000400101000010000292927387458087463160010400201000010000700202000010000500011000050010
6002511007270021500171000310001400411001010000292927387458087463160010400201000010000700202000010000500011000050010
6002411004070012500111000110000400101000010000292927387458087463160010400201000010000700202000010000500011000050010
6002411004070012500111000110000400101000010000292927387458087463160010400201000010000700202000010000500011000050010
6002411004070012500111000110000400101000010000292927387458087463160010400201000010000700202000010000500011000050010

Test 4: throughput

Count: 8

Code:

  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5014

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
1602044043316011780109800088011280012240336326036160124802128001280212160024800098000080100
1602044012916011780109800088011280012240336326918160124802128001280212160024800098000080100
1602044011716011780109800088011280012240336324284160124802128001280212160024800098000080100
1602044011616011780109800088011280012240336320206160124802128001280212160024800098000080100
1602044011616011780109800088011280012240336323264160124802128001280212160024800098000080100
1602044011916011780109800088011280012240336320206160124802128001280212160024800098000080100
1602044011616011780109800088011280012240336320206160124802128001280212160024800098000080100
1602044011616011780109800088011280012240336320206160124802128001280212160024800098000080100
1602044011616011780109800088011280012240336320206160124802128001280212160024800098000080100
1602044011616011780109800088011280012240336320206160124802128001280212160024800098000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5008

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160026406511600898005180038800548001224006632020616003480032800128002016000008000180000080010
160024400771600118001180000800108004224015632032616009480062800428002016000008000180000080010
160024400641600118001180000800108000024003032016216001080020800008002016000008000180000080010
160024400641600118001180000800108000024003032016216001080020800008002016000008000180000080010
160024400641600118001180000800108000024003032016216001080020800008002016000008000180000080010
160024400641600118001180000800108000024003032016216001080020800008002016000008000180000080010
160024400641600118001180000800108000024003032016216001080020800008002016000008000180000080010
160024400751600118001180000800108000024003036945216001080020800008002016000008000180000080010
160024410921600118001180000800108000024003032016216001080020800008002016000008000180000080010
160024400661600118001180000800108000024003032016216001080020800008002016000008000180000080010