Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASP (64-bit)

Test 1: uops

Code:

  casp x0, x1, x2, x3, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 6.000

Issues: 3.042

Integer unit issues: 0.001

Load/store unit issues: 3.042

SIMD/FP unit issues: 0.000

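retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
76011 | 36065 | 3043 | 1 | 3042 | 3036 | 11485 | 3036 | 2025 | 4048 | 2026 | 7091 | 1 | 3000 | 3000
76004 | 35971 | 3040 | 1 | 3039 | 3045 | 11878 | 3045 | 2030 | 4060 | 2032 | 7112 | 1 | 3000 | 3000
76004 | 35804 | 3043 | 1 | 3042 | 3033 | 11377 | 3033 | 2025 | 4045 | 2042 | 7147 | 1 | 3000 | 3000
76004 | 36472 | 3049 | 1 | 3048 | 3051 | 11853 | 3051 | 2036 | 4069 | 2038 | 7133 | 1 | 3000 | 3000
76004 | 36574 | 3067 | 1 | 3066 | 3052 | 11795 | 3052 | 2037 | 4070 | 2033 | 7112 | 1 | 3000 | 3000
76004 | 35744 | 3040 | 1 | 3039 | 3048 | 11576 | 3048 | 2032 | 4064 | 2032 | 7112 | 1 | 3000 | 3000
76005 | 36011 | 3049 | 1 | 3048 | 3042 | 11946 | 3042 | 2028 | 4056 | 2034 | 7119 | 1 | 3000 | 3000
76004 | 35952 | 3043 | 1 | 3042 | 3039 | 11570 | 3039 | 2026 | 4052 | 2022 | 7077 | 1 | 3000 | 3000
76004 | 35893 | 3043 | 1 | 3042 | 3046 | 11732 | 3046 | 2032 | 4062 | 2032 | 7105 | 1 | 3000 | 3000
76004 | 35968 | 3043 | 1 | 3042 | 3036 | 11254 | 3036 | 2024 | 4048 | 2029 | 7098 | 1 | 3000 | 3000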

Test 2: throughput

Code:

  casp x0, x1, x2, x3, [x6]
  add x6, x6, 16

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 17.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7023517113654222240873013523892300387552419966825501030238400513027470172238993000040100
7020417007155064250643000025054300387435919972555462630238400513035070328250633000040100
7020617022554353242933006024280300027565419971125505630202400033020270004249643000040100
7020517015154440244113002924400300027565419970885505630202400033020270004249643000040100
7020517015454338243083003024298300027565419970885505630202400033034670340250703000040100
7020417101055515252763023925265300717602519975235517830276400963020270004249643000040100
7020417006555064250643000025054300027565419971375505630202400033020270004249643000040100
7020517011254927248973003024885300027565419971125505630202400033020270004249643000040100
7020517017454757247273003024716300027565419971375505630202400033020270004249643000040100
7020417006155064250643000025054300027565419970885505630202400033020270004249643000040100

1000 unrolls and 10 iterations

Result (median cycles for code): 17.0061

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7005517114354158240173014123839300027551019999605500630022400033002070000250003000040010
7002417006955010250103000025002300007550419995925500230020400003002070000250023000040010
7002417006055010250103000025002300007550419995915500230020400003002070000250003000040010
7002417006555010250103000025002300007550419995925500230020400003005870088245753000040010
7002417008155010250103000025002300387537819995315494430058400513002070000250023000040010
7002417022155069250393003025030300007550019997255500230020400003002070000250053000040010
7002417005255013250133000025002300007551819995275500230020400003002070000250033000040010
7002417006055013250133000025002300007551819994895500230020400003002070000250033000040010
7002417005455013250133000025002300007551819994895500230020400003002070000250033000040010
7002417005255013250133000025002300007551819995755500230020400003002070000250033000040010

Test 3: throughput

Code:

  casp x0, x1, x2, x3, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 17.0045

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7019717022311395858907550515889410492734450411175951662737055413990470574244897609653000040092
7019617004511626061265549956126810495734338611176911662257057413994470366244163586453000040084
7019617004511626061265549956126810495734338611176911662257057413994470574244897609653000040092
7019617004511626061265549956126810495734338611176911662257057413994470574244897609653000040092
7019617004511626061265549956126810495734338611176911662257057413994470574244897609653000040092
7019617004511626061265549956126810500633775811192041639427060814001070574244897609633000040092
7019617004711626061265549956126810495734340411177031662257057413994470574244897609653000040092
7019617004511626061265549956126810495734338611176911662257057413994470574244897609653000040092
7019617004511626061265549956126810495734338611176911662257057413994470610245017596273000040084
7019617004511626061265549956126810495734338611176911662257057413994470574244897609653000040092

1000 unrolls and 10 iterations

Result (median cycles for code): 17.0082

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
700171705891149665990955057598861049483459721119190016683870028139932070028244876618583000040000
700141700811168256185254973618721049523462541119691016682670036139940070036244880618123000040000
700071701691157356072155014607761049383462261119487016680670032139924070036244880618123000040000
700141700811168886188855000618901049483459721119190016683870028139932070028244876618583000040000
700141700811168886188855000618901050203440061119968016609870078140030070028244876618583000040000
700141700811168866188655000618901049483459721119190016683870028139932070022244855685193000040002
700141700821168946189455000618901049483459371119216016683870028139932070010244813618283000040000
700141700821168946189455000618901049483459371119216016683870028139932070028244876618643000040000
700141700821168946189455000618901049213461891119783016679370010139896070072245018603073000039994
700141700821167246178454940618201048613462391121203016663969970139816070032244866617503000040000