Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASPA (32-bit)

Test 1: uops

Code:

  caspa w0, w1, w2, w3, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 6.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7600734882300713006300001100130002000400020007000130003000
7600434512300113000300001100130002000400020007000130003000
7600434498300113000300001100130002000400020007000130003000
7600434501300113000300001100130002000400020007000130003000
7600434441300113000300001100130002000400020007000130003000
7600434583300113000300001100130002000400020007000130003000
7600534622300413003300001100730002000400020027007130003000
7600434444300113000300001100130002000400020007000130003000
7600434535300113000300001100130002000400020007000130003000
7600434496300113000300001100130002000400020007000130003000

Test 2: throughput

Code:

  caspa w0, w1, w2, w3, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 17.0055

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
70220170603511952110430091210513013569302646954530563033640180303037023802280230000040100
70204170253529352287630059228713006968942645632529413026840092302687016102278030000040100
70204170492530422292530117229203000368665644671528303020240004302687016102277130000040100
70204173441542262344230784234373010268345646032527513030140136302027000702273230000040100
70204170052528332283230001228273000368665644671528303020240004302027000702273230000040100
70204170052528332283230001228273000368665644671528303020240004302027000702273230000040100
70204170052528332283230001228273000368665644671528303020240004302027000702273230000040100
70205170141518162178630030217813003664303645134513743023540048302027000702273230000040100
70204170052528332283230001228273000368665644685528303020240004302027000702273230000040100
70205170113522462221630030222113000369929645827528303020240004302027000702273230000040100

1000 unrolls and 10 iterations

Result (median cycles for code): 17.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
7003917059151053209913006220938300036843964479152740300224000430022700072273430000040010
7002517010052782227523003022745300036843964475452740300224000430022700072273430000040010
7002417005852745227443000122737300036843964476852740300224000430022700072273430000040010
7002417005852745227443000122737300036843964475452740300224000430022700072273430000040010
7002417005852745227443000122737300036843964475452740300224000430055700842184430000040010
7002417005852745227443000122737300036843964475452740300224000430022700072273430000040010
7002417005852745227443000122737300036843964475452740300224000430022700072273430000040010
7002417005852745227443000122737300036843964475452740300224000430022700072273430000040010
7002417005852745227443000122737300036843964475452740300224000430022700072273430000040010
7002417005852745227443000122737300036843964475452740300224000430022700072273430000040010

Test 3: throughput

Code:

  caspa w0, w1, w2, w3, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 15.0251

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
602091501625199721959300381103230030178115116943184286820220400402022070070254543000030104
602111502045381123761300501196530078157472416973874144120252401042022070070253223000030104
602061503405551025493300171283630030178560416999884286720220400402022070070253853000030104
602081506285544125422300191283730024178865817042724286020216400322021670056253223000030102
602061506285543925422300171283630024178490516993264286020216400322021670056253853000030102
602091503275284622798300481149530024178490516993264286020216400322021670056253853000030102
602081500525557525556300191283730069163003416985304181420248400922022070070254563000030104
602081500555557525556300191283730030178115716942704286720220400402022070070254563000030104
602081500525557525556300191283730021178234916920784285820214400282022070070254583000030104
602081500545557525556300191283830030178501516992374286820220400402022070070254583000030104

1000 unrolls and 10 iterations

Result (median cycles for code): 15.0054

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
600291501645190121863300381094230027178090116944624277320038400362003870063254523000030014
600281500845547925472300071274730003178136016945514274020022400042002270007254533000030010
600241500485546425463300011273730003178134816945374274020022400042002070000254553000030010
600241500485546525465300001273730000178134816945234273720020400002002070000254553000030010
600241500485546525465300001273730042172956416944434241820048400562003870063254583000030014
600241500485546525465300001273730000178134816945234273720020400002002070000254553000030010
600241500485546525465300001273730000178134816945254273720020400002004870098227133000030012
600281500885547825460300181274730000178177916950694273720020400002002070000254453000030010
600241500805545525455300001273730033171039516947554227020042400442003870063254543000030014
600281500525548425466300181274630027178086516944244277320038400362003870063254563000030014