Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASP (32-bit)

Test 1: uops

Code:

  casp w0, w1, w2, w3, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 6.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
760113474330191301830001100030002000400020007000130003000
760043449230011300030001100030002000400020007000130003000
760043441930011300030001100030002000400020007000130003000
760043442030011300030001100030002000400020007000130003000
760043442630011300030001100030002000400020007000130003000
760043442330011300030001100030002000400020007000130003000
760043441430011300030001100030002000400020007000130003000
760043440030011300030031101230032002400420007000130003000
760043464530011300030001100030002000400020007000130003000
760043441830011300030001100030002000400020007000130003000

Test 2: throughput

Code:

  casp w0, w1, w2, w3, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 17.0061

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7022517111954074239423013223895300027554819970055505630202400033020270004249603000040100
7020417005955060250603000025054300027554819971105505630202400033020270004249603000040100
7020417005855060250603000025054300027557819971045505630202400033023870088240903000040100
7020417005555060250603000025054300027554819970325505630202400033020270004249623000040100
7020417010055060250603000025054300027554819971565505630202400033020270004249603000040100
7020417005855060250603000025054300027554819970875505630202400033020270004249603000040100
7020417005855060250603000025054300027554819970315505630202400033020270004249603000040100
7020417005855060250603000025054300027554819970315505630202400033020270004249603000040100
7020417005955060250603000025054300387573419971715511630238400513020270004249603000040100
7020417006055060250603000025054300027554819970185505630202400033020270004249603000040100

1000 unrolls and 10 iterations

Result (median cycles for code): 17.0055

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7004517080354026238993012723865300027544619995685500630022400033002070000249993000040010
7002417005555009250093000025002300007538319994645500230020400003002070000249993000040010
7002417005555009250093000025002300367195019991275384830056400483002070000249993000040010
7002417005555009250093000025002300007538319994645500230020400003002070000249993000040010
7002417005555011250113000025002300007538319994645500230020400003002070000249993000040010
7002417005555009250093000025002300007538319994645500230020400003005870088245503000040010
7002417006155011250113000025002300007538319994645500230020400003002070000249993000040010
7002417005555009250093000025002300007538319994645500230020400003002070000249993000040010
7002417005555009250093000025002300007538319994645500230020400003002070000249993000040010
7002417005555009250093000025002300387232619990575398430058400513002070000249993000040010

Test 3: throughput

Code:

  casp w0, w1, w2, w3, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 17.0056

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
701971702271117905733154459577201044203466221129873164884702161392287038624423506017730000040062
701881702071100915726752824592981043603529081135269165000701761391487018224352505936730000040092
701961700841138795966754212604761046703528841128589165742703821395627038224423106056530000040092
701961700841155776086554712610721049703453001118027166242705821399627053024474305955130000040082
701891701271142455949754748597061046703480241124889165736703821395627038224423106055930000040092
701961700841155716085954712610661049703452841117989166242705821399627058224493106096530000040092
701961700841162776126555012612721049703452841117989166242705821399627058224493106096530000040092
701961700841162776126555012612721049703452841117989166242705821399627061824503805905630000040084
701961700841162776126555012612721049703452841117989166242705821399627058224493106096530000040092
701961700841162776126555012612721049703452841117989166242705821399627058224493106096530000040092

1000 unrolls and 10 iterations

Result (median cycles for code): 17.0082

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
70016170084116905618935501261902104890345566111850116674269988139854700162448286183630000039998
70014170045116847618695497861866104992342104112016116543470060139992699882447366173830000040000
70014170081116607616945491361754104891345796112047916666569990139856700102448136182630000040000
70014170085116831618565497561872104952346254111969116682670036139940700362448806181230000040000
70014170082116883618925499161890104948345972111919016683870028139932700602449826181030000039992
70014170085116829618545497561872104952346254111969116682670036139940700362448806181230000040000
70014170082116831618565497561872104921346224111978916679370010139896700362448806181230000040000
70014170082116883618925499161890104948346134111944116683870028139932700282448766185830000040000
70014170081116888618885500061890105016343864111968116613470076140024700082448066181430000040000
70014170081116814618445497061866104921346224111975716679370010139896700102448136182230000040000