Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASAH

Test 1: uops

Code:

  casah w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
740073458830101030090030001503330001000300010006000130001000
740043427930011030000030001503330001000300010006000130001000
740043426930011030000030001503330001000300010006000130001000
740043433830011030000030001503330001000300010006000130001000
740043454130011030000030001503330001000300010006000130001000
740043425930011030000030001503630001000300010006000130001000
740043440630011030000030001503330001000300010006000130001000
740043454630011030000030001503330001000300010006000130001000
740043443530011030000030001503330001000300010006000130001000
740043451930011030000030001503730001000300010006000130001000

Test 2: throughput

Code:

  casah w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
502087024741834117863004811786300034292627998044268202013000320201600061416630000020100
502047006044267142663000114265300034286627987544268202013000320201600061416630000020100
502047005444267142663000114265300034286027989044268202013000320223600721305630000020100
502047005444267142663000114265300034286027989044268202013000320201600061416630000020100
502047006044267142663000114265300034286127990044268202013000320201600061416630000020100
502047005444267142663000114265300034286227989544268202013000320201600061416630000020100
502047006044267142663000114265300034289027992444268202013000320201600061416630000020100
502047005144267142663000114265300034289027992144268202013000320201600061416630000020100
502047005144267142663000114265300034286127980144268202013000320201600061416630000020100
502047005144267142663000114265300034289828005344268202013000320201600061416630000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0051

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
5002870325417441169630048116963000342639280241441782002130003200206000001416730000020010
5002470060441761417630000141753000042613280114441752002030000200206000001416630000020010
5002470054441761417630000141753000042613280114441752002030000200206000001416630000020010
5002470070441761417630000141753000042615280114441752002030000200206000001416630000020010
5002470054441761417630000141753000042613280114441752002030000200206000001416630000020010
5002470054441761417630000141753000042613280114441752002030000200206000001416630000020010
5002470054441761417630000141753000042613280114441752002030000200206000001416630000020010
5002470054441761417630000141753000042613280114441752002030000200206000001416630000020010
5002470054441761417630000141753000042613280114441752002030000200206000001416630000020010
5002470054441761417630000141753000042613280114441752002030000200206000001416630000020010

Test 3: throughput

Code:

  casah w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 22.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
402072201523347134350300361771030003845667269845434268102013000310201600060832830000010100
402042200393839883970300014265030003845667269847634268102013000310201600060832830000010100
402042200883843184300300014265030003845677269850334268102013000310201600060832830000010100
402042200363842984280300014265030036446172269892432334102123003610201600060832830000010100
402042200363842984280300014265030003845667269847634268102013000310201600060832830000010100
402042200363842984280300014265030003845667269847634268102013000310201600060832830000010100
402042200363842984280300014265030003845667269847634268102013000310201600060832830000010100
402042200363842984280300014265030003845667269847634268102013000310201600060832830000010100
402042200363842984280300014265030003845877269923534268102013000310212600720646230000010100
402042200393842984280300014265030003845667269847634268102013000310201600060832830000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 22.0046

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
4002922028533414334430070168230003845466269901934178100213000310020600000832930000010010
4002422005438339833930000417530000845462269900734175100203000010020600000832930000010010
4002422005438339833930000417530036426994269924732150100323003610020600000832930000010010
4002522012034337430730030216430000845462269898534175100203000010020600000832930000010010
4002422005438339833930000417530000845462269900934175100203000010026600360833030000010011
4002422004438340834030000417530000845422269867734175100203000010032600720441130000010010
4002422004538340834030000417530000845422269868134175100203000010020600000833030000010010
4002622007434557452230035227230000845422269867734175100203000010020600000833030000010010
4002422004438340834030000417530000845422269867134175100203000010020600000833030000010010
4002422004438340834030000417530000845422269867734175100203000010032600720638530000010010