Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASAB

Test 1: uops

Code:

  casab w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
740043599130011300030001503330001000300010006000130001000
740043490830011300030001503330001000300010006000130001000
740043513530011300030001508730001000300010006000130001000
740043445530011300030001503330001000300010006000130001000
740043436230011300030001503330001000300010006000130001000
740043467730011300030001503630001000300010006000130001000
740043434230011300030001503330001000300010006000130001000
740043423330011300030001503330001000300010006000130001000
740043437230011300030001503330001000300010006000130001000
740043428830011300030001503330001000300010006000130001000

Test 2: throughput

Code:

  casab w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50208702624183611786300501178630003429132798884426920201300032020160006141663000020100
50205701024200811978300301197730003429002798954426820201300032020160006141663000020100
50204700584426714266300011426530003428902799134426820201300032020160006141663000020100
50204700584426714266300011426530003428902799114426820201300032020160006141663000020100
50204700584426714266300011426530003428902799134426820201300032020160006141663000020100
50204700584426714266300011426530003428892799124426820201300032020160006141663000020100
50204700584426714266300011426530003428902799114426820201300032022360072118743000020100
50204700584426714266300011426530003428902799114426820201300032020160006141663000020100
50204700584426714266300011426530003428902799134426820201300032020160006141663000020100
50204700584426714266300011426530003428912799294426820201300032020160006141663000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
500287025741744116963004811696300034267128033304417820021300030200206000001416630000020010
500257010441808117783003011777300004262128020104417520020300000200206000001416630000020010
500247005844176141763000014175300664294428170604426820064300660200426006601418030000020010
500247013744219141903002914187300334270328150504422420042300330200206000001416730000020010
500247016044224141943003014191300664287128189804427220064300660200646013201419630000020010
500247034944352142363011614235302674658929170404268920337303050200216000601416730000020010
500247006044177141773000014175300004266128035204417520020300000200206000001416630000020010
500247021044262142043005814203301324320128343004436420108301320200426006601418330000020010
500247020844265142073005814206301654325128415304441120130301650201526039601183330000020010
500247006444178141783000014175300004263528024004417520020300000200206000001416630000020010

Test 3: throughput

Code:

  casab w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 22.0046

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
4020522010436341631130030321130003845688269861303426810201300030102016000683303000010100
4020422004438431843030001426530003845692269860403426810201300030102126007281253000010100
4020422004438432843130001426530003845692269860403426810201300030102016000683313000010100
4020422004438431843030001426530003845692269859803426810201300030102016000683303000010100
4020422004438431843030001426530036527518269949003273710212300360102126007243443000010100
4020422004438432843130001426530003845692269860403426810201300030102016000683303000010100
4020422004438431843030001426530003845700269860703426810201300030102016000683233000010100
4020422004638412841130001426530003845700269862903426810201300030102016000682973000010100
4020422004638398839730001426530003845700269862903426810201300030102126007263413000010100
4020422004638398839730001426530003845700269862903426810201300030102016000682973000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 22.0044

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
4002722015233380334430036168130003845430269869734178100213000310032600720672130000010010
4002422004638339833930000417530000845409269857834175100203000010020600000832930000010010
4002522007835176514630030258030000845409269857834175100203000010020600000832930000010010
4002422004638339833930000417530000845409269857834175100203000010020600000832930000010010
4002422004638339833930000417530000845409269857834175100203000010020600000832930000010010
4002422004638339833930000417530000845409269857834175100203000010032600720647430000010010
4002422004838339833930000417530000845409269857834175100203000010020600000832930000010010
4002422004638339833930000417530000845409269857834175100203000010020600000832930000010010
4002422004638339833930000417530000845409269857834175100203000010020600000832930000010010
4002422004638339833930000417530003845431269873934178100213000310020600000832830000010010