Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASLH

Test 1: uops

Code:

  caslh w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

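retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
74007 | 34734 | 3013 | 1 | 3012 | 3000 | 15033 | 3000 | 1000 | 3000 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34315 | 3004 | 1 | 3003 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74005 | 34267 | 3001 | 1 | 3000 | 3000 | 15032 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34361 | 3001 | 1 | 3000 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34244 | 3001 | 1 | 3000 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34242 | 3001 | 1 | 3000 | 3003 | 15037 | 3003 | 1001 | 3003 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34415 | 3001 | 1 | 3000 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34243 | 3001 | 1 | 3000 | 3003 | 15037 | 3003 | 1001 | 3003 | 1000 | 6000 | 1 | 3000 | 1000
74005 | 34278 | 3004 | 1 | 3003 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34377 | 3001 | 1 | 3000 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000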

Test 2: throughput

Code:

  caslh w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 9.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
502089026343925138760300491387703000347269103903804572820201300030204416072601578730000020100
502049008245730157290300011572503000347271103914404572820201300030202016000601562630000020100
5643210733051211194585317481935633000347285103915104572820201300030202016000601562630000020100
502049006045727157260300011572503000347295103917804572820201300030202016000601562630000020100
502049006045727157260300011572503000347271103915004572820201300030202016000601562630000020100
502049006045727157260300011572503003943550103905404449420225300390202016000601562630000020100
502049006045727157260300011572503000347271103914304572820201300030202016000601562630000020100
502049006045727157260300011572503007547496103960204583320249300750202016000601562630000020100
502049020845815157540300611575303000347271103914004572820201300030202496015001565630000020100
502049006045727157260300011572503003947365103932604578120225300390202016000601562630000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 9.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
5002890254438301378530045137853000346994103937545638200213000320020600001562630000020010
5002490058456361563630000156353000046994103935945635200203000020020600001562630000020010
5002490051456361563630000156353000046964103924045635200203000020045600781444330000020010
5002490051456361563630000156353000046994103935145635200203000020020600001562630000020010
5002490058456361563630000156353000046964103923745635200203000020020600001562630000020010
5002490051456361563630000156353000046964103923745635200203000020020600001562630000020010
5002490051456361563630000156353000046994103936245635200203000020020600001562630000020010
5002490051456361563630000156353000046964103923345635200203000020020600001562630000020010
5002490051456361563630000156353000046994103934845635200203000020045600781446530000020010
5002590104439021387230030138713000046964103922045635200203000020020600001562630000020010

Test 3: throughput

Code:

  caslh w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 11.5167

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
429361172346753115611519201298279036281483132794396088268417904126829158106200513000012831
429421171097193220268516641707579151277345130110196209269457915926797157420192303000012831
429981141926921718820503971646379738286278129045196832269867974226994159547198983000012896
430001156197101219802512101709479566280184129455996690269927956826735158010197903000012832
429971144197065619667509891718179051277756131915596077268477905927266161121188243000012986
431011126186968519276504091709979757281871123042696939269877976126891158946190053000012895
430011155747009219326507661689080047286727126665697277270908005426904158954190373000012895
430031141726936719102502651688080001273320127517296932271608000826907159025189133000012902
430871128896905518808502471722179351268215129945096445270007940827085158529186743000012897
429781156197079719688511091719679363284745123629796225270087941827427161312191533000012945

1000 unrolls and 10 iterations

Result (median cycles for code): 11.7255

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
428121172226758815568520201297779135275386132901695788264267913826405158149201903000012735
427491173427201420211518031682379051280924132962895874263977905126397158101201883000012735
427491173127200620211517951682379051281041132961695874263977905126397158101201883000012735
427491173127200620211517951682379047281305132937895870263957904726397158089201803000012735
427491173427201420211518031682379047281271132919095870263957904726395158094201903000012735
427491173147200620213517931682379148256885133034095227264337915126491158672201363000012751
427651172517198920157518321682779414282994132274596261265147941626417158175201693000012739
427501173567204820231518171683379067281105132957895900264037906726400158126201933000012736
428101170897231520351519641706280135282622131830397197267598013526759160268203263000012796
428101170907230720348519591706280134282692131843797196267598013526881160988202373000012818