Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASPL (32-bit)

Test 1: uops

Code:

  caspl w0, w1, w2, w3, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 6.002

Issues: 3.003

Integer unit issues: 0.001

Load/store unit issues: 3.003

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7601136483303113030130030112500300320024004020007000130003000
7600634964300413003010229155653769855322148017396126623620007000130003000
7600734748300713006030060110270300620044008020027007130003002
7600635118300413003030030110110300320024004020027007130003002
7600635316300413003030030110110300320024004020007000130003000
7600634910300413003030030110110300320024004020027007130003002
7600434653300113000030030110110300320024004020027007130003002
7600635102300413003030030110110300320024004020027007130003002
7600634418300413003030030110110300320024004020027007130003002
7600634884300413003030030110110300320024004020027007130003002

Test 2: throughput

Code:

  caspl w0, w1, w2, w3, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 19.0052

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
702041900525506025060300002505430002755322245234055056302024000303020270004249603000040100
702041900525506025060300002505430038756632244614055096302384005103020270004249603000040100
702041900525506025060300002505430002755322245262055056302024000303024070082249553000040100
702041900535506025060300002505430038721152244686053922302384005103020270004249603000040100
702041900525506025060300002505430002755322245262055056302024000303023870088242023000040100
702041900575506025060300002505430002755322245262055056302024000303020270004249603000040100
702041900525506025060300002505430002755322245262055056302024000303020270004249603000040100
702041900525506025060300002505430002755322245262055056302024000303023870088248973000040100
702041900525506025060300002505430002755322245262055056302024000303020270004249603000040100
702041900525506025060300002505430002755322245262055056302024000303020270004249603000040100

1000 unrolls and 10 iterations

Result (median cycles for code): 19.0052

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
700401905885398323892300912381830002754502249099055006300224000303002070000249993000040010
700241900505500925009300002500230000753662248978055002300204000003005870088246923000040010
765402223025934027440319002715930000754442249110055002300204000003002070000249993000040010
700241900505500925009300002500230000753662248975055002300204000003002070000249993000040010
700241900505500925009300002500230000753662248975055002300204000003002070000249993000040010
700251900955471224682300302467630000754442249094055002300204000003002070000249993000040010
700241900535500925009300002500230000753802249087055002300204000003002070000249993000040010
700241900535500925009300002500230000753662249089055002300204000003005870088245523000040010
700241900505500925009300002500230038724072248359054012300584005103002070000250013000040010
700241900535500925009300002500230000753662248975055002300204000003002070000249993000040010

Test 3: throughput

Code:

  caspl w0, w1, w2, w3, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 19.0094

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7019919026611250057731547695792410468233892413203501655707039213957870402244295603973000040092
7019619010111548660745547416096010496833795513173091662427058213996070582244925607743000040090
7019419009411588161074548076127410467134290513250311657477038413956470384244232603783000040090
7019419009511558660876547106107610475833398713213161645927044413968070388244252605653000040092
7019419009511530060682546186108010497634042413196311660567059213997270592244936603823000040090
7018719015811458959936546536034010497634044013196521660567059213997270384244232605763000040090
7019419009411627761272550056127410501833444813131281662987061814002870386244245605613000040092
7019419009411488460476544086087610466833939113212501657427038213956070582244925609723000040090
7019419009411558460876547086107610467133937513206321657477038413956470572244896609493000040090
7019419009211608861084550046109210496833438413128111662427058213996070384244232605723000040090

1000 unrolls and 10 iterations

Result (median cycles for code): 19.0101

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
7001719022811494859905550435991210495133634813145730166839700301399360700322448966185130000040002
7001619010111689461881550136189210497333602613139960166877700441399660700302448896184930000040002
7001619009811688961879550106189010495333595213139000166843700301399380700302448896184930000040002
7001619009811688961879550106189010495333595213139000166843700301399380700802450586181330000039992
7001619011711688961879550106189010495333595213139000166843700301399380700302448896184930000040002
7001619009811688961879550106189010495333595213139000166843700301399380700302448896184930000040002
7001619009811688961879550106189010495333595213139000166843700301399380700302448896184930000040002
7000919014011595060899550516091810495333595213139000166843700301399380700302448896184930000040002
7001619009811688961879550106189010495333595213139000166843700301399380700302448896184930000040002
7001619009811688961879550106189010502533479613142820166507700801400360708262456396020330285140394