Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASH

Test 1: uops

Code:

  cash w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.001

Issues: 3.003

Integer unit issues: 0.001

Load/store unit issues: 3.003

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
740073451830131301230031503230031001300310016006130001001
740053423430041300330031503230031001300310016006130001001
740053423330041300330031503230031001300310016006130001001
740053423430041300330031503230031001300310016006130001001
740053424830041300330031503230031001300310016006130001001
740053423430041300330031503230031001300310016006130001001
740053423430041300330031503230031001300310016006130001001
740053432430041300330031503230031001300310016006130001001
740053443730041300330001502930001000300010016006130001001
740053437530041300330031503230031001300310016006130001001

Test 2: throughput

Code:

  cash w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0054

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50210703734397713895300821389630003472667909224572820201300032020160006156263000020100
50204700584572715726300011572530003472367908434572820201300032020160006156263000020100
50204700514572715726300011572530003472367908434572820201300032020160006156263000020100
50204700514572715726300011572530003472367908414572820201300032020160006156263000020100
50204700514572715726300011572530039466727908114556520225300392020160006156263000020100
50204700514572715726300011572530003472367908434572820201300032020160006156263000020100
50204700514572715726300011572530003472367908454572820201300032020160006156263000020100
50204700514572715726300011572530003472367908434572820201300032020160006156263000020100
50204700514572715726300011572530003472367908474572820201300032020160006156263000020100
50204700514572715726300011572530003472367908434572820201300032020160006156263000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0051

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50030703944388313803300801380530003469737911354563820021300032002060000156263000020010
50024700604563615636300001563530000469787911594563520020300002002060000156263000020010
50024700544563615636300001563530000469737911594563520020300002002060000156263000020010
50024700544563615636300001563530000469737911514563520020300002002060000156263000020010
50024700544563615636300001563530000469737911474563520020300002002060000156263000020010
50024700544563615636300001563530000469737911534563520020300002002060000156263000020010
50024700544563615636300001563530000469787911594563520020300002002060000156263000020010
50024700544563615636300001563530000469737911594563520020300002002060000156263000020010
50024700544563615636300001563530000469737911514563520020300002002060000156263000020010
50024700544563615636300001563530000469737911494563520020300002002060000156263000020010

Test 3: throughput

Code:

  cash w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.6179

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
427141004646141613193482231179879218295801122181596854271447991227124159668195233000013075
431781075916899719704492931773179036294812122135096767271557976526799158059193843000013040
435411073946204013886481541310376064303464118583992586260777650326329154821160553000012909
431031069486791919488484311712777373296502120123694573265427812226067153586179333000012928
432021057946827619189490871768977914305008120010094922265837850927173160060192663000013098
430821053766811019277488331725079036295188122182396767271557976527155159505194193000013074
431781075966899519694493011773178553225260121861692637269087939226580157403188823000013017
428801033606675218588481641599175935284342118734892985260377668725905152420188763000012850
430151035436623618512477241683777262277256116108894125263667763226421155565188413000012903
429711040836631718475478421635978330303434119294495668270537988925965153356177323000012899

1000 unrolls and 10 iterations

Result (median cycles for code): 10.6500

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
428851056116286113551493101271378244278458118592195307263077877526392158040194883000012902
429101058486881119518492931717478682280384119617695972264697925126040155923192503000012829
429541065396905919663493961746778683280878119962896013264747926826273157357160653000012873
429311060266886019609492511744977273277720116589293861259537769725953155391191823000012814
428261045636819919202489971658578910280630120343796356265577951526557159027196363000012935
428761053756856619406491601693078508248659119356694112264127909226194156866193833000012862
427421033506762718883487441598779292279624121178596945266837992726430158335195683000012911
429541065006910819690494181748579376280306121059897041267208001426977161526199303000013017
430211075476949519917495781794379032279878120548196556265917963926449158394195713000012912
429261061256890919596493131730478606279424119683295909264417917726626159451196863000012948