Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STCLRL (32-bit)

Test 1: uops

Code:

  stclrl w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.001

Integer unit issues: 1.002

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005348643034102420101005200077621051330001000200020004000100220001000
73004341283002100220001000200077621051330001000200020004000100220001000
73004341343002100220001000200077621051330001000200020004000100220001000
73004341533002100220001000200077621051330001000200020004000100220001000
73004341343002100220001000200077621051330001000200020004000100220001000
73004341253002100220001000200077621051330001000200020004000100220001000
73004341413002100220001000200277651051930031001200220004000100220001000
73004344293002100220001000200077621051330001000200020004000100220001000
73004341353002100220001000200077621051330001000200020004000100220001000
73004341193002100220001000200077621051330001000200020004000100220001000

Test 2: throughput

Code:

  stclrl w0, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0065

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40207603764030420249200552016920005115674959594011020205200053020840009200082000020100
40204600654010620106200002010220002115783956994010420202200023020340004200062000020100
40204600654010620106200002010220002115783957004010420202200023020340004200062000020100
40204600654010620106200002010220002115777956894010420202200023020340004200062000020100
40204600654010620106200002010220002115773956814010420202200023020340004200062000020100
40204600654010620106200002010220002115785957014010420202200023020340004200062000020100
40204600654010620106200002010220036979141038374017220236200363020340004200062000020100
40204600654010620106200002010220002115785957054010420202200023020340004200062000020100
40204600654010620106200002010220002115777956884010420202200023020340004200062000020100
40204600664010620106200002010220002115769956644010420202200023020340004200062000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0065

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
400276037640210201592005120074200369593610787540082200562003630023400042000620000020010
400246006540016200162000020010200001155469559240010200202000030020400002000620000020010
400246006540016200162000020010200001155509559940010200202000030020400002000620000020010
400246006540016200162000020010200001155529560340010200202000030020400002000620000020010
400246006540016200162000020010200001155529560140010200202000030020400002000620000020010
400246006540016200162000020010200001155509559740010200202000030020400002000620000020010
400246006540016200162000020010200001155569560740010200202000030020400002000620000020010
400246006540016200162000020010200001155529560140010200202000030020400002000620000020010
400246006540016200162000020010200001155489559440010200202000030020400002000620000020010
400256011340085200532003220044200001155589561540010200202000030020400002000620000020010

Test 3: throughput

Code:

  stclrl w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.7766

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
302061131584489022195226951281621264201739119518443246511304221112351445775206372000010100
302041082014136520060213051124320296195117819102673063610441204792094841382194262000010100
302041073834031619692206241044420168192579618961323038410316202192159742525198472000010100
302041068514070719970207371090120931198261419243793185711034216072255444265201022000010100
302041077834112620019211071110420742196543019217143150810866212832039840356190772000010100
302051067723990019331205691038321321196444019221103247111251220472089641324192682000010100
302041097774257220833217391189621351197033619182083263311384222862060940777191512000010100
302041069304061619590210261083320762196718619158633142810766212282252844021199662000010100
302041068404064019823208171080420589194988119140193117910690209542035240266190512000010100
302041060623941119086203251017120293189513218800643064410451204972260844490201732000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.4047

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
300261146374678422506242781458325205208374120084923996614771287562899655973222492000010010
300241141564666522095245701445825046208027120074783944614412282582840454951225122000010010
300241140284685922552243071368924279207337019989783798713719272202904955938220212000010010
300241142894637222220241521403124476208231220062553845013985273552821054482220932000010010
300241139754593322292236411357624442208758120104833845814026275922833054950220722000010010
300241138054576322326234371337924448207601820014563840013965275152836354756223682000010010
300241138754609622138239581408824411207948720033163833813937275062773453953224022000010010
300241136014633122224241071462525390208019320053454006614692289372847855121225822000010010
300241139564625622133241231463124940208765720115503932814404281692768853911224192000010010
300241138424615322056240971418825082208032520054603958114509284012809454322224932000010010