Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STADDB

Test 1: uops

Code:

  staddb w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73006345103039102520141007200077701052130001000200020004000100320001000
73004341703003100320001000200077701052130001000200020004000100320001000
73004341643003100320001000200077721052530001000200020004000100320001000
73004344533003100320001000200077701052130001000200020004000100320001000
73004341843003100320001000200077701052130001000200020004000100320001000
73004341523003100320001000200077701052130001000200020004000100320001000
73004341603003100320001000200077701052130001000200020004000100320001000
73004341713003100320001000200077701052130001000200020004000100320001000
73004341793003100320001000200077701052130001000200020004000100320001000
73004341593003100320001000200077701052130001000200020004000100320001000

Test 2: throughput

Code:

  staddb w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0056

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
40206304584029820255200432016820007115871106071401142020720007302114001302001120000020100
40204300664011220110200022010420004115901105872401082020420004302064000802000920000020100
40204300664011220110200022010420004115924105916401082020420004302064000802001020000020100
40204300664011120109200022010420004115946105960401082020420004302064000802000920000020100
40205301294018620150200362013820035111530117927401722023720035302064000802000920000020100
40204300664011220110200022010420004115924105916401082020420004302064000802001020000020100
40204300664011120109200022010420004115927105922401082020420004302064000802000920000020100
40204300664011120109200022010420004115918105895401082020420004302064000802000920000020100
40204300664011220110200022010420004115918105904401082020420004302064000802000920000020100
40204300664011220110200022010420004115914105896401082020420004302064000802000920000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0066

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
400263036440157201172004020075200041157601059634001820024200043002040000200102000020010
400243006640021200212000020010200361143021076974008220056200363002040000200172000020010
400243006640025200252000020010200001157721059464001020020200003002040000200162000020010
400243006640027200272000020010200001157691059294001020020200003002040000200172000020010
400243006640026200262000020010200001157741059434001020020200003002040000200182000020010
400243006640020200202000020010200001157731059474001020020200003002040000200172000020010
400243006640026200262000020010200301143041077444007120051200303002040000200172000020010
400243006640023200232000020010200001157491059044001020020200003002040000200112000020010
400243006640024200242000020010200001157561059364001020020200003002040000200122000020010
400253013240103200672003620049200001157861059764001020020200003002040000200162000020010

Test 3: throughput

Code:

  staddb w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.9761

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
30206124053408272076002006710145020006245036823123710301131020720013020200400002131020000010100
30204126467412032120302000010100020000245004823119330301001020020000020200400002131020000010100
30204129761414102141002000010100020000245004823119330301001020020000020200400002131020000010100
30204129761414102141002000010100020000245004823119330301001020020000020200400002131020000010100
30204129761414102141002000010100020011250488523642960301191020920019020200400002153920000010100
30204131270415432153502000810100020049248858823502020301781022920056020256401092133620000010100
30204124632409632096302000010100020000234461222195360301001020020000020200400002086420000010100
30204124642409642096402000010100020000245004823119330301001020020000020256401102089120000010100
30204129761414102141002000010100020000245004823119330301001020020000020200400002131020000010100
30204129761414102141002000010100020000245004823119330301001020020000020200400002131020000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 12.6476

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
300281279064122621052201741012420000242684122874033001010020200002002040000211852000010010
300241283534119321193200001001020000245624823130833001010020200002002040000212732000010010
300241297614128321283200001001020000245624823130833001010020200002002040000212732000010010
300241297614128321283200001001020000245624823130833001010020200002003240024213152000010010
300241297614128321283200001001020000245624823130833001010020200002002040000212732000010010
300241297614128321283200001001020000245624823130833001010020200002002040000212732000010010
300241297614128321283200001001020000245624823130833001010020200002002040000212692000010010
305021301534165621397202591034420130244957323069863021110091201412004040040203902000010010
300241264374110721107200001001020050235244722221593008810048200562002040000210992000010010
300241264904111321113200001001020000238882122536343001010020200002002040000211022000010010