This is inspired by and based on the x86/x64 SIMD Instruction List by Daytime.
This is not an official reference, and may contain mistakes. It is intended to make it easier to find instructions, and to provide an alternative perspective. When writing SME code, please refer to the Arm® Exploration Tools or the Arm® Architecture Reference Manual (Arm ARM) with the SME Supplement.
Merging and zeroing predication is typically omitted from the diagrams, but it is shown in operations like BRKN and LD1RQB that use the /Z syntax but have unusual semantics.
This is an ongoing project: dark-red links are missing full descriptions, while bright-red links are also missing diagrams and instead link to the documentation in the exploration tools.
Report mistakes or send feedback.
Note: this does not support filtering by vector length, so some unavailable operations may appear available even after selecting a preset.
| 128-bit | 64-bit | 32-bit | 16-bit | 8-bit | |
|---|---|---|---|---|---|
| zip | svzip[_{s,u,f}64_x4] svzip[_{s,u,f}64_x2] |
svzip[_{s,u,f}32_x4] svzip[_{s,u,f}32_x2] |
svzip[_{s,u,f,bf}16_x4] svzip[_{s,u,f,bf}16_x2] |
svzip[_{s,u,mf}8_x4] svzip[_{s,u,mf}8_x2] |
|
| unzip | svuzp[_{s,u,f}64_x4] svuzp[_{s,u,f}64_x2] |
svuzp[_{s,u,f}32_x4] svuzp[_{s,u,f}32_x2] |
svuzp[_{s,u,f,bf}16_x4] svuzp[_{s,u,f,bf}16_x2] |
svuzp[_{s,u,mf}8_x4] svuzp[_{s,u,mf}8_x2] |
|
| unpack | svunpk_s64[_s32_x4] svunpk_u64[_u32_x4] |
svunpk_s64[_s32_x2] svunpk_s32[_s16_x4] svunpk_u64[_u32_x2] svunpk_u32[_u16_x4] |
svunpk_s32[_s16_x2] svunpk_s16[_s8_x4] svunpk_u32[_u16_x2] svunpk_u16[_u8_x4] |
svunpk_s16[_s8_x2] svunpk_u16[_u8_x2] |
|
| move to/from tile | svwrite_hor_za128_m svwrite_ver_za128_m svread_hor_za128_m svread_ver_za128_m |
svwrite_hor_za64[_{s,u,f}64]_m svwrite_ver_za64[_{s,u,f}64]_m svwrite_hor_za64[_{s,u,f}64]_vg2 svwrite_ver_za64[_{s,u,f}64]_vg2 svwrite_hor_za64[_{s,u,f}64]_vg4 svwrite_ver_za64[_{s,u,f}64]_vg4 svwrite_za16[_{s,u,f,bf}16]_vg1x2 svwrite_za32[_{s,u,f}32]_vg1x2 svwrite_za64[_{s,u,f}64]_vg1x2 svwrite_za8[_{s,u,mf}8]_vg1x2 svwrite_za16[_{s,u,f,bf}16]_vg1x4 svwrite_za32[_{s,u,f}32]_vg1x4 svwrite_za64[_{s,u,f}64]_vg1x4 svwrite_za8[_{s,u,mf}8]_vg1x4 svread_hor_za64[_{s,u,f}64]_m svread_ver_za64[_{s,u,f}64]_m svread_hor_za64_{s,u,f}64_vg2 svread_ver_za64_{s,u,f}64_vg2 svread_hor_za64_{s,u,f}64_vg4 svread_ver_za64_{s,u,f}64_vg4 svread_za16_{s,u,f,bf}16_vg1x2 svread_za32_{s,u,f}32_vg1x2 svread_za64_{s,u,f}64_vg1x2 svread_za8_{s,u,mf}8_vg1x2 svread_za16_{s,u,f,bf}16_vg1x4 svread_za32_{s,u,f}32_vg1x4 svread_za64_{s,u,f}64_vg1x4 svread_za8_{s,u,mf}8_vg1x4 |
svwrite_hor_za32[_{s,u,f}32]_m svwrite_ver_za32[_{s,u,f}32]_m svwrite_hor_za32[_{s,u,f}32]_vg2 svwrite_ver_za32[_{s,u,f}32]_vg2 svwrite_hor_za32[_{s,u,f}32]_vg4 svwrite_ver_za32[_{s,u,f}32]_vg4 svread_hor_za32[_{s,u,f}32]_m svread_ver_za32[_{s,u,f}32]_m svread_hor_za32_{s,u,f}32_vg2 svread_ver_za32_{s,u,f}32_vg2 svread_hor_za32_{s,u,f}32_vg4 svread_ver_za32_{s,u,f}32_vg4 |
svwrite_hor_za16[_{s,u,f,bf}16]_m svwrite_ver_za16[_{s,u,f,bf}16]_m svwrite_hor_za16[_{s,u,f,bf}16]_vg2 svwrite_ver_za16[_{s,u,f,bf}16]_vg2 svwrite_hor_za16[_{s,u,f,bf}16]_vg4 svwrite_ver_za16[_{s,u,f,bf}16]_vg4 svread_hor_za16[_{s,u,f,bf}16]_m svread_ver_za16[_{s,u,f,bf}16]_m svread_hor_za16_{s,u,f,bf}16_vg2 svread_ver_za16_{s,u,f,bf}16_vg2 svread_hor_za16_{s,u,f,bf}16_vg4 svread_ver_za16_{s,u,f,bf}16_vg4 |
svwrite_hor_za8[_{s,u,mf}8]_m svwrite_ver_za8[_{s,u,mf}8]_m svwrite_hor_za8[_{s,u,mf}8]_vg2 svwrite_ver_za8[_{s,u,mf}8]_vg2 svwrite_hor_za8[_{s,u,mf}8]_vg4 svwrite_ver_za8[_{s,u,mf}8]_vg4 svread_hor_za8[_{s,u,mf}8]_m svread_ver_za8[_{s,u,mf}8]_m svread_hor_za8_{s,u,mf}8_vg2 svread_ver_za8_{s,u,mf}8_vg2 svread_hor_za8_{s,u,mf}8_vg4 svread_ver_za8_{s,u,mf}8_vg4 |
| move from tile and zero | svread_hor_za64_{s,u,f}64_vg2 svread_ver_za64_{s,u,f}64_vg2 svread_hor_za64_{s,u,f}64_vg4 svread_ver_za64_{s,u,f}64_vg4 svread_za16_{s,u,f,bf}16_vg1x2 svread_za32_{s,u,f}32_vg1x2 svread_za64_{s,u,f}64_vg1x2 svread_za8_{s,u,mf}8_vg1x2 svread_za16_{s,u,f,bf}16_vg1x4 svread_za32_{s,u,f}32_vg1x4 svread_za64_{s,u,f}64_vg1x4 svread_za8_{s,u,mf}8_vg1x4 |
svread_hor_za32_{s,u,f}32_vg2 svread_ver_za32_{s,u,f}32_vg2 svread_hor_za32_{s,u,f}32_vg4 svread_ver_za32_{s,u,f}32_vg4 |
svread_hor_za16_{s,u,f,bf}16_vg2 svread_ver_za16_{s,u,f,bf}16_vg2 svread_hor_za16_{s,u,f,bf}16_vg4 svread_ver_za16_{s,u,f,bf}16_vg4 |
svread_hor_za8_{s,u,mf}8_vg2 svread_ver_za8_{s,u,mf}8_vg2 svread_hor_za8_{s,u,mf}8_vg4 svread_ver_za8_{s,u,mf}8_vg4 |
|
| zero vector groups | |||||
| zero tile | svzero_mask_za |
| 128-bit | 64-bit | 32-bit | 16-bit | 8-bit | |
|---|---|---|---|---|---|
| load table register | svldr_zt | ||||
| load ZA row (unpredicated) | |||||
| load tile slice | |||||
| load strided registers | |||||
| 128-bit | 64-bit | 32-bit | 16-bit | 8-bit | |
|---|---|---|---|---|---|
| store table register | svstr_zt | ||||
| store ZA row (unpredicated) | |||||
| store tile slice | |||||
| store strided registers | |||||
| Integer | Floating-Point | |||||||
|---|---|---|---|---|---|---|---|---|
| 64-bit | 32-bit | 16-bit | 8-bit | double | single | half | BFloat16 | |
| int to float | svcvt_f32[_s32_x2] svcvt_f32[_u32_x2] |
|||||||
| float to int | svcvt_s32[_f32_x2] svcvt_u32[_f32_x2] |
|||||||
| float to float | svcvt_f16[_f32_x2] svcvtn_f16[_f32_x2] |
svcvt_bf16[_f32_x2] svcvtn_bf16[_f32_x2] |
||||||
| int to int | svqcvt_u16[_u64_x4] svqcvtn_u16[_u64_x4] svqcvt_s16[_s64_x4] svqcvtn_s16[_s64_x4] svqcvt_u16[_s64_x4] svqcvtn_u16[_s64_x4] |
svqcvt_u8[_u32_x4] svqcvt_u16[_u32_x2] svqcvtn_u8[_u32_x4] svqcvt_s8[_s32_x4] svqcvt_s16[_s32_x2] svqcvtn_s8[_s32_x4] svqcvt_u8[_s32_x4] svqcvt_u16[_s32_x2] svqcvtn_u8[_s32_x4] |
||||||
| | Integer | | | | Floating-Point | | | |
|---|---|---|---|---|---|---|---|---|
| | 64-bit | 32-bit | 16-bit | 8-bit | double | single | half | BFloat16 |
| add | svadd[_single_{s,u}64_x2] svadd[_single_{s,u}64_x4] |
svadd[_single_{s,u}32_x2] svadd[_single_{s,u}32_x4] |
svadd[_single_{s,u}16_x2] svadd[_single_{s,u}16_x4] |
svadd[_single_{s,u}8_x2] svadd[_single_{s,u}8_x4] |
||||
| clamp | svclamp[_single_s64_x2] svclamp[_single_s64_x4] svclamp[_single_u64_x2] svclamp[_single_u64_x4] |
svclamp[_single_s32_x2] svclamp[_single_s32_x4] svclamp[_single_u32_x2] svclamp[_single_u32_x4] |
svclamp[_single_s16_x2] svclamp[_single_s16_x4] svclamp[_single_u16_x2] svclamp[_single_u16_x4] |
svclamp[_single_s8_x2] svclamp[_single_s8_x4] svclamp[_single_u8_x2] svclamp[_single_u8_x4] |
svclamp[_single_f64_x2] svclamp[_single_f64_x4] |
svclamp[_single_f32_x2] svclamp[_single_f32_x4] |
svclamp[_single_f16_x2] svclamp[_single_f16_x4] |
|
| max | svmax[_single_s64_x2] svmax[_single_s64_x4] svmax[_s64_x2] SMAX {Z.D,Z.D,Z.D,Z.D},{Z.D,Z.D,Z.D,Z.D},{Z.D,Z.D,Z.D,Z.D} / SMAX { Zdn1.D-Zdn4.D }, { Zdn1.D-Zdn4.D }, { Zm1.D-Zm4.D } svmax[_s64_x4] svmax[_single_u64_x2] svmax[_single_u64_x4] svmax[_u64_x2] |
svmax[_single_s32_x2] svmax[_single_s32_x4] svmax[_s32_x2] SMAX {Z.S,Z.S,Z.S,Z.S},{Z.S,Z.S,Z.S,Z.S},{Z.S,Z.S,Z.S,Z.S} / SMAX { Zdn1.S-Zdn4.S }, { Zdn1.S-Zdn4.S }, { Zm1.S-Zm4.S } svmax[_s32_x4] svmax[_single_u32_x2] svmax[_single_u32_x4] svmax[_u32_x2] |
svmax[_single_s16_x2] svmax[_single_s16_x4] svmax[_s16_x2] SMAX {Z.H,Z.H,Z.H,Z.H},{Z.H,Z.H,Z.H,Z.H},{Z.H,Z.H,Z.H,Z.H} / SMAX { Zdn1.H-Zdn4.H }, { Zdn1.H-Zdn4.H }, { Zm1.H-Zm4.H } svmax[_s16_x4] svmax[_single_u16_x2] svmax[_single_u16_x4] svmax[_u16_x2] |
svmax[_single_s8_x2] svmax[_single_s8_x4] svmax[_s8_x2] SMAX {Z.B,Z.B,Z.B,Z.B},{Z.B,Z.B,Z.B,Z.B},{Z.B,Z.B,Z.B,Z.B} / SMAX { Zdn1.B-Zdn4.B }, { Zdn1.B-Zdn4.B }, { Zm1.B-Zm4.B } svmax[_s8_x4] svmax[_single_u8_x2] svmax[_single_u8_x4] svmax[_u8_x2] |
svmax[_single_f64_x2] svmax[_single_f64_x4] svmax[_f64_x2] FMAX {Z.D,Z.D,Z.D,Z.D},{Z.D,Z.D,Z.D,Z.D},{Z.D,Z.D,Z.D,Z.D} / FMAX { Zdn1.D-Zdn4.D }, { Zdn1.D-Zdn4.D }, { Zm1.D-Zm4.D } svmax[_f64_x4] svmaxnm[_single_f64_x2] svmaxnm[_single_f64_x4] svmaxnm[_f64_x2] |
svmax[_single_f32_x2] svmax[_single_f32_x4] svmax[_f32_x2] FMAX {Z.S,Z.S,Z.S,Z.S},{Z.S,Z.S,Z.S,Z.S},{Z.S,Z.S,Z.S,Z.S} / FMAX { Zdn1.S-Zdn4.S }, { Zdn1.S-Zdn4.S }, { Zm1.S-Zm4.S } svmax[_f32_x4] svmaxnm[_single_f32_x2] svmaxnm[_single_f32_x4] svmaxnm[_f32_x2] |
svmax[_single_f16_x2] svmax[_single_f16_x4] svmax[_f16_x2] FMAX {Z.H,Z.H,Z.H,Z.H},{Z.H,Z.H,Z.H,Z.H},{Z.H,Z.H,Z.H,Z.H} / FMAX { Zdn1.H-Zdn4.H }, { Zdn1.H-Zdn4.H }, { Zm1.H-Zm4.H } svmax[_f16_x4] svmaxnm[_single_f16_x2] svmaxnm[_single_f16_x4] svmaxnm[_f16_x2] |
|
| min | svmin[_single_s64_x2] svmin[_single_s64_x4] svmin[_s64_x2] SMIN {Z.D,Z.D,Z.D,Z.D},{Z.D,Z.D,Z.D,Z.D},{Z.D,Z.D,Z.D,Z.D} / SMIN { Zdn1.D-Zdn4.D }, { Zdn1.D-Zdn4.D }, { Zm1.D-Zm4.D } svmin[_s64_x4] svmin[_single_u64_x2] svmin[_single_u64_x4] svmin[_u64_x2] |
svmin[_single_s32_x2] svmin[_single_s32_x4] svmin[_s32_x2] SMIN {Z.S,Z.S,Z.S,Z.S},{Z.S,Z.S,Z.S,Z.S},{Z.S,Z.S,Z.S,Z.S} / SMIN { Zdn1.S-Zdn4.S }, { Zdn1.S-Zdn4.S }, { Zm1.S-Zm4.S } svmin[_s32_x4] svmin[_single_u32_x2] svmin[_single_u32_x4] svmin[_u32_x2] |
svmin[_single_s16_x2] svmin[_single_s16_x4] svmin[_s16_x2] SMIN {Z.H,Z.H,Z.H,Z.H},{Z.H,Z.H,Z.H,Z.H},{Z.H,Z.H,Z.H,Z.H} / SMIN { Zdn1.H-Zdn4.H }, { Zdn1.H-Zdn4.H }, { Zm1.H-Zm4.H } svmin[_s16_x4] svmin[_single_u16_x2] svmin[_single_u16_x4] svmin[_u16_x2] |
svmin[_single_s8_x2] svmin[_single_s8_x4] svmin[_s8_x2] SMIN {Z.B,Z.B,Z.B,Z.B},{Z.B,Z.B,Z.B,Z.B},{Z.B,Z.B,Z.B,Z.B} / SMIN { Zdn1.B-Zdn4.B }, { Zdn1.B-Zdn4.B }, { Zm1.B-Zm4.B } svmin[_s8_x4] svmin[_single_u8_x2] svmin[_single_u8_x4] svmin[_u8_x2] |
svmin[_single_f64_x2] svmin[_single_f64_x4] svmin[_f64_x2] FMIN {Z.D,Z.D,Z.D,Z.D},{Z.D,Z.D,Z.D,Z.D},{Z.D,Z.D,Z.D,Z.D} / FMIN { Zdn1.D-Zdn4.D }, { Zdn1.D-Zdn4.D }, { Zm1.D-Zm4.D } svmin[_f64_x4] svminnm[_single_f64_x2] svminnm[_single_f64_x4] svminnm[_f64_x2] |
svmin[_single_f32_x2] svmin[_single_f32_x4] svmin[_f32_x2] FMIN {Z.S,Z.S,Z.S,Z.S},{Z.S,Z.S,Z.S,Z.S},{Z.S,Z.S,Z.S,Z.S} / FMIN { Zdn1.S-Zdn4.S }, { Zdn1.S-Zdn4.S }, { Zm1.S-Zm4.S } svmin[_f32_x4] svminnm[_single_f32_x2] svminnm[_single_f32_x4] svminnm[_f32_x2] |
svmin[_single_f16_x2] svmin[_single_f16_x4] svmin[_f16_x2] FMIN {Z.H,Z.H,Z.H,Z.H},{Z.H,Z.H,Z.H,Z.H},{Z.H,Z.H,Z.H,Z.H} / FMIN { Zdn1.H-Zdn4.H }, { Zdn1.H-Zdn4.H }, { Zm1.H-Zm4.H } svmin[_f16_x4] svminnm[_single_f16_x2] svminnm[_single_f16_x4] svminnm[_f16_x2] |
|
| round | ||||||||
| select | svsel[_{s,u,f}64_x2] |
svsel[_{s,u,f}32_x2] |
svsel[_{s,u,f,bf}16_x2] |
svsel[_{s,u,mf}8_x2] |
||||
| mulh | svqdmulh[_single_s64_x2] svqdmulh[_single_s64_x4] svqdmulh[_s64_x2] |
svqdmulh[_single_s32_x2] svqdmulh[_single_s32_x4] svqdmulh[_s32_x2] |
svqdmulh[_single_s16_x2] svqdmulh[_single_s16_x4] svqdmulh[_s16_x2] |
svqdmulh[_single_s8_x2] svqdmulh[_single_s8_x4] svqdmulh[_s8_x2] |
||||
| multiply | ||||||||
| scale | ||||||||
| | 64-bit | 32-bit | 16-bit | 8-bit |
|---|---|---|---|---|
| shift right | svqrshr[_n]_s16[_s64_x4] svqrshrn[_n]_s16[_s64_x4] svqrshr[_n]_u16[_u64_x4] svqrshrn[_n]_u16[_u64_x4] svqrshru[_n]_u16[_s64_x4] svqrshrun[_n]_u16[_s64_x4] |
svqrshr[_n]_s8[_s32_x4] svqrshr[_n]_s16[_s32_x2] svqrshrn[_n]_s8[_s32_x4] svqrshr[_n]_u8[_u32_x4] svqrshr[_n]_u16[_u32_x2] svqrshrn[_n]_u8[_u32_x4] svqrshru[_n]_u8[_s32_x4] svqrshru[_n]_u16[_s32_x2] svqrshrun[_n]_u8[_s32_x4] |
||
| shift left | svrshl[_single_s64_x2] svrshl[_single_s64_x4] svrshl[_s64_x2] SRSHL {Z.D,Z.D,Z.D,Z.D},{Z.D,Z.D,Z.D,Z.D},{Z.D,Z.D,Z.D,Z.D} / SRSHL { Zdn1.D-Zdn4.D }, { Zdn1.D-Zdn4.D }, { Zm1.D-Zm4.D } svrshl[_s64_x4] svrshl[_single_u64_x2] svrshl[_single_u64_x4] svrshl[_u64_x2] |
svrshl[_single_s32_x2] svrshl[_single_s32_x4] svrshl[_s32_x2] SRSHL {Z.S,Z.S,Z.S,Z.S},{Z.S,Z.S,Z.S,Z.S},{Z.S,Z.S,Z.S,Z.S} / SRSHL { Zdn1.S-Zdn4.S }, { Zdn1.S-Zdn4.S }, { Zm1.S-Zm4.S } svrshl[_s32_x4] svrshl[_single_u32_x2] svrshl[_single_u32_x4] svrshl[_u32_x2] |
svrshl[_single_s16_x2] svrshl[_single_s16_x4] svrshl[_s16_x2] SRSHL {Z.H,Z.H,Z.H,Z.H},{Z.H,Z.H,Z.H,Z.H},{Z.H,Z.H,Z.H,Z.H} / SRSHL { Zdn1.H-Zdn4.H }, { Zdn1.H-Zdn4.H }, { Zm1.H-Zm4.H } svrshl[_s16_x4] svrshl[_single_u16_x2] svrshl[_single_u16_x4] svrshl[_u16_x2] |
svrshl[_single_s8_x2] svrshl[_single_s8_x4] svrshl[_s8_x2] SRSHL {Z.B,Z.B,Z.B,Z.B},{Z.B,Z.B,Z.B,Z.B},{Z.B,Z.B,Z.B,Z.B} / SRSHL { Zdn1.B-Zdn4.B }, { Zdn1.B-Zdn4.B }, { Zm1.B-Zm4.B } svrshl[_s8_x4] svrshl[_single_u8_x2] svrshl[_single_u8_x4] svrshl[_u8_x2] |
| | 32-bit | 16-bit | 8-bit |
|---|---|---|---|
| table lookup (2-bit indices) | svluti2_lane_zt_{s,u,f}32_x4 svluti2_lane_zt_{s,u,f}32 svluti2_lane_zt_{s,u,f}32_x2 |
svluti2_lane_zt_{s,u,f,bf}16_x4 svluti2_lane_zt_{s,u,f,bf}16 svluti2_lane_zt_{s,u,f,bf}16_x2 svluti2_lane_zt_{s,u,f,bf}16_x2 |
svluti2_lane_zt_{s,u,mf}8_x4 svluti2_lane_zt_{s,u,mf}8 svluti2_lane_zt_{s,u,mf}8_x2 svluti2_lane_zt_{s,u,mf}8_x2 |
| table lookup (4-bit indices) | svluti4_lane_zt_{s,u,f}32_x4 svluti4_lane_zt_{s,u,f}32 svluti4_lane_zt_{s,u,f}32_x2 |
svluti4_lane_zt_{s,u,f,bf}16_x4 svluti4_lane_zt_{s,u,f,bf}16 svluti4_lane_zt_{s,u,f,bf}16_x2 svluti4_lane_zt_{s,u,f,bf}16_x2 |
svluti4_lane_zt_{s,u,mf}8 svluti4_lane_zt_{s,u,mf}8_x2 svluti4_lane_zt_{s,u,mf}8_x2 |
| table lookup (6-bit indices) | |||
| move/zero table register | svzero_zt |
| | Integer | | | Floating-Point | | | | |
|---|---|---|---|---|---|---|---|---|
| | 32-bit | 16-bit | 8-bit | double | single | half | BFloat16 | FP8 |
| outer product and accumulate | svbmopa_za32[_{s,u}32]_m |
svmopa_za32[_s16]_m svmopa_za64[_s16]_m svmopa_za32[_u16]_m svmopa_za64[_u16]_m svsumopa_za64[_s16]_m svusmopa_za64[_u16]_m |
svmopa_za32[_s8]_m svmopa_za32[_u8]_m svsumopa_za32[_s8]_m svusmopa_za32[_u8]_m |
|
svmopa_za32[_f32]_m |
svmopa_za32[_f16]_m |
svmopa_za32[_bf16]_m |
|
| outer product and subtract | svbmops_za32[_{s,u}32]_m |
svmops_za32[_s16]_m svmops_za64[_s16]_m svmops_za32[_u16]_m svmops_za64[_u16]_m svsumops_za64[_s16]_m svusmops_za64[_u16]_m |
svmops_za32[_s8]_m svmops_za32[_u8]_m svsumops_za32[_s8]_m svusmops_za32[_u8]_m |
|
svmops_za32[_f32]_m |
svmops_za32[_f16]_m |
svmops_za32[_bf16]_m |
|
| quarter-tile outer product and accumulate | ||||||||
| quarter-tile outer product and subtract | ||||||||
| sparse outer product | ||||||||
| Add multiple of streaming SVE mode predicate length in bytes | |
| Add multiple of streaming SVE mode vector length in bytes | |
| Get multiple of streaming SVE mode vector length in bytes |