/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef _SMMINTRIN_H
#define _SMMINTRIN_H

#ifndef __SSE4_1__
#error "SSE4.1 instruction set not enabled"
#else

#include <tmmintrin.h>

/* SSE4 Rounding macros. */
*/ 34204793Srdivacky#define _MM_FROUND_TO_NEAREST_INT 0x00 35204793Srdivacky#define _MM_FROUND_TO_NEG_INF 0x01 36204793Srdivacky#define _MM_FROUND_TO_POS_INF 0x02 37204793Srdivacky#define _MM_FROUND_TO_ZERO 0x03 38204793Srdivacky#define _MM_FROUND_CUR_DIRECTION 0x04 39204793Srdivacky 40204793Srdivacky#define _MM_FROUND_RAISE_EXC 0x00 41204793Srdivacky#define _MM_FROUND_NO_EXC 0x08 42204793Srdivacky 43204793Srdivacky#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) 44204793Srdivacky#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) 45204793Srdivacky#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) 46204793Srdivacky#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) 47204793Srdivacky#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) 48204962Srdivacky#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) 49204793Srdivacky 50204793Srdivacky#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL) 51204793Srdivacky#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL) 52204793Srdivacky#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL) 53204793Srdivacky#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL) 54204793Srdivacky 55204793Srdivacky#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR) 56204793Srdivacky#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR) 57204793Srdivacky#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR) 58204793Srdivacky#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR) 59204793Srdivacky 60234353Sdim#define _mm_round_ps(X, M) __extension__ ({ \ 61234353Sdim __m128 __X = (X); \ 62234353Sdim (__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); }) 63204793Srdivacky 64234353Sdim#define _mm_round_ss(X, Y, M) __extension__ ({ \ 65234353Sdim __m128 __X = (X); \ 66234353Sdim __m128 __Y = (Y); \ 67234353Sdim (__m128) __builtin_ia32_roundss((__v4sf)__X, 
(__v4sf)__Y, (M)); }) 68234353Sdim 69234353Sdim#define _mm_round_pd(X, M) __extension__ ({ \ 70234353Sdim __m128d __X = (X); \ 71234353Sdim (__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); }) 72234353Sdim 73234353Sdim#define _mm_round_sd(X, Y, M) __extension__ ({ \ 74234353Sdim __m128d __X = (X); \ 75234353Sdim __m128d __Y = (Y); \ 76234353Sdim (__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); }) 77234353Sdim 78204793Srdivacky/* SSE4 Packed Blending Intrinsics. */ 79234353Sdim#define _mm_blend_pd(V1, V2, M) __extension__ ({ \ 80234353Sdim __m128d __V1 = (V1); \ 81234353Sdim __m128d __V2 = (V2); \ 82234353Sdim (__m128d) __builtin_ia32_blendpd ((__v2df)__V1, (__v2df)__V2, (M)); }) 83204793Srdivacky 84234353Sdim#define _mm_blend_ps(V1, V2, M) __extension__ ({ \ 85234353Sdim __m128 __V1 = (V1); \ 86234353Sdim __m128 __V2 = (V2); \ 87234353Sdim (__m128) __builtin_ia32_blendps ((__v4sf)__V1, (__v4sf)__V2, (M)); }) 88204793Srdivacky 89206084Srdivackystatic __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 90204793Srdivacky_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) 91204793Srdivacky{ 92204793Srdivacky return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, 93204793Srdivacky (__v2df)__M); 94204793Srdivacky} 95204793Srdivacky 96206084Srdivackystatic __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 97204793Srdivacky_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) 98204793Srdivacky{ 99204793Srdivacky return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, 100204793Srdivacky (__v4sf)__M); 101204793Srdivacky} 102204793Srdivacky 103206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 104204793Srdivacky_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) 105204793Srdivacky{ 106204793Srdivacky return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, 107204793Srdivacky (__v16qi)__M); 108204793Srdivacky} 109204793Srdivacky 
110234353Sdim#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \ 111234353Sdim __m128i __V1 = (V1); \ 112234353Sdim __m128i __V2 = (V2); \ 113234353Sdim (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__V1, (__v8hi)__V2, (M)); }) 114204793Srdivacky 115204962Srdivacky/* SSE4 Dword Multiply Instructions. */ 116206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 117204962Srdivacky_mm_mullo_epi32 (__m128i __V1, __m128i __V2) 118204962Srdivacky{ 119206084Srdivacky return (__m128i) ((__v4si)__V1 * (__v4si)__V2); 120204962Srdivacky} 121204962Srdivacky 122206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 123204962Srdivacky_mm_mul_epi32 (__m128i __V1, __m128i __V2) 124204962Srdivacky{ 125204962Srdivacky return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2); 126204962Srdivacky} 127204962Srdivacky 128204962Srdivacky/* SSE4 Floating Point Dot Product Instructions. */ 129234353Sdim#define _mm_dp_ps(X, Y, M) __extension__ ({ \ 130234353Sdim __m128 __X = (X); \ 131234353Sdim __m128 __Y = (Y); \ 132234353Sdim (__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); }) 133204962Srdivacky 134234353Sdim#define _mm_dp_pd(X, Y, M) __extension__ ({\ 135234353Sdim __m128d __X = (X); \ 136234353Sdim __m128d __Y = (Y); \ 137234353Sdim (__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); }) 138234353Sdim 139204962Srdivacky/* SSE4 Streaming Load Hint Instruction. */ 140206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 141204962Srdivacky_mm_stream_load_si128 (__m128i *__V) 142204962Srdivacky{ 143204962Srdivacky return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V); 144204962Srdivacky} 145204962Srdivacky 146204962Srdivacky/* SSE4 Packed Integer Min/Max Instructions. 
*/ 147206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 148204962Srdivacky_mm_min_epi8 (__m128i __V1, __m128i __V2) 149204962Srdivacky{ 150204962Srdivacky return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); 151204962Srdivacky} 152204962Srdivacky 153206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 154204962Srdivacky_mm_max_epi8 (__m128i __V1, __m128i __V2) 155204962Srdivacky{ 156204962Srdivacky return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); 157204962Srdivacky} 158204962Srdivacky 159206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 160204962Srdivacky_mm_min_epu16 (__m128i __V1, __m128i __V2) 161204962Srdivacky{ 162204962Srdivacky return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); 163204962Srdivacky} 164204962Srdivacky 165206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 166204962Srdivacky_mm_max_epu16 (__m128i __V1, __m128i __V2) 167204962Srdivacky{ 168204962Srdivacky return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); 169204962Srdivacky} 170204962Srdivacky 171206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 172204962Srdivacky_mm_min_epi32 (__m128i __V1, __m128i __V2) 173204962Srdivacky{ 174204962Srdivacky return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); 175204962Srdivacky} 176204962Srdivacky 177206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 178204962Srdivacky_mm_max_epi32 (__m128i __V1, __m128i __V2) 179204962Srdivacky{ 180204962Srdivacky return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); 181204962Srdivacky} 182204962Srdivacky 183206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 184204962Srdivacky_mm_min_epu32 (__m128i __V1, __m128i __V2) 
185204962Srdivacky{ 186204962Srdivacky return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); 187204962Srdivacky} 188204962Srdivacky 189206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 190204962Srdivacky_mm_max_epu32 (__m128i __V1, __m128i __V2) 191204962Srdivacky{ 192204962Srdivacky return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2); 193204962Srdivacky} 194204962Srdivacky 195204962Srdivacky/* SSE4 Insertion and Extraction from XMM Register Instructions. */ 196204962Srdivacky#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) 197204962Srdivacky#define _mm_extract_ps(X, N) (__extension__ \ 198249423Sdim ({ union { int __i; float __f; } __t; \ 199210299Sed __v4sf __a = (__v4sf)(X); \ 200263508Sdim __t.__f = __a[(N) & 3]; \ 201249423Sdim __t.__i;})) 202204962Srdivacky 203204962Srdivacky/* Miscellaneous insert and extract macros. */ 204204962Srdivacky/* Extract a single-precision float from X at index N into D. */ 205210299Sed#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \ 206204962Srdivacky (D) = __a[N]; })) 207204962Srdivacky 208204962Srdivacky/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create 209204962Srdivacky an index suitable for _mm_insert_ps. */ 210204962Srdivacky#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z)) 211204962Srdivacky 212204962Srdivacky/* Extract a float from X at index N into the first index of the return. */ 213204962Srdivacky#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \ 214204962Srdivacky _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) 215205219Srdivacky 216205219Srdivacky/* Insert int into packed integer array at index. 
*/ 217210299Sed#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \ 218263508Sdim __a[(N) & 15] = (I); \ 219205219Srdivacky __a;})) 220210299Sed#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \ 221263508Sdim __a[(N) & 3] = (I); \ 222205219Srdivacky __a;})) 223205219Srdivacky#ifdef __x86_64__ 224210299Sed#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \ 225263508Sdim __a[(N) & 1] = (I); \ 226205219Srdivacky __a;})) 227205219Srdivacky#endif /* __x86_64__ */ 228204962Srdivacky 229212904Sdim/* Extract int from packed integer array at index. This returns the element 230212904Sdim * as a zero extended value, so it is unsigned. 231212904Sdim */ 232210299Sed#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \ 233263508Sdim (int)(unsigned char) \ 234263508Sdim __a[(N) & 15];})) 235210299Sed#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \ 236263508Sdim __a[(N) & 3];})) 237205219Srdivacky#ifdef __x86_64__ 238210299Sed#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \ 239263508Sdim __a[(N) & 1];})) 240205219Srdivacky#endif /* __x86_64 */ 241205219Srdivacky 242205219Srdivacky/* SSE4 128-bit Packed Integer Comparisons. 
*/ 243206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 244205219Srdivacky_mm_testz_si128(__m128i __M, __m128i __V) 245205219Srdivacky{ 246205219Srdivacky return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); 247205219Srdivacky} 248205219Srdivacky 249206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 250205219Srdivacky_mm_testc_si128(__m128i __M, __m128i __V) 251205219Srdivacky{ 252205219Srdivacky return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); 253205219Srdivacky} 254205219Srdivacky 255206084Srdivackystatic __inline__ int __attribute__((__always_inline__, __nodebug__)) 256205219Srdivacky_mm_testnzc_si128(__m128i __M, __m128i __V) 257205219Srdivacky{ 258205219Srdivacky return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); 259205219Srdivacky} 260205219Srdivacky 261205219Srdivacky#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) 262205219Srdivacky#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) 263234353Sdim#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) 264205219Srdivacky 265205219Srdivacky/* SSE4 64-bit Packed Integer Comparisons. */ 266206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 267205219Srdivacky_mm_cmpeq_epi64(__m128i __V1, __m128i __V2) 268205219Srdivacky{ 269234353Sdim return (__m128i)((__v2di)__V1 == (__v2di)__V2); 270205219Srdivacky} 271205219Srdivacky 272205219Srdivacky/* SSE4 Packed Integer Sign-Extension. 
*/ 273206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 274205219Srdivacky_mm_cvtepi8_epi16(__m128i __V) 275205219Srdivacky{ 276205219Srdivacky return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V); 277205219Srdivacky} 278205219Srdivacky 279206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 280205219Srdivacky_mm_cvtepi8_epi32(__m128i __V) 281205219Srdivacky{ 282205219Srdivacky return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V); 283205219Srdivacky} 284205219Srdivacky 285206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 286205219Srdivacky_mm_cvtepi8_epi64(__m128i __V) 287205219Srdivacky{ 288205219Srdivacky return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V); 289205219Srdivacky} 290205219Srdivacky 291206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 292205219Srdivacky_mm_cvtepi16_epi32(__m128i __V) 293205219Srdivacky{ 294205219Srdivacky return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V); 295205219Srdivacky} 296205219Srdivacky 297206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 298205219Srdivacky_mm_cvtepi16_epi64(__m128i __V) 299205219Srdivacky{ 300205219Srdivacky return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V); 301205219Srdivacky} 302205219Srdivacky 303206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 304205219Srdivacky_mm_cvtepi32_epi64(__m128i __V) 305205219Srdivacky{ 306205219Srdivacky return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V); 307205219Srdivacky} 308205219Srdivacky 309205219Srdivacky/* SSE4 Packed Integer Zero-Extension. 
*/ 310206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 311205219Srdivacky_mm_cvtepu8_epi16(__m128i __V) 312205219Srdivacky{ 313205219Srdivacky return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V); 314205219Srdivacky} 315205219Srdivacky 316206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 317205219Srdivacky_mm_cvtepu8_epi32(__m128i __V) 318205219Srdivacky{ 319205219Srdivacky return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V); 320205219Srdivacky} 321205219Srdivacky 322206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 323205219Srdivacky_mm_cvtepu8_epi64(__m128i __V) 324205219Srdivacky{ 325205219Srdivacky return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V); 326205219Srdivacky} 327205219Srdivacky 328206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 329205219Srdivacky_mm_cvtepu16_epi32(__m128i __V) 330205219Srdivacky{ 331205219Srdivacky return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V); 332205219Srdivacky} 333205219Srdivacky 334206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 335205219Srdivacky_mm_cvtepu16_epi64(__m128i __V) 336205219Srdivacky{ 337205219Srdivacky return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V); 338205219Srdivacky} 339205219Srdivacky 340206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 341205219Srdivacky_mm_cvtepu32_epi64(__m128i __V) 342205219Srdivacky{ 343205219Srdivacky return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V); 344205219Srdivacky} 345205219Srdivacky 346205219Srdivacky/* SSE4 Pack with Unsigned Saturation. 
*/ 347206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 348205219Srdivacky_mm_packus_epi32(__m128i __V1, __m128i __V2) 349205219Srdivacky{ 350205219Srdivacky return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); 351205219Srdivacky} 352205219Srdivacky 353205219Srdivacky/* SSE4 Multiple Packed Sums of Absolute Difference. */ 354234353Sdim#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \ 355234353Sdim __m128i __X = (X); \ 356234353Sdim __m128i __Y = (Y); \ 357234353Sdim (__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); }) 358205219Srdivacky 359234353Sdimstatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 360234353Sdim_mm_minpos_epu16(__m128i __V) 361234353Sdim{ 362234353Sdim return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V); 363234353Sdim} 364234353Sdim 365205408Srdivacky/* These definitions are normally in nmmintrin.h, but gcc puts them in here 366205408Srdivacky so we'll do the same. */ 367205408Srdivacky#ifdef __SSE4_2__ 368205408Srdivacky 369205408Srdivacky/* These specify the type of data that we're comparing. */ 370205408Srdivacky#define _SIDD_UBYTE_OPS 0x00 371205408Srdivacky#define _SIDD_UWORD_OPS 0x01 372205408Srdivacky#define _SIDD_SBYTE_OPS 0x02 373205408Srdivacky#define _SIDD_SWORD_OPS 0x03 374205408Srdivacky 375205408Srdivacky/* These specify the type of comparison operation. */ 376205408Srdivacky#define _SIDD_CMP_EQUAL_ANY 0x00 377205408Srdivacky#define _SIDD_CMP_RANGES 0x04 378205408Srdivacky#define _SIDD_CMP_EQUAL_EACH 0x08 379205408Srdivacky#define _SIDD_CMP_EQUAL_ORDERED 0x0c 380205408Srdivacky 381205408Srdivacky/* These macros specify the polarity of the operation. 
*/ 382205408Srdivacky#define _SIDD_POSITIVE_POLARITY 0x00 383205408Srdivacky#define _SIDD_NEGATIVE_POLARITY 0x10 384205408Srdivacky#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 385205408Srdivacky#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 386205408Srdivacky 387205408Srdivacky/* These macros are used in _mm_cmpXstri() to specify the return. */ 388205408Srdivacky#define _SIDD_LEAST_SIGNIFICANT 0x00 389205408Srdivacky#define _SIDD_MOST_SIGNIFICANT 0x40 390205408Srdivacky 391205408Srdivacky/* These macros are used in _mm_cmpXstri() to specify the return. */ 392205408Srdivacky#define _SIDD_BIT_MASK 0x00 393205408Srdivacky#define _SIDD_UNIT_MASK 0x40 394205408Srdivacky 395205408Srdivacky/* SSE4.2 Packed Comparison Intrinsics. */ 396205408Srdivacky#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M)) 397205408Srdivacky#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M)) 398205408Srdivacky 399205408Srdivacky#define _mm_cmpestrm(A, LA, B, LB, M) \ 400205408Srdivacky __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M)) 401234353Sdim#define _mm_cmpestri(A, LA, B, LB, M) \ 402205408Srdivacky __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M)) 403205408Srdivacky 404205408Srdivacky/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. 
*/ 405234353Sdim#define _mm_cmpistra(A, B, M) \ 406234353Sdim __builtin_ia32_pcmpistria128((A), (B), (M)) 407234353Sdim#define _mm_cmpistrc(A, B, M) \ 408234353Sdim __builtin_ia32_pcmpistric128((A), (B), (M)) 409234353Sdim#define _mm_cmpistro(A, B, M) \ 410234353Sdim __builtin_ia32_pcmpistrio128((A), (B), (M)) 411234353Sdim#define _mm_cmpistrs(A, B, M) \ 412234353Sdim __builtin_ia32_pcmpistris128((A), (B), (M)) 413234353Sdim#define _mm_cmpistrz(A, B, M) \ 414234353Sdim __builtin_ia32_pcmpistriz128((A), (B), (M)) 415205408Srdivacky 416205408Srdivacky#define _mm_cmpestra(A, LA, B, LB, M) \ 417205408Srdivacky __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M)) 418205408Srdivacky#define _mm_cmpestrc(A, LA, B, LB, M) \ 419205408Srdivacky __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M)) 420205408Srdivacky#define _mm_cmpestro(A, LA, B, LB, M) \ 421205408Srdivacky __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M)) 422205408Srdivacky#define _mm_cmpestrs(A, LA, B, LB, M) \ 423205408Srdivacky __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M)) 424205408Srdivacky#define _mm_cmpestrz(A, LA, B, LB, M) \ 425205408Srdivacky __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M)) 426205408Srdivacky 427205408Srdivacky/* SSE4.2 Compare Packed Data -- Greater Than. */ 428206084Srdivackystatic __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 429205408Srdivacky_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) 430205408Srdivacky{ 431234353Sdim return (__m128i)((__v2di)__V1 > (__v2di)__V2); 432205408Srdivacky} 433205408Srdivacky 434205408Srdivacky/* SSE4.2 Accumulate CRC32. 
*/ 435206084Srdivackystatic __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 436205408Srdivacky_mm_crc32_u8(unsigned int __C, unsigned char __D) 437205408Srdivacky{ 438205408Srdivacky return __builtin_ia32_crc32qi(__C, __D); 439205408Srdivacky} 440205408Srdivacky 441206084Srdivackystatic __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 442205408Srdivacky_mm_crc32_u16(unsigned int __C, unsigned short __D) 443205408Srdivacky{ 444205408Srdivacky return __builtin_ia32_crc32hi(__C, __D); 445205408Srdivacky} 446205408Srdivacky 447206084Srdivackystatic __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 448205408Srdivacky_mm_crc32_u32(unsigned int __C, unsigned int __D) 449205408Srdivacky{ 450205408Srdivacky return __builtin_ia32_crc32si(__C, __D); 451205408Srdivacky} 452205408Srdivacky 453205408Srdivacky#ifdef __x86_64__ 454206084Srdivackystatic __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) 455205408Srdivacky_mm_crc32_u64(unsigned long long __C, unsigned long long __D) 456205408Srdivacky{ 457205408Srdivacky return __builtin_ia32_crc32di(__C, __D); 458205408Srdivacky} 459205408Srdivacky#endif /* __x86_64__ */ 460205408Srdivacky 461234353Sdim#ifdef __POPCNT__ 462234353Sdim#include <popcntintrin.h> 463234353Sdim#endif 464205408Srdivacky 465205408Srdivacky#endif /* __SSE4_2__ */ 466204793Srdivacky#endif /* __SSE4_1__ */ 467204793Srdivacky 468204793Srdivacky#endif /* _SMMINTRIN_H */ 469