emmintrin.h revision 259279
1/* Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
2
3   This file is part of GCC.
4
5   GCC is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 2, or (at your option)
8   any later version.
9
10   GCC is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with GCC; see the file COPYING.  If not, write to
17   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
18   Boston, MA 02110-1301, USA.  */
19
20/* As a special exception, if you include this header file into source
21   files compiled by GCC, this header file does not by itself cause
22   the resulting executable to be covered by the GNU General Public
23   License.  This exception does not however invalidate any other
24   reasons why the executable file might be covered by the GNU General
25   Public License.  */
26
27/* Implemented from the specification included in the Intel C++ Compiler
28   User Guide and Reference, version 9.0.  */
29
30#ifndef _EMMINTRIN_H_INCLUDED
31#define _EMMINTRIN_H_INCLUDED
32
33#ifndef __SSE2__
34# error "SSE2 instruction set not enabled"
35#else
36
/* We need definitions from the SSE header files.  */
38#include <xmmintrin.h>
39
40/* SSE2 */
41typedef double __v2df __attribute__ ((__vector_size__ (16)));
42typedef long long __v2di __attribute__ ((__vector_size__ (16)));
43typedef int __v4si __attribute__ ((__vector_size__ (16)));
44typedef short __v8hi __attribute__ ((__vector_size__ (16)));
45typedef char __v16qi __attribute__ ((__vector_size__ (16)));
46
47/* The Intel API is flexible enough that we must allow aliasing with other
48   vector types, and their scalar components.  */
49typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
50typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
51
52/* Create a selector for use with the SHUFPD instruction.  */
53#define _MM_SHUFFLE2(fp1,fp0) \
54 (((fp1) << 1) | (fp0))
55
56/* Create a vector with element 0 as F and the rest zero.  */
57static __inline __m128d __attribute__((__always_inline__))
58_mm_set_sd (double __F)
59{
60  return __extension__ (__m128d){ __F, 0 };
61}
62
63/* Create a vector with both elements equal to F.  */
64static __inline __m128d __attribute__((__always_inline__))
65_mm_set1_pd (double __F)
66{
67  return __extension__ (__m128d){ __F, __F };
68}
69
70static __inline __m128d __attribute__((__always_inline__))
71_mm_set_pd1 (double __F)
72{
73  return _mm_set1_pd (__F);
74}
75
76/* Create a vector with the lower value X and upper value W.  */
77static __inline __m128d __attribute__((__always_inline__))
78_mm_set_pd (double __W, double __X)
79{
80  return __extension__ (__m128d){ __X, __W };
81}
82
83/* Create a vector with the lower value W and upper value X.  */
84static __inline __m128d __attribute__((__always_inline__))
85_mm_setr_pd (double __W, double __X)
86{
87  return __extension__ (__m128d){ __W, __X };
88}
89
90/* Create a vector of zeros.  */
91static __inline __m128d __attribute__((__always_inline__))
92_mm_setzero_pd (void)
93{
94  return __extension__ (__m128d){ 0.0, 0.0 };
95}
96
97/* Sets the low DPFP value of A from the low value of B.  */
98static __inline __m128d __attribute__((__always_inline__))
99_mm_move_sd (__m128d __A, __m128d __B)
100{
101  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
102}
103
104/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
105static __inline __m128d __attribute__((__always_inline__))
106_mm_load_pd (double const *__P)
107{
108  return *(__m128d *)__P;
109}
110
111/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
112static __inline __m128d __attribute__((__always_inline__))
113_mm_loadu_pd (double const *__P)
114{
115  return __builtin_ia32_loadupd (__P);
116}
117
118/* Create a vector with all two elements equal to *P.  */
119static __inline __m128d __attribute__((__always_inline__))
120_mm_load1_pd (double const *__P)
121{
122  return _mm_set1_pd (*__P);
123}
124
125/* Create a vector with element 0 as *P and the rest zero.  */
126static __inline __m128d __attribute__((__always_inline__))
127_mm_load_sd (double const *__P)
128{
129  return _mm_set_sd (*__P);
130}
131
132static __inline __m128d __attribute__((__always_inline__))
133_mm_load_pd1 (double const *__P)
134{
135  return _mm_load1_pd (__P);
136}
137
138/* Load two DPFP values in reverse order.  The address must be aligned.  */
139static __inline __m128d __attribute__((__always_inline__))
140_mm_loadr_pd (double const *__P)
141{
142  __m128d __tmp = _mm_load_pd (__P);
143  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
144}
145
146/* Store two DPFP values.  The address must be 16-byte aligned.  */
147static __inline void __attribute__((__always_inline__))
148_mm_store_pd (double *__P, __m128d __A)
149{
150  *(__m128d *)__P = __A;
151}
152
153/* Store two DPFP values.  The address need not be 16-byte aligned.  */
154static __inline void __attribute__((__always_inline__))
155_mm_storeu_pd (double *__P, __m128d __A)
156{
157  __builtin_ia32_storeupd (__P, __A);
158}
159
160/* Stores the lower DPFP value.  */
161static __inline void __attribute__((__always_inline__))
162_mm_store_sd (double *__P, __m128d __A)
163{
164  *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
165}
166
167static __inline double __attribute__((__always_inline__))
168_mm_cvtsd_f64 (__m128d __A)
169{
170  return __builtin_ia32_vec_ext_v2df (__A, 0);
171}
172
173static __inline void __attribute__((__always_inline__))
174_mm_storel_pd (double *__P, __m128d __A)
175{
176  _mm_store_sd (__P, __A);
177}
178
179/* Stores the upper DPFP value.  */
180static __inline void __attribute__((__always_inline__))
181_mm_storeh_pd (double *__P, __m128d __A)
182{
183  *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
184}
185
186/* Store the lower DPFP value across two words.
187   The address must be 16-byte aligned.  */
188static __inline void __attribute__((__always_inline__))
189_mm_store1_pd (double *__P, __m128d __A)
190{
191  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
192}
193
194static __inline void __attribute__((__always_inline__))
195_mm_store_pd1 (double *__P, __m128d __A)
196{
197  _mm_store1_pd (__P, __A);
198}
199
200/* Store two DPFP values in reverse order.  The address must be aligned.  */
201static __inline void __attribute__((__always_inline__))
202_mm_storer_pd (double *__P, __m128d __A)
203{
204  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
205}
206
207static __inline int __attribute__((__always_inline__))
208_mm_cvtsi128_si32 (__m128i __A)
209{
210  return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
211}
212
213#ifdef __x86_64__
214/* Intel intrinsic.  */
215static __inline long long __attribute__((__always_inline__))
216_mm_cvtsi128_si64 (__m128i __A)
217{
218  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
219}
220
221/* Microsoft intrinsic.  */
222static __inline long long __attribute__((__always_inline__))
223_mm_cvtsi128_si64x (__m128i __A)
224{
225  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
226}
227#endif
228
229static __inline __m128d __attribute__((__always_inline__))
230_mm_add_pd (__m128d __A, __m128d __B)
231{
232  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
233}
234
235static __inline __m128d __attribute__((__always_inline__))
236_mm_add_sd (__m128d __A, __m128d __B)
237{
238  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
239}
240
241static __inline __m128d __attribute__((__always_inline__))
242_mm_sub_pd (__m128d __A, __m128d __B)
243{
244  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
245}
246
247static __inline __m128d __attribute__((__always_inline__))
248_mm_sub_sd (__m128d __A, __m128d __B)
249{
250  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
251}
252
253static __inline __m128d __attribute__((__always_inline__))
254_mm_mul_pd (__m128d __A, __m128d __B)
255{
256  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
257}
258
259static __inline __m128d __attribute__((__always_inline__))
260_mm_mul_sd (__m128d __A, __m128d __B)
261{
262  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
263}
264
265static __inline __m128d __attribute__((__always_inline__))
266_mm_div_pd (__m128d __A, __m128d __B)
267{
268  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
269}
270
271static __inline __m128d __attribute__((__always_inline__))
272_mm_div_sd (__m128d __A, __m128d __B)
273{
274  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
275}
276
277static __inline __m128d __attribute__((__always_inline__))
278_mm_sqrt_pd (__m128d __A)
279{
280  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
281}
282
283/* Return pair {sqrt (A[0), B[1]}.  */
284static __inline __m128d __attribute__((__always_inline__))
285_mm_sqrt_sd (__m128d __A, __m128d __B)
286{
287  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
288  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
289}
290
291static __inline __m128d __attribute__((__always_inline__))
292_mm_min_pd (__m128d __A, __m128d __B)
293{
294  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
295}
296
297static __inline __m128d __attribute__((__always_inline__))
298_mm_min_sd (__m128d __A, __m128d __B)
299{
300  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
301}
302
303static __inline __m128d __attribute__((__always_inline__))
304_mm_max_pd (__m128d __A, __m128d __B)
305{
306  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
307}
308
309static __inline __m128d __attribute__((__always_inline__))
310_mm_max_sd (__m128d __A, __m128d __B)
311{
312  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
313}
314
315static __inline __m128d __attribute__((__always_inline__))
316_mm_and_pd (__m128d __A, __m128d __B)
317{
318  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
319}
320
321static __inline __m128d __attribute__((__always_inline__))
322_mm_andnot_pd (__m128d __A, __m128d __B)
323{
324  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
325}
326
327static __inline __m128d __attribute__((__always_inline__))
328_mm_or_pd (__m128d __A, __m128d __B)
329{
330  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
331}
332
333static __inline __m128d __attribute__((__always_inline__))
334_mm_xor_pd (__m128d __A, __m128d __B)
335{
336  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
337}
338
339static __inline __m128d __attribute__((__always_inline__))
340_mm_cmpeq_pd (__m128d __A, __m128d __B)
341{
342  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
343}
344
345static __inline __m128d __attribute__((__always_inline__))
346_mm_cmplt_pd (__m128d __A, __m128d __B)
347{
348  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
349}
350
351static __inline __m128d __attribute__((__always_inline__))
352_mm_cmple_pd (__m128d __A, __m128d __B)
353{
354  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
355}
356
357static __inline __m128d __attribute__((__always_inline__))
358_mm_cmpgt_pd (__m128d __A, __m128d __B)
359{
360  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
361}
362
363static __inline __m128d __attribute__((__always_inline__))
364_mm_cmpge_pd (__m128d __A, __m128d __B)
365{
366  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
367}
368
369static __inline __m128d __attribute__((__always_inline__))
370_mm_cmpneq_pd (__m128d __A, __m128d __B)
371{
372  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
373}
374
375static __inline __m128d __attribute__((__always_inline__))
376_mm_cmpnlt_pd (__m128d __A, __m128d __B)
377{
378  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
379}
380
381static __inline __m128d __attribute__((__always_inline__))
382_mm_cmpnle_pd (__m128d __A, __m128d __B)
383{
384  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
385}
386
387static __inline __m128d __attribute__((__always_inline__))
388_mm_cmpngt_pd (__m128d __A, __m128d __B)
389{
390  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
391}
392
393static __inline __m128d __attribute__((__always_inline__))
394_mm_cmpnge_pd (__m128d __A, __m128d __B)
395{
396  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
397}
398
399static __inline __m128d __attribute__((__always_inline__))
400_mm_cmpord_pd (__m128d __A, __m128d __B)
401{
402  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
403}
404
405static __inline __m128d __attribute__((__always_inline__))
406_mm_cmpunord_pd (__m128d __A, __m128d __B)
407{
408  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
409}
410
411static __inline __m128d __attribute__((__always_inline__))
412_mm_cmpeq_sd (__m128d __A, __m128d __B)
413{
414  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
415}
416
417static __inline __m128d __attribute__((__always_inline__))
418_mm_cmplt_sd (__m128d __A, __m128d __B)
419{
420  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
421}
422
423static __inline __m128d __attribute__((__always_inline__))
424_mm_cmple_sd (__m128d __A, __m128d __B)
425{
426  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
427}
428
429static __inline __m128d __attribute__((__always_inline__))
430_mm_cmpgt_sd (__m128d __A, __m128d __B)
431{
432  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
433					 (__v2df)
434					 __builtin_ia32_cmpltsd ((__v2df) __B,
435								 (__v2df)
436								 __A));
437}
438
439static __inline __m128d __attribute__((__always_inline__))
440_mm_cmpge_sd (__m128d __A, __m128d __B)
441{
442  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
443					 (__v2df)
444					 __builtin_ia32_cmplesd ((__v2df) __B,
445								 (__v2df)
446								 __A));
447}
448
449static __inline __m128d __attribute__((__always_inline__))
450_mm_cmpneq_sd (__m128d __A, __m128d __B)
451{
452  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
453}
454
455static __inline __m128d __attribute__((__always_inline__))
456_mm_cmpnlt_sd (__m128d __A, __m128d __B)
457{
458  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
459}
460
461static __inline __m128d __attribute__((__always_inline__))
462_mm_cmpnle_sd (__m128d __A, __m128d __B)
463{
464  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
465}
466
467static __inline __m128d __attribute__((__always_inline__))
468_mm_cmpngt_sd (__m128d __A, __m128d __B)
469{
470  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
471					 (__v2df)
472					 __builtin_ia32_cmpnltsd ((__v2df) __B,
473								  (__v2df)
474								  __A));
475}
476
477static __inline __m128d __attribute__((__always_inline__))
478_mm_cmpnge_sd (__m128d __A, __m128d __B)
479{
480  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
481					 (__v2df)
482					 __builtin_ia32_cmpnlesd ((__v2df) __B,
483								  (__v2df)
484								  __A));
485}
486
487static __inline __m128d __attribute__((__always_inline__))
488_mm_cmpord_sd (__m128d __A, __m128d __B)
489{
490  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
491}
492
493static __inline __m128d __attribute__((__always_inline__))
494_mm_cmpunord_sd (__m128d __A, __m128d __B)
495{
496  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
497}
498
499static __inline int __attribute__((__always_inline__))
500_mm_comieq_sd (__m128d __A, __m128d __B)
501{
502  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
503}
504
505static __inline int __attribute__((__always_inline__))
506_mm_comilt_sd (__m128d __A, __m128d __B)
507{
508  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
509}
510
511static __inline int __attribute__((__always_inline__))
512_mm_comile_sd (__m128d __A, __m128d __B)
513{
514  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
515}
516
517static __inline int __attribute__((__always_inline__))
518_mm_comigt_sd (__m128d __A, __m128d __B)
519{
520  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
521}
522
523static __inline int __attribute__((__always_inline__))
524_mm_comige_sd (__m128d __A, __m128d __B)
525{
526  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
527}
528
529static __inline int __attribute__((__always_inline__))
530_mm_comineq_sd (__m128d __A, __m128d __B)
531{
532  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
533}
534
535static __inline int __attribute__((__always_inline__))
536_mm_ucomieq_sd (__m128d __A, __m128d __B)
537{
538  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
539}
540
541static __inline int __attribute__((__always_inline__))
542_mm_ucomilt_sd (__m128d __A, __m128d __B)
543{
544  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
545}
546
547static __inline int __attribute__((__always_inline__))
548_mm_ucomile_sd (__m128d __A, __m128d __B)
549{
550  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
551}
552
553static __inline int __attribute__((__always_inline__))
554_mm_ucomigt_sd (__m128d __A, __m128d __B)
555{
556  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
557}
558
559static __inline int __attribute__((__always_inline__))
560_mm_ucomige_sd (__m128d __A, __m128d __B)
561{
562  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
563}
564
565static __inline int __attribute__((__always_inline__))
566_mm_ucomineq_sd (__m128d __A, __m128d __B)
567{
568  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
569}
570
571/* Create a vector of Qi, where i is the element number.  */
572
573static __inline __m128i __attribute__((__always_inline__))
574_mm_set_epi64x (long long __q1, long long __q0)
575{
576  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
577}
578
579static __inline __m128i __attribute__((__always_inline__))
580_mm_set_epi64 (__m64 __q1,  __m64 __q0)
581{
582  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
583}
584
585static __inline __m128i __attribute__((__always_inline__))
586_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
587{
588  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
589}
590
591static __inline __m128i __attribute__((__always_inline__))
592_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
593	       short __q3, short __q2, short __q1, short __q0)
594{
595  return __extension__ (__m128i)(__v8hi){
596    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
597}
598
599static __inline __m128i __attribute__((__always_inline__))
600_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
601	      char __q11, char __q10, char __q09, char __q08,
602	      char __q07, char __q06, char __q05, char __q04,
603	      char __q03, char __q02, char __q01, char __q00)
604{
605  return __extension__ (__m128i)(__v16qi){
606    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
607    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
608  };
609}
610
611/* Set all of the elements of the vector to A.  */
612
613static __inline __m128i __attribute__((__always_inline__))
614_mm_set1_epi64x (long long __A)
615{
616  return _mm_set_epi64x (__A, __A);
617}
618
619static __inline __m128i __attribute__((__always_inline__))
620_mm_set1_epi64 (__m64 __A)
621{
622  return _mm_set_epi64 (__A, __A);
623}
624
625static __inline __m128i __attribute__((__always_inline__))
626_mm_set1_epi32 (int __A)
627{
628  return _mm_set_epi32 (__A, __A, __A, __A);
629}
630
631static __inline __m128i __attribute__((__always_inline__))
632_mm_set1_epi16 (short __A)
633{
634  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
635}
636
637static __inline __m128i __attribute__((__always_inline__))
638_mm_set1_epi8 (char __A)
639{
640  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
641		       __A, __A, __A, __A, __A, __A, __A, __A);
642}
643
644/* Create a vector of Qi, where i is the element number.
645   The parameter order is reversed from the _mm_set_epi* functions.  */
646
647static __inline __m128i __attribute__((__always_inline__))
648_mm_setr_epi64 (__m64 __q0, __m64 __q1)
649{
650  return _mm_set_epi64 (__q1, __q0);
651}
652
653static __inline __m128i __attribute__((__always_inline__))
654_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
655{
656  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
657}
658
659static __inline __m128i __attribute__((__always_inline__))
660_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
661	        short __q4, short __q5, short __q6, short __q7)
662{
663  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
664}
665
666static __inline __m128i __attribute__((__always_inline__))
667_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
668	       char __q04, char __q05, char __q06, char __q07,
669	       char __q08, char __q09, char __q10, char __q11,
670	       char __q12, char __q13, char __q14, char __q15)
671{
672  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
673		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
674}
675
676/* Create a vector with element 0 as *P and the rest zero.  */
677
678static __inline __m128i __attribute__((__always_inline__))
679_mm_load_si128 (__m128i const *__P)
680{
681  return *__P;
682}
683
684static __inline __m128i __attribute__((__always_inline__))
685_mm_loadu_si128 (__m128i const *__P)
686{
687  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
688}
689
690static __inline __m128i __attribute__((__always_inline__))
691_mm_loadl_epi64 (__m128i const *__P)
692{
693  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
694}
695
696static __inline void __attribute__((__always_inline__))
697_mm_store_si128 (__m128i *__P, __m128i __B)
698{
699  *__P = __B;
700}
701
702static __inline void __attribute__((__always_inline__))
703_mm_storeu_si128 (__m128i *__P, __m128i __B)
704{
705  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
706}
707
708static __inline void __attribute__((__always_inline__))
709_mm_storel_epi64 (__m128i *__P, __m128i __B)
710{
711  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
712}
713
714static __inline __m64 __attribute__((__always_inline__))
715_mm_movepi64_pi64 (__m128i __B)
716{
717  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
718}
719
720static __inline __m128i __attribute__((__always_inline__))
721_mm_movpi64_epi64 (__m64 __A)
722{
723  return _mm_set_epi64 ((__m64)0LL, __A);
724}
725
726static __inline __m128i __attribute__((__always_inline__))
727_mm_move_epi64 (__m128i __A)
728{
729  return _mm_set_epi64 ((__m64)0LL, _mm_movepi64_pi64 (__A));
730}
731
732/* Create a vector of zeros.  */
733static __inline __m128i __attribute__((__always_inline__))
734_mm_setzero_si128 (void)
735{
736  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
737}
738
739static __inline __m128d __attribute__((__always_inline__))
740_mm_cvtepi32_pd (__m128i __A)
741{
742  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
743}
744
745static __inline __m128 __attribute__((__always_inline__))
746_mm_cvtepi32_ps (__m128i __A)
747{
748  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
749}
750
751static __inline __m128i __attribute__((__always_inline__))
752_mm_cvtpd_epi32 (__m128d __A)
753{
754  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
755}
756
757static __inline __m64 __attribute__((__always_inline__))
758_mm_cvtpd_pi32 (__m128d __A)
759{
760  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
761}
762
763static __inline __m128 __attribute__((__always_inline__))
764_mm_cvtpd_ps (__m128d __A)
765{
766  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
767}
768
769static __inline __m128i __attribute__((__always_inline__))
770_mm_cvttpd_epi32 (__m128d __A)
771{
772  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
773}
774
775static __inline __m64 __attribute__((__always_inline__))
776_mm_cvttpd_pi32 (__m128d __A)
777{
778  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
779}
780
781static __inline __m128d __attribute__((__always_inline__))
782_mm_cvtpi32_pd (__m64 __A)
783{
784  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
785}
786
787static __inline __m128i __attribute__((__always_inline__))
788_mm_cvtps_epi32 (__m128 __A)
789{
790  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
791}
792
793static __inline __m128i __attribute__((__always_inline__))
794_mm_cvttps_epi32 (__m128 __A)
795{
796  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
797}
798
799static __inline __m128d __attribute__((__always_inline__))
800_mm_cvtps_pd (__m128 __A)
801{
802  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
803}
804
805static __inline int __attribute__((__always_inline__))
806_mm_cvtsd_si32 (__m128d __A)
807{
808  return __builtin_ia32_cvtsd2si ((__v2df) __A);
809}
810
811#ifdef __x86_64__
812/* Intel intrinsic.  */
813static __inline long long __attribute__((__always_inline__))
814_mm_cvtsd_si64 (__m128d __A)
815{
816  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
817}
818
819/* Microsoft intrinsic.  */
820static __inline long long __attribute__((__always_inline__))
821_mm_cvtsd_si64x (__m128d __A)
822{
823  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
824}
825#endif
826
827static __inline int __attribute__((__always_inline__))
828_mm_cvttsd_si32 (__m128d __A)
829{
830  return __builtin_ia32_cvttsd2si ((__v2df) __A);
831}
832
833#ifdef __x86_64__
834/* Intel intrinsic.  */
835static __inline long long __attribute__((__always_inline__))
836_mm_cvttsd_si64 (__m128d __A)
837{
838  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
839}
840
841/* Microsoft intrinsic.  */
842static __inline long long __attribute__((__always_inline__))
843_mm_cvttsd_si64x (__m128d __A)
844{
845  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
846}
847#endif
848
849static __inline __m128 __attribute__((__always_inline__))
850_mm_cvtsd_ss (__m128 __A, __m128d __B)
851{
852  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
853}
854
855static __inline __m128d __attribute__((__always_inline__))
856_mm_cvtsi32_sd (__m128d __A, int __B)
857{
858  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
859}
860
861#ifdef __x86_64__
862/* Intel intrinsic.  */
863static __inline __m128d __attribute__((__always_inline__))
864_mm_cvtsi64_sd (__m128d __A, long long __B)
865{
866  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
867}
868
869/* Microsoft intrinsic.  */
870static __inline __m128d __attribute__((__always_inline__))
871_mm_cvtsi64x_sd (__m128d __A, long long __B)
872{
873  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
874}
875#endif
876
877static __inline __m128d __attribute__((__always_inline__))
878_mm_cvtss_sd (__m128d __A, __m128 __B)
879{
880  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
881}
882
883#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, (__C)))
884
885static __inline __m128d __attribute__((__always_inline__))
886_mm_unpackhi_pd (__m128d __A, __m128d __B)
887{
888  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
889}
890
891static __inline __m128d __attribute__((__always_inline__))
892_mm_unpacklo_pd (__m128d __A, __m128d __B)
893{
894  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
895}
896
897static __inline __m128d __attribute__((__always_inline__))
898_mm_loadh_pd (__m128d __A, double const *__B)
899{
900  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
901}
902
903static __inline __m128d __attribute__((__always_inline__))
904_mm_loadl_pd (__m128d __A, double const *__B)
905{
906  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
907}
908
909static __inline int __attribute__((__always_inline__))
910_mm_movemask_pd (__m128d __A)
911{
912  return __builtin_ia32_movmskpd ((__v2df)__A);
913}
914
915static __inline __m128i __attribute__((__always_inline__))
916_mm_packs_epi16 (__m128i __A, __m128i __B)
917{
918  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
919}
920
921static __inline __m128i __attribute__((__always_inline__))
922_mm_packs_epi32 (__m128i __A, __m128i __B)
923{
924  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
925}
926
927static __inline __m128i __attribute__((__always_inline__))
928_mm_packus_epi16 (__m128i __A, __m128i __B)
929{
930  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
931}
932
933static __inline __m128i __attribute__((__always_inline__))
934_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
935{
936  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
937}
938
939static __inline __m128i __attribute__((__always_inline__))
940_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
941{
942  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
943}
944
945static __inline __m128i __attribute__((__always_inline__))
946_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
947{
948  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
949}
950
951static __inline __m128i __attribute__((__always_inline__))
952_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
953{
954  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
955}
956
957static __inline __m128i __attribute__((__always_inline__))
958_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
959{
960  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
961}
962
963static __inline __m128i __attribute__((__always_inline__))
964_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
965{
966  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
967}
968
969static __inline __m128i __attribute__((__always_inline__))
970_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
971{
972  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
973}
974
975static __inline __m128i __attribute__((__always_inline__))
976_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
977{
978  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
979}
980
/* Element-wise addition with wraparound (modular) semantics, on
   8-, 16-, 32- and 64-bit lanes respectively.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
}

/* Saturating signed addition: each result is clamped to the signed
   range of the element type instead of wrapping.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Saturating unsigned addition: results clamp at the unsigned
   maximum of the element type.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
}
1028
/* Element-wise subtraction (__A - __B) with wraparound semantics, on
   8-, 16-, 32- and 64-bit lanes respectively.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
}

/* Saturating signed subtraction: results clamp to the signed range
   of the element type.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Saturating unsigned subtraction: results clamp at zero.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
}
1076
/* Multiply signed 16-bit elements producing 32-bit intermediates,
   then horizontally add adjacent pairs, yielding four 32-bit sums
   (PMADDWD).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
}

/* High 16 bits of the signed 16x16->32 products.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Low 16 bits of the 16x16 products (identical for signed and
   unsigned operands).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
}

/* MMX variant: unsigned 32x32->64 multiply of the low doublewords of
   the two 64-bit operands.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
}

/* Unsigned 32x32->64 multiply of the even-indexed (0 and 2)
   doublewords of __A and __B, giving two 64-bit products.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
}
1106
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
}
#else
/* Shift each 16/32/64-bit element of __A left by the immediate count
   __B, shifting in zeros.  Defined as macros so __B reaches the
   builtin as a constant expression.  Both macro parameters are fully
   parenthesized in the expansion to avoid precedence surprises when
   the caller passes a compound expression.  */
#define _mm_slli_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psllwi128 ((__v8hi)(__A), (__B)))
#define _mm_slli_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_pslldi128 ((__v4si)(__A), (__B)))
#define _mm_slli_epi64(__A, __B) \
  ((__m128i)__builtin_ia32_psllqi128 ((__v2di)(__A), (__B)))
#endif
1133
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
}
#else
/* Arithmetic (sign-extending) right shift of each 16/32-bit element
   of __A by the immediate count __B.  Macro parameters are fully
   parenthesized in the expansion for macro hygiene.  */
#define _mm_srai_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psrawi128 ((__v8hi)(__A), (__B)))
#define _mm_srai_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_psradi128 ((__v4si)(__A), (__B)))
#endif
1152
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_srli_si128 (__m128i __A, int __B)
{
  return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B * 8));
}

/* This left-shift variant was erroneously also named
   _mm_srli_si128, which would have been a duplicate definition had
   this branch ever been enabled.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_slli_si128 (__m128i __A, int __B)
{
  return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B * 8));
}
#else
/* Shift the whole 128-bit value of __A right/left by __B bytes,
   filling with zeros.  The builtins take a bit count, hence * 8.
   Arguments are parenthesized for macro hygiene.  */
#define _mm_srli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__A), (__B) * 8))
#define _mm_slli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__A), (__B) * 8))
#endif
1171
#if 0
static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
}
#else
/* Logical (zero-filling) right shift of each 16/32/64-bit element of
   __A by the immediate count __B.  Macro parameters are fully
   parenthesized for hygiene.  Note _mm_srli_epi64 must cast to
   __v2di (64-bit lanes, matching the inline version above); the
   previous (__v4si) cast handed the wrong vector type to
   __builtin_ia32_psrlqi128.  */
#define _mm_srli_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_psrlwi128 ((__v8hi)(__A), (__B)))
#define _mm_srli_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_psrldi128 ((__v4si)(__A), (__B)))
#define _mm_srli_epi64(__A, __B) \
  ((__m128i)__builtin_ia32_psrlqi128 ((__v2di)(__A), (__B)))
#endif
1198
/* Shifts where the count is supplied in a vector operand rather than
   as an immediate.  Left/logical-right shifts shift in zeros; the
   arithmetic right shifts (sra) shift in copies of the sign bit.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
}
1246
/* Bitwise AND of the full 128 bits.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise AND-NOT: (~__A) & __B.  Note the first operand is the one
   complemented.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise OR of the full 128 bits.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
}

/* Bitwise XOR of the full 128 bits.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
}
1270
/* Element-wise comparisons.  Each result element is all-ones when
   the predicate holds and all-zeros otherwise.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
}

/* Signed less-than: there is no pcmplt instruction, so these use
   pcmpgt with the operands swapped (A < B  <=>  B > A).  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
}

/* Signed greater-than.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
}
1324
/* Extract the 16-bit word at index __N of __A (zero-extended to
   int), and insert __D at index __N.  __N must be a compile-time
   constant, hence the macro forms in the live branch.  */
#if 0
static __inline int __attribute__((__always_inline__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
}
#else
#define _mm_extract_epi16(A, N) \
  ((int) __builtin_ia32_vec_ext_v8hi ((__v8hi)(A), (N)))
#define _mm_insert_epi16(A, D, N) \
  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(A), (D), (N)))
#endif
1343
/* Element-wise maximum/minimum.  SSE2 only provides the signed
   16-bit and unsigned 8-bit combinations.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
}
1367
/* Gather the most significant bit of each of the 16 bytes of __A
   into the low 16 bits of the result (PMOVMSKB).  */
static __inline int __attribute__((__always_inline__))
_mm_movemask_epi8 (__m128i __A)
{
  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
}

/* High 16 bits of the unsigned 16x16->32 products.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
}
1379
/* Permute 16-bit words within the high (shufflehi) or low
   (shufflelo) half, or 32-bit doublewords across the whole register
   (shuffle_epi32), according to the immediate control __B (typically
   built with _MM_SHUFFLE).  Macro arguments are parenthesized: the
   previous expansion cast the bare token __A, so a compound argument
   such as a conditional expression would bind incorrectly to the
   (__v8hi)/(__v4si) cast.  */
#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__A), (__B)))
#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__A), (__B)))
#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)(__A), (__B)))
1383
/* Conditionally store the bytes of __A to the (possibly unaligned)
   address __C: a byte is written only where the corresponding byte
   of the mask __B has its most significant bit set (MASKMOVDQU).  */
static __inline void __attribute__((__always_inline__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
}

/* Rounded unsigned averages: (a + b + 1) >> 1 per element.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
}

static __inline __m128i __attribute__((__always_inline__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Sum of absolute differences of the unsigned bytes (PSADBW): each
   64-bit half of the result holds one 16-bit sum in its low word.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
}
1407
/* Non-temporal (cache-bypassing) store of a 32-bit value (MOVNTI).  */
static __inline void __attribute__((__always_inline__))
_mm_stream_si32 (int *__A, int __B)
{
  __builtin_ia32_movnti (__A, __B);
}

/* Non-temporal 128-bit integer store (MOVNTDQ).  NOTE(review): the
   underlying instruction requires __A to be 16-byte aligned —
   callers must guarantee this.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
}

/* Non-temporal store of two doubles (MOVNTPD); same alignment
   caveat as above.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_pd (double *__A, __m128d __B)
{
  __builtin_ia32_movntpd (__A, (__v2df)__B);
}

/* Flush the cache line containing __A from every cache level.  */
static __inline void __attribute__((__always_inline__))
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}

/* Load fence: orders preceding loads before subsequent loads.  */
static __inline void __attribute__((__always_inline__))
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}

/* Full memory fence: orders all preceding loads and stores before
   all subsequent ones.  */
static __inline void __attribute__((__always_inline__))
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}
1443
/* Move a 32-bit integer into the low doubleword of a vector, zeroing
   the upper 96 bits.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

#ifdef __x86_64__
/* Move a 64-bit integer into the low quadword, zeroing the upper
   64 bits.  Intel intrinsic.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi64_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}

/* Same operation under the Microsoft name.  */
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi64x_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}
#endif
1465
/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type: the 128-bit
   pattern is reinterpreted, so they generate no instructions.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

static __inline __m128i __attribute__((__always_inline__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

static __inline __m128d __attribute__((__always_inline__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

static __inline __m128i __attribute__((__always_inline__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

static __inline __m128 __attribute__((__always_inline__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

static __inline __m128d __attribute__((__always_inline__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}
1503
1504#endif /* __SSE2__  */
1505
1506#endif /* _EMMINTRIN_H_INCLUDED */
1507