1/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11   User Guide and Reference, version 9.0.  */
12
13#ifndef NO_WARN_X86_INTRINSICS
14/* This header file is intended to help port code that explicitly uses
15   Intel intrinsics from x86_64 to powerpc64/powerpc64le.
16
17   Since X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
18   VMX/VSX ISA is a good match for vector float SIMD operations.
19   However, scalar float operations in vector (XMM) registers require
20   the POWER8 VSX ISA (2.07) level. There are differences in the data
21   format and placement of float scalars in the vector register, which
22   require extra steps to match SSE scalar float semantics on POWER.
23
24   It should be noted that X86_64's MXCSR and PowerISA's FPSCR/VSCR
25   registers differ significantly. It's recommended to use the portable
26   <fenv.h> interfaces instead of accessing the MXCSR directly.
27
28   Most SSE scalar float intrinsic operations can be performed more
29   efficiently as C language float scalar operations or optimized to
30   use vector SIMD operations. We recommend this for new applications. */
31#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
32#endif
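/* For example (illustrative sketch only, not part of this header): instead
   of manipulating the MXCSR via _mm_getcsr/_mm_setcsr, portable code can use
   the standard C99 <fenv.h> interfaces, which map onto the FPSCR on PowerPC:

     #include <fenv.h>
     #pragma STDC FENV_ACCESS ON

     int saved_mode = fegetround ();     // save the current rounding mode
     fesetround (FE_TOWARDZERO);         // e.g. switch to round-toward-zero
     // ... float computation ...
     fesetround (saved_mode);            // restore the previous mode
     feclearexcept (FE_ALL_EXCEPT);      // discard any raised FP exceptions
*/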
33
34#ifndef _XMMINTRIN_H_INCLUDED
35#define _XMMINTRIN_H_INCLUDED
36
37#if defined(__linux__) && defined(__ppc64__)
38
39/* Define four value permute mask */
40#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
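/* For example, _MM_SHUFFLE (3, 2, 1, 0) expands to
   (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4; used with _mm_shuffle_ps
   below, that selector copies elements 0 and 1 from the first operand and
   elements 2 and 3 from the second.  */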
41
42#include <altivec.h>
43
44/* Avoid collisions between altivec.h and strict adherence to C++ and
45   C11 standards.  This should eventually be done inside altivec.h itself,
46   but only after testing a full distro build.  */
47#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
48				 (defined(__STDC_VERSION__) &&	\
49				  __STDC_VERSION__ >= 201112L))
50#undef vector
51#undef pixel
52#undef bool
53#endif
54
55/* We need type definitions from the MMX header file.  */
56#include <mmintrin.h>
57
58/* Get _mm_malloc () and _mm_free ().  */
59#if __STDC_HOSTED__
60#include <mm_malloc.h>
61#endif
62
63/* The Intel API is flexible enough that we must allow aliasing with other
64   vector types, and their scalar components.  */
65typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
66
67/* Unaligned version of the same type.  */
68typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
69				       __aligned__ (1)));
70
71/* Internal data types for implementing the intrinsics.  */
72typedef float __v4sf __attribute__ ((__vector_size__ (16)));
73
74/* Create an undefined vector.  */
75extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76_mm_undefined_ps (void)
77{
78  __m128 __Y = __Y;
79  return __Y;
80}
81
82/* Create a vector of zeros.  */
83extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
84_mm_setzero_ps (void)
85{
86  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
87}
88
89/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
90extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
91_mm_load_ps (float const *__P)
92{
93  return ((__m128)vec_ld(0, (__v4sf*)__P));
94}
95
96/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
97extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98_mm_loadu_ps (float const *__P)
99{
100  return (vec_vsx_ld(0, __P));
101}
102
103/* Load four SPFP values in reverse order.  The address must be aligned.  */
104extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
105_mm_loadr_ps (float const *__P)
106{
107  __v4sf   __tmp;
108  __m128 result;
109  static const __vector unsigned char permute_vector =
110    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
111	0x17, 0x10, 0x11, 0x12, 0x13 };
112
113  __tmp = vec_ld (0, (__v4sf *) __P);
114  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
115  return result;
116}
117
118/* Create a vector with all four elements equal to F.  */
119extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120_mm_set1_ps (float __F)
121{
122  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
123}
124
125extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126_mm_set_ps1 (float __F)
127{
128  return _mm_set1_ps (__F);
129}
130
131/* Create the vector [Z Y X W].  */
132extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
134{
135  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
136}
137
138/* Create the vector [W X Y Z].  */
139extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140_mm_setr_ps (float __Z, float __Y, float __X, float __W)
141{
142  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
143}
144
145/* Store four SPFP values.  The address must be 16-byte aligned.  */
146extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147_mm_store_ps (float *__P, __m128 __A)
148{
149  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
150}
151
152/* Store four SPFP values.  The address need not be 16-byte aligned.  */
153extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
154_mm_storeu_ps (float *__P, __m128 __A)
155{
156  *(__m128_u *)__P = __A;
157}
158
159/* Store four SPFP values in reverse order.  The address must be aligned.  */
160extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161_mm_storer_ps (float *__P, __m128 __A)
162{
163  __v4sf   __tmp;
164  static const __vector unsigned char permute_vector =
165    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
166	0x17, 0x10, 0x11, 0x12, 0x13 };
167
168  __tmp = (__m128) vec_perm (__A, __A, permute_vector);
169
170  _mm_store_ps (__P, __tmp);
171}
172
173/* Store the lower SPFP value across four words.  */
174extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175_mm_store1_ps (float *__P, __m128 __A)
176{
177  __v4sf __va = vec_splat((__v4sf)__A, 0);
178  _mm_store_ps (__P, __va);
179}
180
181extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182_mm_store_ps1 (float *__P, __m128 __A)
183{
184  _mm_store1_ps (__P, __A);
185}
186
187/* Create a vector with element 0 as F and the rest zero.  */
188extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
189_mm_set_ss (float __F)
190{
191  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
192}
193
194/* Sets the low SPFP value of A from the low value of B.  */
195extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196_mm_move_ss (__m128 __A, __m128 __B)
197{
198  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
199
200  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
201}
202
203/* Create a vector with element 0 as *P and the rest zero.  */
204extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
205_mm_load_ss (float const *__P)
206{
207  return _mm_set_ss (*__P);
208}
209
210/* Stores the lower SPFP value.  */
211extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212_mm_store_ss (float *__P, __m128 __A)
213{
214  *__P = ((__v4sf)__A)[0];
215}
216
217/* Perform the respective operation on the lower SPFP (single-precision
218   floating-point) values of A and B; the upper three SPFP values are
219   passed through from A.  */
220
221extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222_mm_add_ss (__m128 __A, __m128 __B)
223{
224#ifdef _ARCH_PWR7
225  __m128 a, b, c;
226  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
227  /* PowerISA VSX does not allow partial (for just lower float)
228     results. So to ensure we don't generate spurious exceptions
229     (from the upper float values) we splat the lower float
230     before we do the operation.  */
231  a = vec_splat (__A, 0);
232  b = vec_splat (__B, 0);
233  c = a + b;
234  /* Then we merge the lower float result with the original upper
235     float elements from __A.  */
236  return (vec_sel (__A, c, mask));
237#else
238  __A[0] = __A[0] + __B[0];
239  return (__A);
240#endif
241}
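/* For example: with __A = {1.0, 2.0, 3.0, 4.0} and __B = {10.0, 20.0, 30.0,
   40.0}, _mm_add_ss returns {11.0, 2.0, 3.0, 4.0}; only element 0 is
   computed, elements 1-3 are passed through from __A.  */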
242
243extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
244_mm_sub_ss (__m128 __A, __m128 __B)
245{
246#ifdef _ARCH_PWR7
247  __m128 a, b, c;
248  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
249  /* PowerISA VSX does not allow partial (for just lower float)
250     results. So to ensure we don't generate spurious exceptions
251     (from the upper float values) we splat the lower float
252     before we do the operation.  */
253  a = vec_splat (__A, 0);
254  b = vec_splat (__B, 0);
255  c = a - b;
256  /* Then we merge the lower float result with the original upper
257     float elements from __A.  */
258  return (vec_sel (__A, c, mask));
259#else
260  __A[0] = __A[0] - __B[0];
261  return (__A);
262#endif
263}
264
265extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
266_mm_mul_ss (__m128 __A, __m128 __B)
267{
268#ifdef _ARCH_PWR7
269  __m128 a, b, c;
270  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
271  /* PowerISA VSX does not allow partial (for just lower float)
272     results. So to ensure we don't generate spurious exceptions
273     (from the upper float values) we splat the lower float
274     before we do the operation.  */
275  a = vec_splat (__A, 0);
276  b = vec_splat (__B, 0);
277  c = a * b;
278  /* Then we merge the lower float result with the original upper
279     float elements from __A.  */
280  return (vec_sel (__A, c, mask));
281#else
282  __A[0] = __A[0] * __B[0];
283  return (__A);
284#endif
285}
286
287extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288_mm_div_ss (__m128 __A, __m128 __B)
289{
290#ifdef _ARCH_PWR7
291  __m128 a, b, c;
292  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
293  /* PowerISA VSX does not allow partial (for just lower float)
294     results. So to ensure we don't generate spurious exceptions
295     (from the upper float values) we splat the lower float
296     before we do the operation.  */
297  a = vec_splat (__A, 0);
298  b = vec_splat (__B, 0);
299  c = a / b;
300  /* Then we merge the lower float result with the original upper
301     float elements from __A.  */
302  return (vec_sel (__A, c, mask));
303#else
304  __A[0] = __A[0] / __B[0];
305  return (__A);
306#endif
307}
308
309extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
310_mm_sqrt_ss (__m128 __A)
311{
312  __m128 a, c;
313  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
314  /* PowerISA VSX does not allow partial (for just lower float)
315   * results. So to ensure we don't generate spurious exceptions
316   * (from the upper float values) we splat the lower float
317   * before we do the operation. */
318  a = vec_splat (__A, 0);
319  c = vec_sqrt (a);
320  /* Then we merge the lower float result with the original upper
321   * float elements from __A.  */
322  return (vec_sel (__A, c, mask));
323}
324
325/* Perform the respective operation on the four SPFP values in A and B.  */
326extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327_mm_add_ps (__m128 __A, __m128 __B)
328{
329  return (__m128) ((__v4sf)__A + (__v4sf)__B);
330}
331
332extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333_mm_sub_ps (__m128 __A, __m128 __B)
334{
335  return (__m128) ((__v4sf)__A - (__v4sf)__B);
336}
337
338extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
339_mm_mul_ps (__m128 __A, __m128 __B)
340{
341  return (__m128) ((__v4sf)__A * (__v4sf)__B);
342}
343
344extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
345_mm_div_ps (__m128 __A, __m128 __B)
346{
347  return (__m128) ((__v4sf)__A / (__v4sf)__B);
348}
349
350extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351_mm_sqrt_ps (__m128 __A)
352{
353  return (vec_sqrt ((__v4sf)__A));
354}
355
356extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357_mm_rcp_ps (__m128 __A)
358{
359  return (vec_re ((__v4sf)__A));
360}
361
362extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363_mm_rsqrt_ps (__m128 __A)
364{
365  return (vec_rsqrte (__A));
366}
367
368extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369_mm_rcp_ss (__m128 __A)
370{
371  __m128 a, c;
372  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
373  /* PowerISA VSX does not allow partial (for just lower float)
374   * results. So to ensure we don't generate spurious exceptions
375   * (from the upper float values) we splat the lower float
376   * before we do the operation. */
377  a = vec_splat (__A, 0);
378  c = _mm_rcp_ps (a);
379  /* Then we merge the lower float result with the original upper
380   * float elements from __A.  */
381  return (vec_sel (__A, c, mask));
382}
383
384extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385_mm_rsqrt_ss (__m128 __A)
386{
387  __m128 a, c;
388  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
389  /* PowerISA VSX does not allow partial (for just lower float)
390   * results. So to ensure we don't generate spurious exceptions
391   * (from the upper float values) we splat the lower float
392   * before we do the operation. */
393  a = vec_splat (__A, 0);
394  c = vec_rsqrte (a);
395  /* Then we merge the lower float result with the original upper
396   * float elements from __A.  */
397  return (vec_sel (__A, c, mask));
398}
399
400extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
401_mm_min_ss (__m128 __A, __m128 __B)
402{
403  __v4sf a, b, c;
404  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
405  /* PowerISA VSX does not allow partial (for just lower float)
406   * results. So to ensure we don't generate spurious exceptions
407   * (from the upper float values) we splat the lower float
408   * before we do the operation. */
409  a = vec_splat ((__v4sf)__A, 0);
410  b = vec_splat ((__v4sf)__B, 0);
411  c = vec_min (a, b);
412  /* Then we merge the lower float result with the original upper
413   * float elements from __A.  */
414  return (vec_sel ((__v4sf)__A, c, mask));
415}
416
417extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
418_mm_max_ss (__m128 __A, __m128 __B)
419{
420  __v4sf a, b, c;
421  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
422  /* PowerISA VSX does not allow partial (for just lower float)
423   * results. So to ensure we don't generate spurious exceptions
424   * (from the upper float values) we splat the lower float
425   * before we do the operation. */
426  a = vec_splat (__A, 0);
427  b = vec_splat (__B, 0);
428  c = vec_max (a, b);
429  /* Then we merge the lower float result with the original upper
430   * float elements from __A.  */
431  return (vec_sel ((__v4sf)__A, c, mask));
432}
433
434extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
435_mm_min_ps (__m128 __A, __m128 __B)
436{
437  __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
438  return vec_sel (__B, __A, m);
439}
440
441extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
442_mm_max_ps (__m128 __A, __m128 __B)
443{
444  __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
445  return vec_sel (__B, __A, m);
446}
447
448/* Perform logical bit-wise operations on 128-bit values.  */
449extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
450_mm_and_ps (__m128 __A, __m128 __B)
451{
452  return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
453//  return __builtin_ia32_andps (__A, __B);
454}
455
456extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
457_mm_andnot_ps (__m128 __A, __m128 __B)
458{
459  return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
460}
461
462extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
463_mm_or_ps (__m128 __A, __m128 __B)
464{
465  return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
466}
467
468extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
469_mm_xor_ps (__m128 __A, __m128 __B)
470{
471  return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
472}
473
474/* Perform a comparison on the four SPFP values of A and B.  For each
475   element, if the comparison is true, place a mask of all ones in the
476   result, otherwise a mask of zeros.  */
477extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
478_mm_cmpeq_ps (__m128 __A, __m128 __B)
479{
480  return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
481}
482
483extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
484_mm_cmplt_ps (__m128 __A, __m128 __B)
485{
486  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
487}
488
489extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
490_mm_cmple_ps (__m128 __A, __m128 __B)
491{
492  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
493}
494
495extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
496_mm_cmpgt_ps (__m128 __A, __m128 __B)
497{
498  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
499}
500
501extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
502_mm_cmpge_ps (__m128 __A, __m128 __B)
503{
504  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
505}
506
507extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
508_mm_cmpneq_ps (__m128  __A, __m128  __B)
509{
510  __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
511  return ((__m128)vec_nor (temp, temp));
512}
513
514extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
515_mm_cmpnlt_ps (__m128 __A, __m128 __B)
516{
517  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
518}
519
520extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
521_mm_cmpnle_ps (__m128 __A, __m128 __B)
522{
523  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
524}
525
526extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527_mm_cmpngt_ps (__m128 __A, __m128 __B)
528{
529  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
530}
531
532extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
533_mm_cmpnge_ps (__m128 __A, __m128 __B)
534{
535  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
536}
537
538extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
539_mm_cmpord_ps (__m128  __A, __m128  __B)
540{
541  __vector unsigned int a, b;
542  __vector unsigned int c, d;
543  static const __vector unsigned int float_exp_mask =
544    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
545
546  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
547  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
548  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
549  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
550  return ((__m128 ) vec_and (c, d));
551}
552
553extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
554_mm_cmpunord_ps (__m128 __A, __m128 __B)
555{
556  __vector unsigned int a, b;
557  __vector unsigned int c, d;
558  static const __vector unsigned int float_exp_mask =
559    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
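  /* A float is a NaN exactly when its absolute value, viewed as an unsigned
     integer, is greater than 0x7f800000 (the encoding of +Inf): all exponent
     bits set and a nonzero fraction.  Lanes where either input is a NaN
     therefore end up all-ones in the result.  */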
560
561  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
562  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
563  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
564  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
565  return ((__m128 ) vec_or (c, d));
566}
567
568/* Perform a comparison on the lower SPFP values of A and B.  If the
569   comparison is true, place a mask of all ones in the result, otherwise a
570   mask of zeros.  The upper three SPFP values are passed through from A.  */
571extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
572_mm_cmpeq_ss (__m128  __A, __m128  __B)
573{
574  static const __vector unsigned int mask =
575    { 0xffffffff, 0, 0, 0 };
576  __v4sf a, b, c;
577  /* PowerISA VMX does not allow partial (for just element 0)
578   * results. So to ensure we don't generate spurious exceptions
579   * (from the upper elements) we splat the lower float
580   * before we do the operation. */
581  a = vec_splat ((__v4sf) __A, 0);
582  b = vec_splat ((__v4sf) __B, 0);
583  c = (__v4sf) vec_cmpeq(a, b);
584  /* Then we merge the lower float result with the original upper
585   * float elements from __A.  */
586  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
587}
588
589extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
590_mm_cmplt_ss (__m128 __A, __m128 __B)
591{
592  static const __vector unsigned int mask =
593    { 0xffffffff, 0, 0, 0 };
594  __v4sf a, b, c;
595  /* PowerISA VMX does not allow partial (for just element 0)
596   * results. So to ensure we don't generate spurious exceptions
597   * (from the upper elements) we splat the lower float
598   * before we do the operation. */
599  a = vec_splat ((__v4sf) __A, 0);
600  b = vec_splat ((__v4sf) __B, 0);
601  c = (__v4sf) vec_cmplt(a, b);
602  /* Then we merge the lower float result with the original upper
603   * float elements from __A.  */
604  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
605}
606
607extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
608_mm_cmple_ss (__m128 __A, __m128 __B)
609{
610  static const __vector unsigned int mask =
611    { 0xffffffff, 0, 0, 0 };
612  __v4sf a, b, c;
613  /* PowerISA VMX does not allow partial (for just element 0)
614   * results. So to ensure we don't generate spurious exceptions
615   * (from the upper elements) we splat the lower float
616   * before we do the operation. */
617  a = vec_splat ((__v4sf) __A, 0);
618  b = vec_splat ((__v4sf) __B, 0);
619  c = (__v4sf) vec_cmple(a, b);
620  /* Then we merge the lower float result with the original upper
621   * float elements from __A.  */
622  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
623}
624
625extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
626_mm_cmpgt_ss (__m128 __A, __m128 __B)
627{
628  static const __vector unsigned int mask =
629    { 0xffffffff, 0, 0, 0 };
630  __v4sf a, b, c;
631  /* PowerISA VMX does not allow partial (for just element 0)
632   * results. So to ensure we don't generate spurious exceptions
633   * (from the upper elements) we splat the lower float
634   * before we do the operation. */
635  a = vec_splat ((__v4sf) __A, 0);
636  b = vec_splat ((__v4sf) __B, 0);
637  c = (__v4sf) vec_cmpgt(a, b);
638  /* Then we merge the lower float result with the original upper
639   * float elements from __A.  */
640  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
641}
642
643extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
644_mm_cmpge_ss (__m128 __A, __m128 __B)
645{
646  static const __vector unsigned int mask =
647    { 0xffffffff, 0, 0, 0 };
648  __v4sf a, b, c;
649  /* PowerISA VMX does not allow partial (for just element 0)
650   * results. So to ensure we don't generate spurious exceptions
651   * (from the upper elements) we splat the lower float
652   * before we do the operation. */
653  a = vec_splat ((__v4sf) __A, 0);
654  b = vec_splat ((__v4sf) __B, 0);
655  c = (__v4sf) vec_cmpge(a, b);
656  /* Then we merge the lower float result with the original upper
657   * float elements from __A.  */
658  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
659}
660
661extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662_mm_cmpneq_ss (__m128 __A, __m128 __B)
663{
664  static const __vector unsigned int mask =
665    { 0xffffffff, 0, 0, 0 };
666  __v4sf a, b, c;
667  /* PowerISA VMX does not allow partial (for just element 0)
668   * results. So to ensure we don't generate spurious exceptions
669   * (from the upper elements) we splat the lower float
670   * before we do the operation. */
671  a = vec_splat ((__v4sf) __A, 0);
672  b = vec_splat ((__v4sf) __B, 0);
673  c = (__v4sf) vec_cmpeq(a, b);
674  c = vec_nor (c, c);
675  /* Then we merge the lower float result with the original upper
676   * float elements from __A.  */
677  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
678}
679
680extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
681_mm_cmpnlt_ss (__m128 __A, __m128 __B)
682{
683  static const __vector unsigned int mask =
684    { 0xffffffff, 0, 0, 0 };
685  __v4sf a, b, c;
686  /* PowerISA VMX does not allow partial (for just element 0)
687   * results. So to ensure we don't generate spurious exceptions
688   * (from the upper elements) we splat the lower float
689   * before we do the operation. */
690  a = vec_splat ((__v4sf) __A, 0);
691  b = vec_splat ((__v4sf) __B, 0);
692  c = (__v4sf) vec_cmpge(a, b);
693  /* Then we merge the lower float result with the original upper
694   * float elements from __A.  */
695  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
696}
697
698extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
699_mm_cmpnle_ss (__m128 __A, __m128 __B)
700{
701  static const __vector unsigned int mask =
702    { 0xffffffff, 0, 0, 0 };
703  __v4sf a, b, c;
704  /* PowerISA VMX does not allow partial (for just element 0)
705   * results. So to ensure we don't generate spurious exceptions
706   * (from the upper elements) we splat the lower float
707   * before we do the operation. */
708  a = vec_splat ((__v4sf) __A, 0);
709  b = vec_splat ((__v4sf) __B, 0);
710  c = (__v4sf) vec_cmpgt(a, b);
711  /* Then we merge the lower float result with the original upper
712   * float elements from __A.  */
713  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
714}
715
716extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
717_mm_cmpngt_ss (__m128 __A, __m128 __B)
718{
719  static const __vector unsigned int mask =
720    { 0xffffffff, 0, 0, 0 };
721  __v4sf a, b, c;
722  /* PowerISA VMX does not allow partial (for just element 0)
723   * results. So to ensure we don't generate spurious exceptions
724   * (from the upper elements) we splat the lower float
725   * before we do the operation. */
726  a = vec_splat ((__v4sf) __A, 0);
727  b = vec_splat ((__v4sf) __B, 0);
728  c = (__v4sf) vec_cmple(a, b);
729  /* Then we merge the lower float result with the original upper
730   * float elements from __A.  */
731  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
732}
733
734extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
735_mm_cmpnge_ss (__m128 __A, __m128 __B)
736{
737  static const __vector unsigned int mask =
738    { 0xffffffff, 0, 0, 0 };
739  __v4sf a, b, c;
740  /* PowerISA VMX does not allow partial (for just element 0)
741   * results. So to ensure we don't generate spurious exceptions
742   * (from the upper elements) we splat the lower float
743   * before we do the operation. */
744  a = vec_splat ((__v4sf) __A, 0);
745  b = vec_splat ((__v4sf) __B, 0);
746  c = (__v4sf) vec_cmplt(a, b);
747  /* Then we merge the lower float result with the original upper
748   * float elements from __A.  */
749  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
750}
751
752extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
753_mm_cmpord_ss (__m128 __A, __m128 __B)
754{
755  __vector unsigned int a, b;
756  __vector unsigned int c, d;
757  static const __vector unsigned int float_exp_mask =
758    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
759  static const __vector unsigned int mask =
760    { 0xffffffff, 0, 0, 0 };
761
762  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
763  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
764  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
765  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
766  c = vec_and (c, d);
767  /* Then we merge the lower float result with the original upper
768   * float elements from __A.  */
769  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
770}
771
772extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
773_mm_cmpunord_ss (__m128 __A, __m128 __B)
774{
775  __vector unsigned int a, b;
776  __vector unsigned int c, d;
777  static const __vector unsigned int float_exp_mask =
778    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
779  static const __vector unsigned int mask =
780    { 0xffffffff, 0, 0, 0 };
781
782  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
783  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
784  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
785  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
786  c = vec_or (c, d);
787  /* Then we merge the lower float result with the original upper
788   * float elements from __A.  */
789  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
790}
791
792/* Compare the lower SPFP values of A and B and return 1 if true
793   and 0 if false.  */
794extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
795_mm_comieq_ss (__m128 __A, __m128 __B)
796{
797  return (__A[0] == __B[0]);
798}
799
800extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
801_mm_comilt_ss (__m128 __A, __m128 __B)
802{
803  return (__A[0] < __B[0]);
804}
805
806extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
807_mm_comile_ss (__m128 __A, __m128 __B)
808{
809  return (__A[0] <= __B[0]);
810}
811
812extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813_mm_comigt_ss (__m128 __A, __m128 __B)
814{
815  return (__A[0] > __B[0]);
816}
817
818extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
819_mm_comige_ss (__m128 __A, __m128 __B)
820{
821  return (__A[0] >= __B[0]);
822}
823
824extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825_mm_comineq_ss (__m128 __A, __m128 __B)
826{
827  return (__A[0] != __B[0]);
828}
829
830/* FIXME
831 * The _mm_ucomi??_ss implementations below are exactly the same as
832 * the _mm_comi??_ss ones because GCC for PowerPC only generates unordered
833 * compares (scalar and vector).
834 * Technically _mm_comieq_ss et al. should be using the ordered
835 * compare and signal for QNaNs.
836 * The _mm_ucomieq_ss et al. should be OK as is.
837 */
838extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
839_mm_ucomieq_ss (__m128 __A, __m128 __B)
840{
841  return (__A[0] == __B[0]);
842}
843
844extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
845_mm_ucomilt_ss (__m128 __A, __m128 __B)
846{
847  return (__A[0] < __B[0]);
848}
849
850extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
851_mm_ucomile_ss (__m128 __A, __m128 __B)
852{
853  return (__A[0] <= __B[0]);
854}
855
856extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
857_mm_ucomigt_ss (__m128 __A, __m128 __B)
858{
859  return (__A[0] > __B[0]);
860}
861
862extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
863_mm_ucomige_ss (__m128 __A, __m128 __B)
864{
865  return (__A[0] >= __B[0]);
866}
867
868extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
869_mm_ucomineq_ss (__m128 __A, __m128 __B)
870{
871  return (__A[0] != __B[0]);
872}
873
874extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
875_mm_cvtss_f32 (__m128 __A)
876{
877  return ((__v4sf)__A)[0];
878}
879
880/* Convert the lower SPFP value to a 32-bit integer according to the current
881   rounding mode.  */
882extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
883_mm_cvtss_si32 (__m128 __A)
884{
885  __m64 res = 0;
886#ifdef _ARCH_PWR8
887  double dtmp;
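  /* In the sequence below, xxsldwi (little-endian only) rotates float
     element 0 into the word position read by xscvspdp, which widens it to
     double precision; fctiw then converts to a signed word using the current
     FPSCR rounding mode, and mfvsrd moves the result into a GPR.  */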
888  __asm__(
889#ifdef __LITTLE_ENDIAN__
890      "xxsldwi %x0,%x0,%x0,3;\n"
891#endif
892      "xscvspdp %x2,%x0;\n"
893      "fctiw  %2,%2;\n"
894      "mfvsrd  %1,%x2;\n"
895      : "+wa" (__A),
896        "=r" (res),
897        "=f" (dtmp)
898      : );
899#else
900  res = __builtin_rint(__A[0]);
901#endif
902  return (res);
903}
904
905extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906_mm_cvt_ss2si (__m128 __A)
907{
908  return _mm_cvtss_si32 (__A);
909}
910
911/* Convert the lower SPFP value to a 32-bit integer according to the
912   current rounding mode.  */
913
914/* Intel intrinsic.  */
915extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
916_mm_cvtss_si64 (__m128 __A)
917{
918  __m64 res = 0;
919#ifdef _ARCH_PWR8
920  double dtmp;
921  __asm__(
922#ifdef __LITTLE_ENDIAN__
923      "xxsldwi %x0,%x0,%x0,3;\n"
924#endif
925      "xscvspdp %x2,%x0;\n"
926      "fctid  %2,%2;\n"
927      "mfvsrd  %1,%x2;\n"
928      : "+wa" (__A),
929        "=r" (res),
930        "=f" (dtmp)
931      : );
932#else
933  res = __builtin_llrint(__A[0]);
934#endif
935  return (res);
936}
937
938/* Microsoft intrinsic.  */
939extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
940_mm_cvtss_si64x (__m128 __A)
941{
942  return _mm_cvtss_si64 ((__v4sf) __A);
943}
944
945/* Constants for use with _mm_prefetch.  */
946enum _mm_hint
947{
948  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
949  _MM_HINT_ET0 = 7,
950  _MM_HINT_ET1 = 6,
951  _MM_HINT_T0 = 3,
952  _MM_HINT_T1 = 2,
953  _MM_HINT_T2 = 1,
954  _MM_HINT_NTA = 0
955};
956
957/* Loads one cache line from address P to a location "closer" to the
958   processor.  The selector I specifies the type of prefetch operation.  */
959extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
960_mm_prefetch (const void *__P, enum _mm_hint __I)
961{
962  /* Current PowerPC implementations ignore the hint parameter.  */
963  __builtin_prefetch (__P);
964}
965
966/* Convert the two lower SPFP values to 32-bit integers according to the
967   current rounding mode.  Return the integers in packed form.  */
968extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969_mm_cvtps_pi32 (__m128 __A)
970{
972  __v4sf temp, rounded;
973  __vector unsigned long long result;
974
975  /* Splat two lower SPFP values to both halves.  */
976  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
977  rounded = vec_rint(temp);
978  result = (__vector unsigned long long) vec_cts (rounded, 0);
979
980  return (__m64) ((__vector long long) result)[0];
981}
982
983extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
984_mm_cvt_ps2pi (__m128 __A)
985{
986  return _mm_cvtps_pi32 (__A);
987}
988
989/* Truncate the lower SPFP value to a 32-bit integer.  */
990extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
991_mm_cvttss_si32 (__m128 __A)
992{
993  /* Extract the lower float element.  */
994  float temp = __A[0];
995  /* truncate to 32-bit integer and return.  */
996  return temp;
997}
998
999extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1000_mm_cvtt_ss2si (__m128 __A)
1001{
1002  return _mm_cvttss_si32 (__A);
1003}
1004
1005/* Intel intrinsic.  */
1006extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007_mm_cvttss_si64 (__m128 __A)
1008{
1009  /* Extract the lower float element.  */
1010  float temp = __A[0];
1011  /* Truncate to a 64-bit integer and return.  */
1012  return temp;
1013}
1014
1015/* Microsoft intrinsic.  */
1016extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017_mm_cvttss_si64x (__m128 __A)
1018{
1019  /* Extract the lower float element.  */
1020  float temp = __A[0];
1021  /* Truncate to a 64-bit integer and return.  */
1022  return temp;
1023}
1024
1025/* Truncate the two lower SPFP values to 32-bit integers.  Return the
1026   integers in packed form.  */
1027extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1028_mm_cvttps_pi32 (__m128 __A)
1029{
1030  __v4sf temp;
1031  __vector unsigned long long result;
1032
1033  /* Splat two lower SPFP values to both halves.  */
1034  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1035  result = (__vector unsigned long long) vec_cts (temp, 0);
1036
1037  return (__m64) ((__vector long long) result)[0];
1038}
1039
1040extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1041_mm_cvtt_ps2pi (__m128 __A)
1042{
1043  return _mm_cvttps_pi32 (__A);
1044}
1045
1046/* Convert B to a SPFP value and insert it as element zero in A.  */
1047extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1048_mm_cvtsi32_ss (__m128 __A, int __B)
1049{
1050  float temp = __B;
1051  __A[0] = temp;
1052
1053  return __A;
1054}
1055
1056extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1057_mm_cvt_si2ss (__m128 __A, int __B)
1058{
1059  return _mm_cvtsi32_ss (__A, __B);
1060}
1061
1062/* Convert B to a SPFP value and insert it as element zero in A.  */
1063/* Intel intrinsic.  */
1064extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1065_mm_cvtsi64_ss (__m128 __A, long long __B)
1066{
1067  float temp = __B;
1068  __A[0] = temp;
1069
1070  return __A;
1071}
1072
1073/* Microsoft intrinsic.  */
1074extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1075_mm_cvtsi64x_ss (__m128 __A, long long __B)
1076{
1077  return _mm_cvtsi64_ss (__A, __B);
1078}
1079
1080/* Convert the two 32-bit values in B to SPFP form and insert them
1081   as the two lower elements in A.  */
1082extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1083_mm_cvtpi32_ps (__m128        __A, __m64        __B)
1084{
1085  __vector signed int vm1;
1086  __vector float vf1;
1087
1088  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1089  vf1 = (__vector float) vec_ctf (vm1, 0);
1090
1091  return ((__m128) (__vector unsigned long long)
1092    { ((__vector unsigned long long)vf1) [0],
1093	((__vector unsigned long long)__A) [1]});
1094}
1095
1096extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1097_mm_cvt_pi2ps (__m128 __A, __m64 __B)
1098{
1099  return _mm_cvtpi32_ps (__A, __B);
1100}
1101
1102/* Convert the four signed 16-bit values in A to SPFP form.  */
1103extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104_mm_cvtpi16_ps (__m64 __A)
1105{
1106  __vector signed short vs8;
1107  __vector signed int vi4;
1108  __vector float vf1;
1109
1110  vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1111  vi4 = vec_vupklsh (vs8);
1112  vf1 = (__vector float) vec_ctf (vi4, 0);
1113
1114  return (__m128) vf1;
1115}
1116
1117/* Convert the four unsigned 16-bit values in A to SPFP form.  */
1118extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1119_mm_cvtpu16_ps (__m64 __A)
1120{
1121  const __vector unsigned short zero =
1122    { 0, 0, 0, 0, 0, 0, 0, 0 };
1123  __vector unsigned short vs8;
1124  __vector unsigned int vi4;
1125  __vector float vf1;
1126
1127  vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1128  vi4 = (__vector unsigned int) vec_mergel
1129#ifdef __LITTLE_ENDIAN__
1130                                           (vs8, zero);
1131#else
1132                                           (zero, vs8);
1133#endif
1134  vf1 = (__vector float) vec_ctf (vi4, 0);
1135
1136  return (__m128) vf1;
1137}
1138
1139/* Convert the low four signed 8-bit values in A to SPFP form.  */
1140extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1141_mm_cvtpi8_ps (__m64 __A)
1142{
1143  __vector signed char vc16;
1144  __vector signed short vs8;
1145  __vector signed int vi4;
1146  __vector float vf1;
1147
1148  vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1149  vs8 = vec_vupkhsb (vc16);
1150  vi4 = vec_vupkhsh (vs8);
1151  vf1 = (__vector float) vec_ctf (vi4, 0);
1152
1153  return (__m128) vf1;
1154}
1155
1156/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
1157extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1158
1159_mm_cvtpu8_ps (__m64  __A)
1160{
1161  const __vector unsigned char zero =
1162    { 0, 0, 0, 0, 0, 0, 0, 0 };
1163  __vector unsigned char vc16;
1164  __vector unsigned short vs8;
1165  __vector unsigned int vi4;
1166  __vector float vf1;
1167
1168  vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1169#ifdef __LITTLE_ENDIAN__
1170  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
1171  vi4 = (__vector unsigned int) vec_mergeh (vs8,
1172					    (__vector unsigned short) zero);
1173#else
1174  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
1175  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
1176                                            vs8);
1177#endif
1178  vf1 = (__vector float) vec_ctf (vi4, 0);
1179
1180  return (__m128) vf1;
1181}
1182
1183/* Convert the four signed 32-bit values in A and B to SPFP form.  */
1184extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1185_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1186{
1187  __vector signed int vi4;
1188  __vector float vf4;
1189
1190  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
1191  vf4 = (__vector float) vec_ctf (vi4, 0);
1192  return (__m128) vf4;
1193}
1194
1195/* Convert the four SPFP values in A to four signed 16-bit integers.  */
1196extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1197_mm_cvtps_pi16 (__m128 __A)
1198{
1199  __v4sf rounded;
1200  __vector signed int temp;
1201  __vector unsigned long long result;
1202
1203  rounded = vec_rint(__A);
1204  temp = vec_cts (rounded, 0);
1205  result = (__vector unsigned long long) vec_pack (temp, temp);
1206
1207  return (__m64) ((__vector long long) result)[0];
1208}
1209
1210/* Convert the four SPFP values in A to four signed 8-bit integers.  */
1211extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212_mm_cvtps_pi8 (__m128 __A)
1213{
1214  __v4sf rounded;
1215  __vector signed int tmp_i;
1216  static const __vector signed int zero = {0, 0, 0, 0};
1217  __vector signed short tmp_s;
1218  __vector signed char res_v;
1219
1220  rounded = vec_rint(__A);
1221  tmp_i = vec_cts (rounded, 0);
1222  tmp_s = vec_pack (tmp_i, zero);
1223  res_v = vec_pack (tmp_s, tmp_s);
1224  return (__m64) ((__vector long long) res_v)[0];
1225}
1226
1227/* Selects four specific SPFP values from A and B based on MASK.  */
1228extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1229
1230_mm_shuffle_ps (__m128  __A, __m128  __B, int const __mask)
1231{
1232  unsigned long element_selector_10 = __mask & 0x03;
1233  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1234  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1235  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1236  static const unsigned int permute_selectors[4] =
1237    {
1238#ifdef __LITTLE_ENDIAN__
1239      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1240#else
1241      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1242#endif
1243    };
1244  __vector unsigned int t;
1245
1246  t[0] = permute_selectors[element_selector_10];
1247  t[1] = permute_selectors[element_selector_32];
1248  t[2] = permute_selectors[element_selector_54] + 0x10101010;
1249  t[3] = permute_selectors[element_selector_76] + 0x10101010;
1250  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1251}
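/* For example, _mm_shuffle_ps (__A, __B, _MM_SHUFFLE (1, 0, 3, 2)) yields
   { __A[2], __A[3], __B[0], __B[1] }: the two low result elements are
   selected from __A and the two high elements from __B.  */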
1252
1253/* Selects and interleaves the upper two SPFP values from A and B.  */
1254extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255_mm_unpackhi_ps (__m128 __A, __m128 __B)
1256{
1257  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1258}
1259
1260/* Selects and interleaves the lower two SPFP values from A and B.  */
1261extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262_mm_unpacklo_ps (__m128 __A, __m128 __B)
1263{
1264  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1265}
1266
1267/* Sets the upper two SPFP values with 64 bits of data loaded from P;
1268   the lower two values are passed through from A.  */
1269extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1270_mm_loadh_pi (__m128 __A, __m64 const *__P)
1271{
1272  __vector unsigned long long __a = (__vector unsigned long long)__A;
1273  __vector unsigned long long __p = vec_splats(*__P);
1274  __a [1] = __p [1];
1275
1276  return (__m128)__a;
1277}
1278
1279/* Stores the upper two SPFP values of A into P.  */
1280extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281_mm_storeh_pi (__m64 *__P, __m128 __A)
1282{
1283  __vector unsigned long long __a = (__vector unsigned long long) __A;
1284
1285  *__P = __a[1];
1286}
1287
1288/* Moves the upper two values of B into the lower two values of A.  */
1289extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290_mm_movehl_ps (__m128 __A, __m128 __B)
1291{
1292  return (__m128) vec_mergel ((__vector unsigned long long)__B,
1293			      (__vector unsigned long long)__A);
1294}
1295
1296/* Moves the lower two values of B into the upper two values of A.  */
1297extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298_mm_movelh_ps (__m128 __A, __m128 __B)
1299{
1300  return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1301			      (__vector unsigned long long)__B);
1302}
1303
1304/* Sets the lower two SPFP values with 64 bits of data loaded from P;
1305   the upper two values are passed through from A.  */
1306extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307_mm_loadl_pi (__m128 __A, __m64 const *__P)
1308{
1309  __vector unsigned long long __a = (__vector unsigned long long)__A;
1310  __vector unsigned long long __p = vec_splats(*__P);
1311  __a [0] = __p [0];
1312
1313  return (__m128)__a;
1314}
1315
1316/* Stores the lower two SPFP values of A into P.  */
1317extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1318_mm_storel_pi (__m64 *__P, __m128 __A)
1319{
1320  __vector unsigned long long __a = (__vector unsigned long long) __A;
1321
1322  *__P = __a[0];
1323}
1324
1325#ifdef _ARCH_PWR8
1326/* Intrinsic functions that require PowerISA 2.07 minimum.  */
1327
1328/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
1329extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1330_mm_movemask_ps (__m128  __A)
1331{
1332  __vector unsigned long long result;
1333  static const __vector unsigned int perm_mask =
1334    {
1335#ifdef __LITTLE_ENDIAN__
1336	0x00204060, 0x80808080, 0x80808080, 0x80808080
1337#else
1338      0x80808080, 0x80808080, 0x80808080, 0x00204060
1339#endif
1340    };
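  /* The perm_mask bytes are vbpermq bit indices: 0x00, 0x20, 0x40 and 0x60
     select bits 0, 32, 64 and 96 of the 128-bit source, i.e. the sign bit of
     each SPFP element, while the 0x80 bytes are out-of-range indices that
     contribute zero bits.  */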
1341
1342  result = ((__vector unsigned long long)
1343	    vec_vbpermq ((__vector unsigned char) __A,
1344			 (__vector unsigned char) perm_mask));
1345
1346#ifdef __LITTLE_ENDIAN__
1347  return result[1];
1348#else
1349  return result[0];
1350#endif
1351}
1352#endif /* _ARCH_PWR8 */
1353
1354/* Create a vector with all four elements equal to *P.  */
1355extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1356_mm_load1_ps (float const *__P)
1357{
1358  return _mm_set1_ps (*__P);
1359}
1360
1361extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1362_mm_load_ps1 (float const *__P)
1363{
1364  return _mm_load1_ps (__P);
1365}
1366
1367/* Extracts one of the four words of A.  The selector N must be immediate.  */
1368extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369_mm_extract_pi16 (__m64 const __A, int const __N)
1370{
1371  unsigned int shiftr = __N & 3;
1372#ifdef __BIG_ENDIAN__
1373  shiftr = 3 - shiftr;
1374#endif
1375
1376  return ((__A >> (shiftr * 16)) & 0xffff);
1377}
1378
1379extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1380_m_pextrw (__m64 const __A, int const __N)
1381{
1382  return _mm_extract_pi16 (__A, __N);
1383}
1384
1385/* Inserts word D into one of four words of A.  The selector N must be
1386   immediate.  */
1387extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1389{
1390  const int shiftl = (__N & 3) * 16;
1391  const __m64 shiftD = (const __m64) __D << shiftl;
1392  const __m64 mask = 0xffffUL << shiftl;
1393  __m64 result = (__A & (~mask)) | (shiftD & mask);
1394
1395  return (result);
1396}
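/* For example (illustrative), _mm_insert_pi16 (0x0UL, 0x1234, 3) returns
   0x1234000000000000UL: only the selected 16-bit field (bits 48-63 here) is
   replaced, the rest of __A is preserved.  */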
1397
1398extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1399_m_pinsrw (__m64 const __A, int const __D, int const __N)
1400{
1401  return _mm_insert_pi16 (__A, __D, __N);
1402}
1403
1404/* Compute the element-wise maximum of signed 16-bit values.  */
1405extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1406
1407_mm_max_pi16 (__m64 __A, __m64 __B)
1408{
1409#if _ARCH_PWR8
1410  __vector signed short a, b, r;
1411  __vector __bool short c;
1412
1413  a = (__vector signed short)vec_splats (__A);
1414  b = (__vector signed short)vec_splats (__B);
1415  c = (__vector __bool short)vec_cmpgt (a, b);
1416  r = vec_sel (b, a, c);
1417  return (__m64) ((__vector long long) r)[0];
1418#else
1419  __m64_union m1, m2, res;
1420
1421  m1.as_m64 = __A;
1422  m2.as_m64 = __B;
1423
1424  res.as_short[0] =
1425      (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1426  res.as_short[1] =
1427      (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1428  res.as_short[2] =
1429      (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1430  res.as_short[3] =
1431      (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1432
1433  return (__m64) res.as_m64;
1434#endif
1435}
1436
1437extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1438_m_pmaxsw (__m64 __A, __m64 __B)
1439{
1440  return _mm_max_pi16 (__A, __B);
1441}
1442
1443/* Compute the element-wise maximum of unsigned 8-bit values.  */
1444extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1445_mm_max_pu8 (__m64 __A, __m64 __B)
1446{
1447#if _ARCH_PWR8
1448  __vector unsigned char a, b, r;
1449  __vector __bool char c;
1450
1451  a = (__vector unsigned char)vec_splats (__A);
1452  b = (__vector unsigned char)vec_splats (__B);
1453  c = (__vector __bool char)vec_cmpgt (a, b);
1454  r = vec_sel (b, a, c);
1455  return (__m64) ((__vector long long) r)[0];
1456#else
1457  __m64_union m1, m2, res;
1458  long i;
1459
1460  m1.as_m64 = __A;
1461  m2.as_m64 = __B;
1462
1463
1464  for (i = 0; i < 8; i++)
1465  res.as_char[i] =
1466      ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1467	  m1.as_char[i] : m2.as_char[i];
1468
1469  return (__m64) res.as_m64;
1470#endif
1471}
1472
1473extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1474_m_pmaxub (__m64 __A, __m64 __B)
1475{
1476  return _mm_max_pu8 (__A, __B);
1477}
1478
1479/* Compute the element-wise minimum of signed 16-bit values.  */
1480extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1481_mm_min_pi16 (__m64 __A, __m64 __B)
1482{
1483#if _ARCH_PWR8
1484  __vector signed short a, b, r;
1485  __vector __bool short c;
1486
1487  a = (__vector signed short)vec_splats (__A);
1488  b = (__vector signed short)vec_splats (__B);
1489  c = (__vector __bool short)vec_cmplt (a, b);
1490  r = vec_sel (b, a, c);
1491  return (__m64) ((__vector long long) r)[0];
1492#else
1493  __m64_union m1, m2, res;
1494
1495  m1.as_m64 = __A;
1496  m2.as_m64 = __B;
1497
1498  res.as_short[0] =
1499      (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1500  res.as_short[1] =
1501      (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1502  res.as_short[2] =
1503      (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1504  res.as_short[3] =
1505      (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1506
1507  return (__m64) res.as_m64;
1508#endif
1509}
1510
1511extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1512_m_pminsw (__m64 __A, __m64 __B)
1513{
1514  return _mm_min_pi16 (__A, __B);
1515}
1516
1517/* Compute the element-wise minimum of unsigned 8-bit values.  */
1518extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1519_mm_min_pu8 (__m64 __A, __m64 __B)
1520{
1521#if _ARCH_PWR8
1522  __vector unsigned char a, b, r;
1523  __vector __bool char c;
1524
1525  a = (__vector unsigned char)vec_splats (__A);
1526  b = (__vector unsigned char)vec_splats (__B);
1527  c = (__vector __bool char)vec_cmplt (a, b);
1528  r = vec_sel (b, a, c);
1529  return (__m64) ((__vector long long) r)[0];
1530#else
1531  __m64_union m1, m2, res;
1532  long i;
1533
1534  m1.as_m64 = __A;
1535  m2.as_m64 = __B;
1536
1537
1538  for (i = 0; i < 8; i++)
1539  res.as_char[i] =
1540      ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1541	  m1.as_char[i] : m2.as_char[i];
1542
1543  return (__m64) res.as_m64;
1544#endif
1545}
1546
1547extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548_m_pminub (__m64 __A, __m64 __B)
1549{
1550  return _mm_min_pu8 (__A, __B);
1551}
1552
1553/* Create an 8-bit mask of the signs of 8-bit values.  */
1554extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1555_mm_movemask_pi8 (__m64 __A)
1556{
1557  unsigned long long p =
1558#ifdef __LITTLE_ENDIAN__
1559                         0x0008101820283038UL; // permute control for sign bits
1560#else
1561                         0x3830282018100800UL; // permute control for sign bits
1562#endif
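  /* Each byte of p is a bpermd bit index (0, 8, 16, ..., 56), i.e. the
     big-endian bit position of the sign bit of each byte of __A, so bpermd
     gathers the eight sign bits into the low byte of the result.  */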
1563  return __builtin_bpermd (p, __A);
1564}
1565
1566extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1567_m_pmovmskb (__m64 __A)
1568{
1569  return _mm_movemask_pi8 (__A);
1570}
1571
1572/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1573   in B and produce the high 16 bits of the 32-bit results.  */
1574extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1575_mm_mulhi_pu16 (__m64 __A, __m64 __B)
1576{
1577  __vector unsigned short a, b;
1578  __vector unsigned short c;
1579  __vector unsigned int w0, w1;
1580  __vector unsigned char xform1 = {
1581#ifdef __LITTLE_ENDIAN__
1582      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
1583      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
1584#else
1585      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
1586      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
1587#endif
1588    };
1589
1590  a = (__vector unsigned short)vec_splats (__A);
1591  b = (__vector unsigned short)vec_splats (__B);
1592
1593  w0 = vec_vmuleuh (a, b);
1594  w1 = vec_vmulouh (a, b);
1595  c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1596
1597  return (__m64) ((__vector long long) c)[0];
1598}
1599
1600extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1601_m_pmulhuw (__m64 __A, __m64 __B)
1602{
1603  return _mm_mulhi_pu16 (__A, __B);
1604}
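
/* Illustrative usage sketch (not part of the SSE API): the unsigned
   high-half multiply is handy for 0.16 fixed-point scaling, since the high
   16 bits of A * FRAC equal floor (A * FRAC / 65536) per element.  For
   example, FRAC = 0xC000 scales each element by 3/4.  Assumes
   _mm_set1_pi16 from <mmintrin.h>; the function name below is
   hypothetical.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_scale_pu16 (__m64 __A, unsigned short __frac)
{
  return _mm_mulhi_pu16 (__A, _mm_set1_pi16 ((short) __frac));
}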
1605
1606/* Return a combination of the four 16-bit values in A.  The selector
1607   must be an immediate.  */
1608extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1609_mm_shuffle_pi16 (__m64 __A, int const __N)
1610{
1611  unsigned long element_selector_10 = __N & 0x03;
1612  unsigned long element_selector_32 = (__N >> 2) & 0x03;
1613  unsigned long element_selector_54 = (__N >> 4) & 0x03;
1614  unsigned long element_selector_76 = (__N >> 6) & 0x03;
1615  static const unsigned short permute_selectors[4] =
1616    {
1617#ifdef __LITTLE_ENDIAN__
1618	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1619#else
1620	      0x0607, 0x0405, 0x0203, 0x0001
1621#endif
1622    };
1623  __m64_union t;
1624  __vector unsigned long long a, p, r;
1625
1626#ifdef __LITTLE_ENDIAN__
1627  t.as_short[0] = permute_selectors[element_selector_10];
1628  t.as_short[1] = permute_selectors[element_selector_32];
1629  t.as_short[2] = permute_selectors[element_selector_54];
1630  t.as_short[3] = permute_selectors[element_selector_76];
1631#else
1632  t.as_short[3] = permute_selectors[element_selector_10];
1633  t.as_short[2] = permute_selectors[element_selector_32];
1634  t.as_short[1] = permute_selectors[element_selector_54];
1635  t.as_short[0] = permute_selectors[element_selector_76];
1636#endif
1637  p = vec_splats (t.as_m64);
1638  a = vec_splats (__A);
1639  r = vec_perm (a, a, (__vector unsigned char)p);
1640  return (__m64) ((__vector long long) r)[0];
1641}
1642
1643extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1644_m_pshufw (__m64 __A, int const __N)
1645{
1646  return _mm_shuffle_pi16 (__A, __N);
1647}
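
/* Illustrative usage sketch (not part of the SSE API): _MM_SHUFFLE packs
   four 2-bit element selectors, with the first argument selecting the
   source for the highest result element.  Selecting elements 0,1,2,3 in
   that order reverses the four 16-bit values.  The function name below is
   hypothetical.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_reverse_pi16 (__m64 __A)
{
  return _mm_shuffle_pi16 (__A, _MM_SHUFFLE (0, 1, 2, 3));
}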
1648
1649/* Conditionally store byte elements of A into P.  The high bit of each
1650   byte in the selector N determines whether the corresponding byte from
1651   A is stored.  */
1652extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1653_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1654{
1655  __m64 hibit = 0x8080808080808080UL;
1656  __m64 mask, tmp;
1657  __m64 *p = (__m64*)__P;
1658
1659  tmp = *p;
1660  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1661  tmp = (tmp & (~mask)) | (__A & mask);
1662  *p = tmp;
1663}
1664
1665extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1666_m_maskmovq (__m64 __A, __m64 __N, char *__P)
1667{
1668  _mm_maskmove_si64 (__A, __N, __P);
1669}
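
/* Illustrative usage sketch (not part of the SSE API): only bytes whose
   selector byte has its high bit set are written, so a compare result can
   be used directly as the mask.  Here, only the strictly positive (signed)
   bytes of A are written to P.  Assumes _mm_cmpgt_pi8 and _mm_setzero_si64
   from <mmintrin.h>; the function name below is hypothetical.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_store_positive_bytes (__m64 __A, char *__P)
{
  _mm_maskmove_si64 (__A, _mm_cmpgt_pi8 (__A, _mm_setzero_si64 ()), __P);
}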
1670
1671/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
1672extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1673_mm_avg_pu8 (__m64 __A, __m64 __B)
1674{
1675  __vector unsigned char a, b, c;
1676
1677  a = (__vector unsigned char)vec_splats (__A);
1678  b = (__vector unsigned char)vec_splats (__B);
1679  c = vec_avg (a, b);
1680  return (__m64) ((__vector long long) c)[0];
1681}
1682
1683extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684_m_pavgb (__m64 __A, __m64 __B)
1685{
1686  return _mm_avg_pu8 (__A, __B);
1687}
1688
1689/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
1690extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1691_mm_avg_pu16 (__m64 __A, __m64 __B)
1692{
1693  __vector unsigned short a, b, c;
1694
1695  a = (__vector unsigned short)vec_splats (__A);
1696  b = (__vector unsigned short)vec_splats (__B);
1697  c = vec_avg (a, b);
1698  return (__m64) ((__vector long long) c)[0];
1699}
1700
1701extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1702_m_pavgw (__m64 __A, __m64 __B)
1703{
1704  return _mm_avg_pu16 (__A, __B);
1705}
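
/* Illustrative usage sketch (not part of the SSE API): the averages above
   round up, computing (a + b + 1) >> 1 per element without intermediate
   overflow, which is a common 50% blend for packed pixel data.  The
   function name below is hypothetical.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_blend50_pu8 (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}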
1706
1707/* Compute the sum of the absolute differences of the unsigned 8-bit
1708   values in A and B.  Return the value in the low 16 bits; the
1709   upper 48 bits are cleared.  */
1710extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1711_mm_sad_pu8 (__m64 __A, __m64 __B)
1712{
1713  __vector unsigned char a, b;
1714  __vector unsigned char vmin, vmax, vabsdiff;
1715  __vector signed int vsum;
1716  const __vector unsigned int zero =
1717    { 0, 0, 0, 0 };
1718  __m64_union result = {0};
1719
1720  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1721  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1722  vmin = vec_min (a, b);
1723  vmax = vec_max (a, b);
1724  vabsdiff = vec_sub (vmax, vmin);
1725  /* Sum four groups of bytes into integers.  */
1726  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1727  /* Sum across four integers with integer result.  */
1728  vsum = vec_sums (vsum, (__vector signed int) zero);
1729  /* The sum is in the rightmost 32 bits of the vector result.
1730     Transfer to a GPR and truncate to 16 bits.  */
1731  result.as_short[0] = vsum[3];
1732  return result.as_m64;
1733}
1734
1735extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1736_m_psadbw (__m64 __A, __m64 __B)
1737{
1738  return _mm_sad_pu8 (__A, __B);
1739}
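
/* Illustrative usage sketch (not part of the SSE API): since the SAD
   result occupies only the low 16 bits, it is usually moved straight to a
   scalar, e.g. when accumulating block differences for motion estimation.
   Assumes _mm_cvtsi64_si32 from <mmintrin.h>; the function name below is
   hypothetical.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_sad_to_int (__m64 __A, __m64 __B)
{
  return _mm_cvtsi64_si32 (_mm_sad_pu8 (__A, __B));
}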
1740
1741/* Stores the data in A to the address P without polluting the caches.  */
1742extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1743_mm_stream_pi (__m64 *__P, __m64 __A)
1744{
1745  /* Use the data cache block touch for store transient.  */
1746  __asm__ (
1747    "	dcbtstt	0,%0"
1748    :
1749    : "b" (__P)
1750    : "memory"
1751  );
1752  *__P = __A;
1753}
1754
1755/* Likewise.  The address must be 16-byte aligned.  */
1756extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757_mm_stream_ps (float *__P, __m128 __A)
1758{
1759  /* Use the data cache block touch for store transient.  */
1760  __asm__ (
1761    "	dcbtstt	0,%0"
1762    :
1763    : "b" (__P)
1764    : "memory"
1765  );
1766  _mm_store_ps (__P, __A);
1767}
1768
1769/* Guarantees that every preceding store is globally visible before
1770   any subsequent store.  */
1771extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1772_mm_sfence (void)
1773{
1774  /* Generate a lightweight sync.  */
1775  __atomic_thread_fence (__ATOMIC_RELEASE);
1776}
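
/* Illustrative usage sketch (not part of the SSE API): a typical pattern
   is a run of streaming stores followed by a store fence before the data
   is handed to another thread.  Assumes P is 16-byte aligned, N is a
   multiple of 4, and _mm_set1_ps as defined earlier in this header; the
   function name below is hypothetical.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_stream_fill (float *__P, float __value, unsigned long __n)
{
  unsigned long __i;
  __m128 __v = _mm_set1_ps (__value);

  for (__i = 0; __i < __n; __i += 4)
    _mm_stream_ps (__P + __i, __v);
  /* Make the streamed stores visible to other threads before returning.  */
  _mm_sfence ();
}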
1777
1778/* The execution of the next instruction is delayed by an implementation
1779   specific amount of time.  The instruction does not modify the
1780   architectural state.  On x86 this intrinsic does not require SSE
1781   support in the processor; the pause encoding is a nop on processors
1782   that do not support it.  */
1783extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1784_mm_pause (void)
1785{
1786  /* There is no exact match with this construct, but the following is
1787     close to the desired effect.  */
1788#if _ARCH_PWR8
1789  /* On power8 and later processors we can depend on Program Priority
1790     (PRI) and the associated "very low" PRI setting.  Since we don't
1791     know what PRI this thread is running at, we: 1) save the current
1792     PRI from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1793     via the special "or 31,31,31" encoding, and 3) issue an "isync" to
1794     ensure the PRI change takes effect before we execute any more
1795     instructions.
1796     Now we can execute a lwsync (release barrier) while we execute
1797     this thread at "very low" PRI.  Finally we restore the original
1798     PRI and continue execution.  */
1799  unsigned long __PPR;
1800
1801  __asm__ volatile (
1802    "	mfppr	%0;"
1803    "   or 31,31,31;"
1804    "   isync;"
1805    "   lwsync;"
1806    "   isync;"
1807    "   mtppr	%0;"
1808    : "=r" (__PPR)
1809    :
1810    : "memory"
1811  );
1812#else
1813  /* For older processors, where we may not even have Program Priority
1814     controls, we can only depend on a heavyweight sync.  */
1815  __atomic_thread_fence (__ATOMIC_SEQ_CST);
1816#endif
1817}
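
/* Illustrative usage sketch (not part of the SSE API): _mm_pause is
   intended for spin-wait loops, reducing the polling thread's priority
   while it waits for a flag that another thread will set.  The function
   name below is hypothetical.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_spin_wait (volatile int *__flag)
{
  while (__atomic_load_n (__flag, __ATOMIC_ACQUIRE) == 0)
    _mm_pause ();
}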
1818
1819/* Transpose the 4x4 matrix composed of row[0-3].  */
1820#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
1821do {									\
1822  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
1823  __v4sf __t0 = vec_vmrghw (__r0, __r1);			\
1824  __v4sf __t1 = vec_vmrghw (__r2, __r3);			\
1825  __v4sf __t2 = vec_vmrglw (__r0, __r1);			\
1826  __v4sf __t3 = vec_vmrglw (__r2, __r3);			\
1827  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, 	\
1828			       (__vector long long)__t1);	\
1829  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,	\
1830			       (__vector long long)__t1);	\
1831  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,	\
1832			       (__vector long long)__t3);	\
1833  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,	\
1834			       (__vector long long)__t3);	\
1835} while (0)
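
/* Illustrative usage sketch (not part of the SSE API): transpose a
   row-major 4x4 float matrix in place.  Assumes M points to 16 floats
   with 16-byte alignment and uses _mm_load_ps/_mm_store_ps as defined
   earlier in this header; the function name below is hypothetical.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_transpose_4x4 (float *__M)
{
  __m128 __row0 = _mm_load_ps (__M);
  __m128 __row1 = _mm_load_ps (__M + 4);
  __m128 __row2 = _mm_load_ps (__M + 8);
  __m128 __row3 = _mm_load_ps (__M + 12);

  _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);

  _mm_store_ps (__M, __row0);
  _mm_store_ps (__M + 4, __row1);
  _mm_store_ps (__M + 8, __row2);
  _mm_store_ps (__M + 12, __row3);
}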
1836
1837/* For backward source compatibility.  */
1838//# include <emmintrin.h>
1839
1840#else
1841#include_next <xmmintrin.h>
1842#endif /* defined(__linux__) && defined(__ppc64__) */
1843
1844#endif /* _XMMINTRIN_H_INCLUDED */
1845