1/* Copyright (C) 2006-2015 Free Software Foundation, Inc.
2
3   This file is free software; you can redistribute it and/or modify it under
4   the terms of the GNU General Public License as published by the Free
5   Software Foundation; either version 3 of the License, or (at your option)
6   any later version.
7
8   This file is distributed in the hope that it will be useful, but WITHOUT
9   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
11   for more details.
12
13   Under Section 7 of GPL version 3, you are granted additional
14   permissions described in the GCC Runtime Library Exception, version
15   3.1, as published by the Free Software Foundation.
16
17   You should have received a copy of the GNU General Public License and
18   a copy of the GCC Runtime Library Exception along with this program;
19   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
20   <http://www.gnu.org/licenses/>.  */
21
22#ifndef _VMX2SPU_H_
23#define _VMX2SPU_H_	1
24
25#ifdef __cplusplus
26
27#ifdef __SPU__
28
29#include <spu_intrinsics.h>
30#include <vec_types.h>
31
32/* This file maps generic VMX intrinsics and predicates to the SPU using
33 * overloaded C++ functions.
34 */
35
36/************************************************************************
37 *                        INTRINSICS
38 ************************************************************************/
39
40/* vec_abs (vector absolute value)
41 * =======
42 */
43static inline vec_char16 vec_abs(vec_char16 a)
44{
45  vec_char16 minus_a;
46
47  minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101));
48  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
49}
50
51static inline vec_short8 vec_abs(vec_short8 a)
52{
53  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
54}
55
56static inline vec_int4 vec_abs(vec_int4 a)
57{
58  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
59}
60
61static inline vec_float4 vec_abs(vec_float4 a)
62{
63  return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1)));
64}
65
66/* vec_abss (vector absolute value saturate)
67 * ========
68 */
69static inline vec_char16 vec_abss(vec_char16 a)
70{
71  vec_char16 minus_a;
72
73  minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)),
74				(vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1)));
75  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
76}
77
78static inline vec_short8 vec_abss(vec_short8 a)
79{
80  vec_short8 minus_a;
81
82  minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}))));
83  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
84}
85
86static inline vec_int4 vec_abss(vec_int4 a)
87{
88  vec_int4 minus_a;
89
90  minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000}))));
91  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
92}
93
94
95/* vec_add (vector add)
96 * =======
97 */
98static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b)
99{
100  return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)),
101				spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)),
102				spu_splats((unsigned short)(0xFF00)))));
103}
104
105static inline vec_char16 vec_add(vec_char16 a, vec_char16 b)
106{
107  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
108}
109
110static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b)
111{
112  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
113}
114
115static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b)
116{
117  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
118}
119
120static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b)
121{
122  return (spu_add(a, b));
123}
124
125static inline vec_short8 vec_add(vec_short8 a, vec_short8 b)
126{
127  return (spu_add(a, b));
128}
129
130static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b)
131{
132  return (spu_add((vec_short8)(a), b));
133}
134
135static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b)
136{
137  return (spu_add(a, (vec_short8)(b)));
138}
139
140static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b)
141{
142  return (spu_add(a, b));
143}
144
145static inline vec_int4 vec_add(vec_int4 a, vec_int4 b)
146{
147  return (spu_add(a, b));
148}
149
150static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b)
151{
152  return (spu_add((vec_int4)(a), b));
153}
154
155static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b)
156{
157  return (spu_add(a, (vec_int4)(b)));
158}
159
160static inline vec_float4 vec_add(vec_float4 a, vec_float4 b)
161{
162  return (spu_add(a, b));
163}
164
165/* vec_addc (vector add carryout unsigned word)
166 * ========
167 */
168#define vec_addc(_a, _b)	spu_genc(_a, _b)
169
170/* vec_adds (vector add saturated)
171 * ========
172 */
173static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b)
174{
175  vec_uchar16 s1, s2, s, d;
176
177  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
178  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
179  s  = spu_shuffle(s1, s2, ((vec_uchar16){0, 16,  2, 18,  4, 20,  6, 22,
180				          8, 24, 10, 26, 12, 28, 14, 30}));
181  d  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
182				          9, 25, 11, 27, 13, 29, 15, 31}));
183  return (spu_or(d, spu_cmpeq(s, 1)));
184}
185
186static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b)
187{
188  vec_uchar16 s1, s2, s, d;
189
190  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
191  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
192  s  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
193				          9, 25, 11, 27, 13, 29, 15, 31}));
194  d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F));
195  d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F));
196  return ((vec_char16)(d));
197}
198
199static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b)
200{
201  return (vec_adds((vec_char16)(a), b));
202}
203
204static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b)
205{
206  return (vec_adds(a, (vec_char16)(b)));
207}
208
209static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b)
210{
211  vec_ushort8 s, d;
212
213  s = spu_add(a, b);
214  d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15));
215  return (d);
216}
217
218static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b)
219{
220  vec_short8 s, d;
221
222  s = spu_add(a, b);
223  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15)));
224  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15)));
225  return (d);
226}
227
228static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b)
229{
230  return (vec_adds((vec_short8)(a), b));
231}
232
233static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b)
234{
235  return (vec_adds(a, (vec_short8)(b)));
236}
237
238static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b)
239{
240  return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31)));
241}
242
243static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b)
244{
245  vec_int4 s, d;
246
247  s = spu_add(a, b);
248  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31));
249  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31));
250  return (d);
251}
252
253static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b)
254{
255  return (vec_adds((vec_int4)(a), b));
256}
257
258static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b)
259{
260  return (vec_adds(a, (vec_int4)(b)));
261}
262
263/* vec_and (vector logical and)
264 * =======
265 */
266static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b)
267{
268  return (spu_and(a, b));
269}
270
271static inline vec_char16 vec_and(vec_char16 a, vec_char16 b)
272{
273  return (spu_and(a, b));
274}
275
276static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b)
277{
278  return (spu_and((vec_char16)(a), b));
279}
280
281static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b)
282{
283  return (spu_and(a, (vec_char16)(b)));
284}
285
286static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b)
287{
288  return (spu_and(a, b));
289}
290
291static inline vec_short8 vec_and(vec_short8 a, vec_short8 b)
292{
293  return (spu_and(a, b));
294}
295
296static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b)
297{
298  return (spu_and((vec_short8)(a), b));
299}
300
301static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b)
302{
303  return (spu_and(a, (vec_short8)(b)));
304}
305
306static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b)
307{
308  return (spu_and(a, b));
309}
310
311static inline vec_int4 vec_and(vec_int4 a, vec_int4 b)
312{
313  return (spu_and(a, b));
314}
315
316static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b)
317{
318  return (spu_and((vec_int4)(a), b));
319}
320
321static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b)
322{
323  return (spu_and(a, (vec_int4)(b)));
324}
325
326static inline vec_float4 vec_and(vec_float4 a, vec_float4 b)
327{
328  return (spu_and(a, b));
329}
330
331static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b)
332{
333  return (spu_and((vec_float4)(a),b));
334}
335
336static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b)
337{
338  return (spu_and(a, (vec_float4)(b)));
339}
340
341
342/* vec_andc (vector logical and with complement)
343 * ========
344 */
345static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b)
346{
347  return (spu_andc(a, b));
348}
349
350static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b)
351{
352  return (spu_andc(a, b));
353}
354
355static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b)
356{
357  return (spu_andc((vec_char16)(a), b));
358}
359
360static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b)
361{
362  return (spu_andc(a, (vec_char16)(b)));
363}
364
365static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b)
366{
367  return (spu_andc(a, b));
368}
369
370static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b)
371{
372  return (spu_andc(a, b));
373}
374
375static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b)
376{
377  return (spu_andc((vec_short8)(a), b));
378}
379
380static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b)
381{
382  return (spu_andc(a, (vec_short8)(b)));
383}
384
385static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b)
386{
387  return (spu_andc(a, b));
388}
389
390static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b)
391{
392  return (spu_andc(a, b));
393}
394
395static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b)
396{
397  return (spu_andc((vec_int4)(a), b));
398}
399
400static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b)
401{
402  return (spu_andc(a, (vec_int4)(b)));
403}
404
405static inline vec_float4 vec_andc(vec_float4 a, vec_float4 b)
406{
407  return (spu_andc(a,b));
408}
409
410static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b)
411{
412  return (spu_andc((vec_float4)(a),b));
413}
414
415static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b)
416{
417  return (spu_andc(a, (vec_float4)(b)));
418}
419
420/* vec_avg (vector average)
421 * =======
422 */
423static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b)
424{
425  return (spu_avg(a, b));
426}
427
428static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b)
429{
430  return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)),
431			       (vec_uchar16)(spu_and(spu_xor(a,b), 0x80)))));
432}
433
434static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b)
435{
436  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
437		  spu_and(spu_or(a, b), 1)));
438}
439
440static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b)
441{
442  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
443		  spu_and(spu_or(a, b), 1)));
444}
445
446static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b)
447{
448  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
449		  spu_and(spu_or(a, b), 1)));
450}
451
452static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b)
453{
454  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
455		  spu_and(spu_or(a, b), 1)));
456}
457
458
459/* vec_ceil (vector ceiling)
460 * ========
461 */
462static inline vec_float4 vec_ceil(vec_float4 a)
463{
464  vec_int4  exp;
465  vec_uint4 mask;
466
467  a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF))));
468  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
469  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
470  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
471  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
472
473  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
474}
475
476
477/* vec_cmpb (vector compare bounds floating-point)
478 * ========
479 */
480static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b)
481{
482  vec_int4 b0 = (vec_int4)spu_splats(0x80000000);
483  vec_int4 b1 = (vec_int4)spu_splats(0x40000000);
484
485  return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0),
486		 spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1)));
487}
488
489/* vec_cmpeq (vector compare equal)
490 * =========
491 */
492#define vec_cmpeq(_a, _b)	spu_cmpeq(_a, _b)
493
494
495/* vec_cmpge (vector compare greater than or equal)
496 * =========
497 */
498static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b)
499{
500  return (spu_xor(spu_cmpgt(b, a), -1));
501}
502
503
504/* vec_cmpgt (vector compare greater than)
505 * =========
506 */
507#define vec_cmpgt(_a, _b)	spu_cmpgt(_a, _b)
508
509
510/* vec_cmple (vector compare less than or equal)
511 * =========
512 */
513static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b)
514{
515  return (spu_xor(spu_cmpgt(a, b), -1));
516}
517
518
519/* vec_cmplt (vector compare less than)
520 * =========
521 */
522#define vec_cmplt(_a, _b)	spu_cmpgt(_b, _a)
523
524
525/* vec_ctf (vector convert from fixed-point word)
526 * =======
527 */
528#define vec_ctf(_a, _b)		spu_convtf(_a, _b)
529
530
531/* vec_cts (vector convert to signed fixed-point word saturate)
532 * =======
533 */
534#define vec_cts(_a, _b)		spu_convts(_a, _b)
535
536
537/* vec_ctu (vector convert to unsigned fixed-point word saturate)
538 * =======
539 */
540#define vec_ctu(_a, _b)		spu_convtu(_a, _b)
541
542
543/* vec_dss (vector data stream stop)
544 * =======
545 */
546#define vec_dss(_a)
547
548
549/* vec_dssall (vector data stream stop all)
550 * ==========
551 */
552#define vec_dssall()
553
554
555/* vec_dst (vector data stream touch)
556 * =======
557 */
558#define vec_dst(_a, _b, _c)
559
560
561/* vec_dstst (vector data stream touch for store)
562 * =========
563 */
564#define vec_dstst(_a, _b, _c)
565
566
567/* vec_dststt (vector data stream touch for store transient)
568 * ==========
569 */
570#define vec_dststt(_a, _b, _c)
571
572
573/* vec_dstt (vector data stream touch transient)
574 * ========
575 */
576#define vec_dstt(_a, _b, _c)
577
578
579/* vec_expte (vector is 2 raised tp the exponent estimate floating-point)
580 * =========
581 */
582static inline vec_float4 vec_expte(vec_float4 a)
583{
584  vec_float4 bias, frac, exp;
585  vec_int4 ia;
586
587  bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31)));
588  ia   = spu_convts(spu_add(a, bias), 0);
589  frac = spu_sub(spu_convtf(ia, 0), a);
590  exp  = (vec_float4)(spu_sl(spu_add(ia, 127), 23));
591
592  return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)),
593			   frac, spu_splats(1.0f)), exp));
594}
595
596
597/* vec_floor (vector floor)
598 * =========
599 */
600static inline vec_float4 vec_floor(vec_float4 a)
601{
602  vec_int4  exp;
603  vec_uint4 mask;
604
605  a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF))));
606  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
607  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
608  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
609  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
610
611  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
612}
613
614
615/* vec_ld (vector load indexed)
616 * ======
617 */
618static inline vec_uchar16 vec_ld(int a, unsigned char *b)
619{
620  return (*((vec_uchar16 *)(b+a)));
621}
622
623static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b)
624{
625  return (*((vec_uchar16 *)((unsigned char *)(b)+a)));
626}
627
628static inline vec_char16 vec_ld(int a, signed char *b)
629{
630  return (*((vec_char16 *)(b+a)));
631}
632
633static inline vec_char16 vec_ld(int a, vec_char16 *b)
634{
635  return (*((vec_char16 *)((signed char *)(b)+a)));
636}
637
638static inline vec_ushort8 vec_ld(int a, unsigned short *b)
639{
640  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
641}
642
643static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b)
644{
645  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
646}
647
648static inline vec_short8 vec_ld(int a, signed short *b)
649{
650  return (*((vec_short8 *)((unsigned char *)(b)+a)));
651}
652
653static inline vec_short8 vec_ld(int a, vec_short8 *b)
654{
655  return (*((vec_short8 *)((signed char *)(b)+a)));
656}
657
658static inline vec_uint4 vec_ld(int a, unsigned int *b)
659{
660  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
661}
662
663static inline vec_uint4 vec_ld(int a, vec_uint4 *b)
664{
665  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
666}
667
668static inline vec_int4 vec_ld(int a, signed int *b)
669{
670  return (*((vec_int4 *)((unsigned char *)(b)+a)));
671}
672
673static inline vec_int4 vec_ld(int a, vec_int4 *b)
674{
675  return (*((vec_int4 *)((signed char *)(b)+a)));
676}
677
678static inline vec_float4 vec_ld(int a, float *b)
679{
680  return (*((vec_float4 *)((unsigned char *)(b)+a)));
681}
682
683static inline vec_float4 vec_ld(int a, vec_float4 *b)
684{
685  return (*((vec_float4 *)((unsigned char *)(b)+a)));
686}
687
688/* vec_lde (vector load element indexed)
689 * =======
690 */
691static inline vec_uchar16 vec_lde(int a, unsigned char *b)
692{
693  return (*((vec_uchar16 *)(b+a)));
694}
695
696static inline vec_char16 vec_lde(int a, signed char *b)
697{
698  return (*((vec_char16 *)(b+a)));
699}
700
701static inline vec_ushort8 vec_lde(int a, unsigned short *b)
702{
703  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
704}
705
706static inline vec_short8 vec_lde(int a, signed short *b)
707{
708  return (*((vec_short8 *)((unsigned char *)(b)+a)));
709}
710
711
712static inline vec_uint4 vec_lde(int a, unsigned int *b)
713{
714  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
715}
716
717static inline vec_int4 vec_lde(int a, signed int *b)
718{
719  return (*((vec_int4 *)((unsigned char *)(b)+a)));
720}
721
722
723static inline vec_float4 vec_lde(int a, float *b)
724{
725  return (*((vec_float4 *)((unsigned char *)(b)+a)));
726}
727
728/* vec_ldl (vector load indexed LRU)
729 * =======
730 */
731#define vec_ldl(_a, _b)		vec_ld(_a, _b)
732
733
734/* vec_loge (vector log2 estimate floating-point)
735 * ========
736 */
737static inline vec_float4 vec_loge(vec_float4 a)
738{
739  vec_int4 exp;
740  vec_float4 frac;
741
742  exp  = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127);
743  frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23)));
744
745  return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)),
746		   frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f))));
747}
748
749
750/* vec_lvsl (vector load for shift left)
751 * ========
752 */
753static inline vec_uchar16 vec_lvsl(int a, unsigned char *b)
754{
755  return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))),
756			       ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607,
757				              0x0809, 0x0A0B, 0x0C0D, 0x0E0F})));
758}
759
760static inline vec_uchar16 vec_lvsl(int a, signed char *b)
761{
762  return (vec_lvsl(a, (unsigned char *)b));
763}
764
765static inline vec_uchar16 vec_lvsl(int a, unsigned short *b)
766{
767  return (vec_lvsl(a, (unsigned char *)b));
768}
769
770static inline vec_uchar16 vec_lvsl(int a, short *b)
771{
772  return (vec_lvsl(a, (unsigned char *)b));
773}
774
775static inline vec_uchar16 vec_lvsl(int a, unsigned int *b)
776{
777  return (vec_lvsl(a, (unsigned char *)b));
778}
779
780static inline vec_uchar16 vec_lvsl(int a, int *b)
781{
782  return (vec_lvsl(a, (unsigned char *)b));
783}
784
785static inline vec_uchar16 vec_lvsl(int a, float *b)
786{
787  return (vec_lvsl(a, (unsigned char *)b));
788}
789
790
791/* vec_lvsr (vector load for shift right)
792 * ========
793 */
794static  inline vec_uchar16 vec_lvsr(int a, unsigned char *b)
795{
796  return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617,
797				               0x1819, 0x1A1B, 0x1C1D, 0x1E1F}),
798				(vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))))));
799}
800
801static inline vec_uchar16 vec_lvsr(int a, signed char *b)
802{
803  return (vec_lvsr(a, (unsigned char *)b));
804}
805
806static inline vec_uchar16 vec_lvsr(int a, unsigned short *b)
807{
808  return (vec_lvsr(a, (unsigned char *)b));
809}
810
811static inline vec_uchar16 vec_lvsr(int a, short *b)
812{
813  return (vec_lvsr(a, (unsigned char *)b));
814}
815
816static inline vec_uchar16 vec_lvsr(int a, unsigned int *b)
817{
818  return (vec_lvsr(a, (unsigned char *)b));
819}
820
821static inline vec_uchar16 vec_lvsr(int a, int *b)
822{
823  return (vec_lvsr(a, (unsigned char *)b));
824}
825
826static inline vec_uchar16 vec_lvsr(int a, float *b)
827{
828  return (vec_lvsr(a, (unsigned char *)b));
829}
830
831/* vec_madd (vector multiply add)
832 * ========
833 */
834#define vec_madd(_a, _b, _c)	spu_madd(_a, _b, _c)
835
836
837
838/* vec_madds (vector multiply add saturate)
839 * =========
840 */
841static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c)
842{
843  return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)),
844			      (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)),
845			      ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF}))));
846}
847
848/* vec_max (vector maximum)
849 * =======
850 */
851static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b)
852{
853  return (spu_sel(b, a, spu_cmpgt(a, b)));
854}
855
856static inline vec_char16 vec_max(vec_char16 a, vec_char16 b)
857{
858  return (spu_sel(b, a, spu_cmpgt(a, b)));
859}
860
861static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b)
862{
863  return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b)));
864}
865
866static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b)
867{
868  return (spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b))));
869}
870
871static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b)
872{
873  return (spu_sel(b, a, spu_cmpgt(a, b)));
874}
875
876static inline vec_short8 vec_max(vec_short8 a, vec_short8 b)
877{
878  return (spu_sel(b, a, spu_cmpgt(a, b)));
879}
880
881static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b)
882{
883  return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b)));
884}
885
886static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b)
887{
888  return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b))));
889}
890
891static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b)
892{
893  return (spu_sel(b, a, spu_cmpgt(a, b)));
894}
895
896static inline vec_int4 vec_max(vec_int4 a, vec_int4 b)
897{
898  return (spu_sel(b, a, spu_cmpgt(a, b)));
899}
900
901static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b)
902{
903  return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b)));
904}
905
906static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b)
907{
908  return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b))));
909}
910
911static inline vec_float4 vec_max(vec_float4 a, vec_float4 b)
912{
913  return (spu_sel(b, a, spu_cmpgt(a, b)));
914}
915
916
917/* vec_mergeh (vector merge high)
918 * ==========
919 */
920static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b)
921{
922  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
923				           4, 20, 5, 21, 6, 22, 7, 23})));
924}
925
926static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b)
927{
928  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
929				           4, 20, 5, 21, 6, 22, 7, 23})));
930}
931
932static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b)
933{
934  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
935				           4, 5, 20, 21, 6, 7, 22, 23})));
936}
937
938static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b)
939{
940  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
941				           4, 5, 20, 21, 6, 7, 22, 23})));
942}
943
944static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b)
945{
946  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
947				           4, 5, 6, 7, 20, 21, 22, 23})));
948}
949
950static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b)
951{
952  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
953				           4, 5, 6, 7, 20, 21, 22, 23})));
954}
955
956static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b)
957{
958  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
959				           4, 5, 6, 7, 20, 21, 22, 23})));
960}
961
962/* vec_mergel (vector merge low)
963 * ==========
964 */
965static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b)
966{
967  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
968				           12, 28, 13, 29, 14, 30, 15, 31})));
969}
970
971static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b)
972{
973  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
974				           12, 28, 13, 29, 14, 30, 15, 31})));
975}
976
977static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b)
978{
979  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
980				           12, 13, 28, 29, 14, 15, 30, 31})));
981}
982
983static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b)
984{
985  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
986				           12, 13, 28, 29, 14, 15, 30, 31})));
987}
988
989static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b)
990{
991  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
992				           12, 13, 14, 15, 28, 29, 30, 31})));
993}
994
995static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b)
996{
997  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
998				           12, 13, 14, 15, 28, 29, 30, 31})));
999}
1000
1001static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b)
1002{
1003  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
1004				           12, 13, 14, 15, 28, 29, 30, 31})));
1005}
1006
1007/* vec_mfvscr (vector move from vector status and control register)
1008 * ==========
1009 */
1010static inline vec_ushort8 vec_mfvscr()
1011{
1012  return ((vec_ushort8)spu_splats(0)); 		/* not supported */
1013}
1014
1015
1016/* vec_min (vector minimum)
1017 * =======
1018 */
1019static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b)
1020{
1021  return (spu_sel(a, b, spu_cmpgt(a, b)));
1022}
1023
1024static inline vec_char16 vec_min(vec_char16 a, vec_char16 b)
1025{
1026  return (spu_sel(a, b, spu_cmpgt(a, b)));
1027}
1028
1029static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b)
1030{
1031  return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b)));
1032}
1033
1034static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b)
1035{
1036  return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b))));
1037}
1038
1039static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b)
1040{
1041  return (spu_sel(a, b, spu_cmpgt(a, b)));
1042}
1043
1044static inline vec_short8 vec_min(vec_short8 a, vec_short8 b)
1045{
1046  return (spu_sel(a, b, spu_cmpgt(a, b)));
1047}
1048
1049static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b)
1050{
1051  return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b)));
1052}
1053
1054static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b)
1055{
1056  return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b))));
1057}
1058
1059static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b)
1060{
1061  return (spu_sel(a, b, spu_cmpgt(a, b)));
1062}
1063
1064static inline vec_int4 vec_min(vec_int4 a, vec_int4 b)
1065{
1066  return (spu_sel(a, b, spu_cmpgt(a, b)));
1067}
1068
1069static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b)
1070{
1071  return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b)));
1072}
1073
1074static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b)
1075{
1076  return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b))));
1077}
1078
1079static inline vec_float4 vec_min(vec_float4 a, vec_float4 b)
1080{
1081  return (spu_sel(a, b, spu_cmpgt(a, b)));
1082}
1083
1084/* vec_mladd (vector multiply low and add unsigned half word)
1085 * =========
1086 */
1087static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c)
1088{
1089  return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)),
1090					    (vec_short8)(spu_rl((vec_uint4)(b), -16)),
1091					    (vec_int4)(spu_rl((vec_uint4)(c), -16))),
1092				   spu_madd(a, b, spu_extend(c)),
1093				   ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1094					          10, 11, 26, 27, 14, 15, 30, 31}))));
1095}
1096
1097
1098static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c)
1099{
1100  return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c))));
1101}
1102
1103static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c)
1104{
1105  return (vec_mladd((vec_short8)(a), b, c));
1106}
1107
1108static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c)
1109{
1110  return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c)));
1111}
1112
1113
1114/* vec_mradds (vector multiply round and add saturate)
1115 * ==========
1116 */
1117static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c)
1118{
1119  vec_int4 round = (vec_int4)spu_splats(0x4000);
1120  vec_short8 hi, lo;
1121
1122  hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1));
1123  lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15));
1124
1125  return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c));
1126}
1127
1128
1129/* vec_msum (vector multiply sum)
1130 * ========
1131 */
1132static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c)
1133{
1134  vec_ushort8 a1, a2, b1, b2;
1135  vec_uint4 p1, p2;
1136
1137  a1 = spu_and((vec_ushort8)(a), 0xFF);
1138  a2 = spu_rlmask((vec_ushort8)(a), -8);
1139  b1 = spu_and((vec_ushort8)(b), 0xFF);
1140  b2 = spu_rlmask((vec_ushort8)(b), -8);
1141
1142  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
1143  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
1144  return (spu_add(p2, spu_add(p1, c)));
1145}
1146
1147static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c)
1148{
1149  vec_short8 a1, a2, b1, b2;
1150  vec_int4 p1, p2;
1151
1152  a1 = (vec_short8)(spu_extend(a));
1153  a2 = spu_rlmaska((vec_short8)(a), -8);
1154  b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF));
1155  b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8);
1156
1157  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
1158  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
1159  return (spu_add(p2, spu_add(p1, c)));
1160}
1161
1162static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
1163{
1164  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1165}
1166
1167static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c)
1168{
1169  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1170}
1171
1172
1173/* vec_msums (vector multiply sum saturate)
1174 * ========
1175 */
1176static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
1177{
1178  vec_uint4 p1, p2;
1179
1180  p1 = spu_mulo(a, b);
1181  p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2));
1182
1183  return (vec_adds(p2, vec_adds(p1, c)));
1184}
1185
1186static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c)
1187{
1188  return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
1189}
1190
1191/* vec_mtvscr (vector move to vector status and control register)
1192 * ==========
1193 */
1194#define vec_mtvscr(_a)		/* not supported */
1195
1196
1197/* vec_mule (vector multiply even)
1198 * ========
1199 */
1200static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b)
1201{
1202  vec_ushort8 hi, lo;
1203
1204  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)),
1205			     (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24)));
1206  lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)),
1207			     (vec_ushort8)(spu_rlmask((vec_short8)(b), -8)));
1208
1209  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1210				             10, 11, 26, 27, 14, 15, 30, 31})));
1211}
1212
1213static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b)
1214{
1215  vec_short8 hi, lo;
1216
1217  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)),
1218			    (vec_short8)(spu_rlmaska((vec_uint4)(b), -24)));
1219  lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)),
1220			    (vec_short8)(spu_rlmaska((vec_short8)(b), -8)));
1221
1222  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1223				             10, 11, 26, 27, 14, 15, 30, 31})));
1224}
1225
1226static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b)
1227{
1228 return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16),
1229		  (vec_ushort8)spu_rlmask((vec_uint4)(b), -16)));
1230}
1231
1232
1233static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b)
1234{
1235 return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16),
1236		  (vec_short8)spu_rlmaska((vec_int4)(b), -16)));
1237}
1238
1239
1240/* vec_mulo (vector multiply odd)
1241 * ========
1242 */
1243static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b)
1244{
1245  vec_ushort8 hi, lo;
1246
1247  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)),
1248			     (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF)));
1249  lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
1250
1251  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1252				             10, 11, 26, 27, 14, 15, 30, 31})));
1253}
1254
1255static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b)
1256{
1257  vec_short8 aa, bb, hi, lo;
1258
1259  aa = spu_extend(a);
1260  bb = spu_extend(b);
1261
1262  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)),
1263		(vec_short8)(spu_rlmaska((vec_uint4)(bb), -16)));
1264  lo = (vec_short8)spu_mulo(aa, bb);
1265  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
1266				             10, 11, 26, 27, 14, 15, 30, 31})));
1267}
1268
1269static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b)
1270{
1271  return (spu_mulo(a, b));
1272}
1273
1274
1275static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b)
1276{
1277  return (spu_mulo(a, b));
1278}
1279
1280
1281/* vec_nmsub (vector negative multiply subtract)
1282 * =========
1283 */
1284#define vec_nmsub(_a, _b, _c)	spu_nmsub(_a, _b, _c)
1285
1286
1287/* vec_nor (vector logical nor)
1288 * =======
1289 */
1290#define vec_nor(_a, _b)		spu_nor(_a, _b)
1291
1292
1293/* vec_or (vector logical or)
1294 * ======
1295 */
1296static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b)
1297{
1298  return (spu_or(a, b));
1299}
1300
1301static inline vec_char16 vec_or(vec_char16 a, vec_char16 b)
1302{
1303  return (spu_or(a, b));
1304}
1305
1306static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b)
1307{
1308  return (spu_or((vec_char16)(a), b));
1309}
1310
1311static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b)
1312{
1313  return (spu_or(a, (vec_char16)(b)));
1314}
1315
1316static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b)
1317{
1318  return (spu_or(a, b));
1319}
1320
1321static inline vec_short8 vec_or(vec_short8 a, vec_short8 b)
1322{
1323  return (spu_or(a, b));
1324}
1325
1326static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b)
1327{
1328  return (spu_or((vec_short8)(a), b));
1329}
1330
1331static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b)
1332{
1333  return (spu_or(a, (vec_short8)(b)));
1334}
1335
1336static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b)
1337{
1338  return (spu_or(a, b));
1339}
1340
1341static inline vec_int4 vec_or(vec_int4 a, vec_int4 b)
1342{
1343  return (spu_or(a, b));
1344}
1345
1346static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b)
1347{
1348  return (spu_or((vec_int4)(a), b));
1349}
1350
1351static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b)
1352{
1353  return (spu_or(a, (vec_int4)(b)));
1354}
1355
1356static inline vec_float4 vec_or(vec_float4 a, vec_float4 b)
1357{
1358  return (spu_or(a, b));
1359}
1360
1361static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b)
1362{
1363  return (spu_or((vec_float4)(a),b));
1364}
1365
1366static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b)
1367{
1368  return (spu_or(a, (vec_float4)(b)));
1369}
1370
1371
1372/* vec_pack (vector pack)
1373 * ========
1374 */
1375static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b)
1376{
1377  return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1378					                17, 19, 21, 23, 25, 27, 29, 31})));
1379}
1380
1381static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b)
1382{
1383  return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1384					               17, 19, 21, 23, 25, 27, 29, 31})));
1385}
1386
1387static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b)
1388{
1389  return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1390					                18, 19, 22, 23, 26, 27, 30, 31})));
1391}
1392
1393static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b)
1394{
1395  return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1396					               18, 19, 22, 23, 26, 27, 30, 31})));
1397}
1398
1399
1400/* vec_packpx (vector pack pixel)
1401 * ==========
1402 */
1403static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b)
1404{
1405  vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF));
1406  vec_uint4 x001F = (vec_uint4)(spu_splats((unsigned short)0x001F));
1407
1408  return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF),
1409					   spu_sl(a, 13), x001F),
1410				   spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF),
1411					   spu_sl(b, 13), x001F),
1412				   ((vec_uchar16){ 0,  1,  4,  5,   8,  9, 12, 13,
1413					          16, 17, 20, 21, 24, 25, 28, 29}))));
1414}
1415
1416
1417/* vec_packs (vector pack saturate)
1418 * =========
1419 */
1420static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b)
1421{
1422  vec_ushort8 max = spu_splats((unsigned short)0x00FF);
1423
1424  return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)),
1425				    spu_sel(b, max, spu_cmpgt(b, 255)),
1426				    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1427					           17, 19, 21, 23, 25, 27, 29, 31}))));
1428}
1429
1430static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b)
1431{
1432  vec_short8 max = spu_splats((signed short)0x007F);
1433  vec_short8 min = spu_splats((signed short)0xFF80);
1434
1435  return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)),
1436				    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)),
1437				   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1438					          17, 19, 21, 23, 25, 27, 29, 31}))));
1439}
1440
1441static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b)
1442{
1443  vec_uint4 max = spu_splats((unsigned int)0x0000FFFF);
1444
1445  return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)),
1446				    spu_sel(b, max, spu_cmpgt(b, max)),
1447				    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1448					           18, 19, 22, 23, 26, 27, 30, 31}))));
1449}
1450
1451static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b)
1452{
1453  vec_int4 max = spu_splats((signed int)0x00007FFF);
1454  vec_int4 min = spu_splats((signed int)0xFFFF8000);
1455
1456  return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
1457				   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
1458				   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1459					          18, 19, 22, 23, 26, 27, 30, 31}))));
1460}
1461
1462
1463/* vec_packsu (vector pack saturate unsigned)
1464 * ==========
1465 */
1466static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b)
1467{
1468  return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))),
1469				   spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))),
1470				   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1471					          17, 19, 21, 23, 25, 27, 29, 31})));
1472}
1473
1474static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b)
1475{
1476  vec_short8 max = spu_splats((signed short)0x00FF);
1477  vec_short8 min = spu_splats((signed short)0x0000);
1478
1479  return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)),
1480				    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)),
1481				    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
1482					           17, 19, 21, 23, 25, 27, 29, 31}))));
1483
1484  return (vec_packsu((vec_ushort8)(a), (vec_ushort8)(b)));
1485}
1486
1487static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b)
1488{
1489  vec_uint4 max = spu_splats((unsigned int)0xFFFF);
1490
1491  return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))),
1492				   spu_or(b, (vec_uint4)(spu_cmpgt(b, max))),
1493				   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1494					          18, 19, 22, 23, 26, 27, 30, 31})));
1495}
1496
1497static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b)
1498{
1499  vec_int4 max = spu_splats((signed int)0x0000FFFF);
1500  vec_int4 min = spu_splats((signed int)0x00000000);
1501
1502  return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
1503				    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
1504				    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
1505					           18, 19, 22, 23, 26, 27, 30, 31}))));
1506}
1507
1508
1509/* vec_perm (vector permute)
1510 * ========
1511 */
1512static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, vec_uchar16 c)
1513{
1514  return (spu_shuffle(a, b, spu_and(c, 0x1F)));
1515}
1516
1517static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c)
1518{
1519  return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1520}
1521
1522static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c)
1523{
1524  return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1525}
1526
1527static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c)
1528{
1529  return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1530}
1531
1532static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c)
1533{
1534  return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1535}
1536
1537static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c)
1538{
1539  return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1540}
1541
1542static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c)
1543{
1544  return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
1545}
1546
1547
1548/* vec_re (vector reciprocal estimate)
1549 * ======
1550 */
1551#define vec_re(_a)	spu_re(_a)
1552
1553
1554/* vec_rl (vector rotate left)
1555 * ======
1556 */
1557static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b)
1558{
1559  vec_ushort8 r1, r2;
1560
1561  r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7));
1562  r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1563  return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)), spu_splats((unsigned short)0xFF))));
1564}
1565
1566static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b)
1567{
1568  return ((vec_char16)(vec_rl((vec_uchar16)(a), b)));
1569}
1570
1571static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b)
1572{
1573  return (spu_rl(a, (vec_short8)(b)));
1574}
1575
1576static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b)
1577{
1578  return (spu_rl(a, (vec_short8)(b)));
1579}
1580
1581static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b)
1582{
1583  return (spu_rl(a, (vec_int4)(b)));
1584}
1585
1586static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b)
1587{
1588  return (spu_rl(a, (vec_int4)(b)));
1589}
1590
1591
1592/* vec_round (vector round)
1593 * =========
1594 */
1595static inline vec_float4 vec_round(vec_float4 a)
1596{
1597  vec_float4 s_half, s_one, d;
1598  vec_uint4 odd;
1599  vec_uint4 msb = spu_splats((unsigned int)0x80000000);
1600  vec_float4 half = spu_splats(0.5f);
1601  vec_int4 exp;
1602  vec_uint4 mask;
1603
1604  s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb));
1605  a = spu_add(a, s_half);
1606  s_one = spu_add(s_half, s_half);
1607  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
1608  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
1609  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
1610  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
1611
1612  odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1);
1613  s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0));
1614  s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0),
1615				 (vec_float4)spu_cmpeq(odd, 1)));
1616  d = spu_andc(a, (vec_float4)(mask));
1617  d = spu_sub(d, s_one);
1618  return (d);
1619}
1620
1621/* vec_rsqrte (vector reciprocal square root estimate)
1622 * ==========
1623 */
1624#define vec_rsqrte(_a)	spu_rsqrte(_a)
1625
1626
1627/* vec_sel (vector select)
1628 * =======
1629 */
1630#define vec_sel(_a, _b, _c)	spu_sel(_a, _b, _c)
1631
1632
1633/* vec_sl (vector shift left)
1634 * ======
1635 */
1636static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b)
1637{
1638  vec_ushort8 hi, lo;
1639
1640  lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF);
1641  hi = spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1642
1643  return ((vec_uchar16)(spu_or(hi, lo)));
1644}
1645
1646static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b)
1647{
1648  return ((vec_char16)(vec_sl((vec_uchar16)(a), b)));
1649}
1650
1651static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b)
1652{
1653  return (spu_sl(a, spu_and(b, 15)));
1654}
1655
1656static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b)
1657{
1658  return (spu_sl(a, spu_and((vec_ushort8)(b), 15)));
1659}
1660
1661static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b)
1662{
1663  return (spu_sl(a, spu_and(b, 31)));
1664}
1665
1666static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b)
1667{
1668  return (spu_sl(a, spu_and(b, 31)));
1669}
1670
1671
1672/* vec_sld (vector shift left double)
1673 * =======
1674 */
1675#define vec_sld(_a, _b, _c)	spu_shuffle(_a, _b, ((vec_uchar16){ 0+(_c),  1+(_c),  2+(_c),  3+(_c),  \
1676								    4+(_c),  5+(_c),  6+(_c),  7+(_c), 	\
1677								    8+(_c),  9+(_c), 10+(_c), 11+(_c), 	\
1678							           12+(_c), 13+(_c), 14+(_c), 15+(_c)}))
1679
1680
1681/* vec_sll (vector shift left long)
1682 * =======
1683 */
1684#define vec_sll(_a, _b)		spu_slqw(_a, spu_extract((vec_uint4)(_b), 0))
1685
1686
1687/* vec_slo (vector shift left by octet)
1688 * =======
1689 */
1690#define vec_slo(_a, _b)		spu_slqwbytebc(_a, spu_extract((vec_uint4)(_b), 3) & 0x7F)
1691
1692
1693/* vec_splat (vector splat)
1694 * =========
1695 */
1696#define vec_splat(_a, _b)	spu_splats(spu_extract(_a, _b))
1697
1698
1699/* vec_splat_s8 (vector splat signed byte)
1700 * ============
1701 */
1702#define vec_splat_s8(_a)	spu_splats((signed char)(_a))
1703
1704
1705/* vec_splat_s16 (vector splat signed half-word)
1706 * =============
1707 */
1708#define vec_splat_s16(_a)	spu_splats((signed short)(_a))
1709
1710
1711/* vec_splat_s32 (vector splat signed word)
1712 * =============
1713 */
1714#define vec_splat_s32(_a)	spu_splats((signed int)(_a))
1715
1716
1717/* vec_splat_u8 (vector splat unsigned byte)
1718 * ============
1719 */
1720#define vec_splat_u8(_a)	spu_splats((unsigned char)(_a))
1721
1722
1723/* vec_splat_u16 (vector splat unsigned half-word)
1724 * =============
1725 */
1726#define vec_splat_u16(_a)	spu_splats((unsigned short)(_a))
1727
1728
1729/* vec_splat_u32 (vector splat unsigned word)
1730 * =============
1731 */
1732#define vec_splat_u32(_a)	spu_splats((unsigned int)(_a))
1733
1734
1735/* vec_sr (vector shift right)
1736 * ======
1737 */
1738static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b)
1739{
1740  vec_ushort8 hi, lo;
1741
1742  lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7))));
1743  hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1744
1745  return ((vec_uchar16)(spu_or(hi, lo)));
1746}
1747
1748static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b)
1749{
1750  return ((vec_char16)(vec_sr((vec_uchar16)(a), b)));
1751}
1752
1753static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b)
1754{
1755  return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1756}
1757
1758static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b)
1759{
1760  return ((vec_short8)(vec_sr((vec_ushort8)(a), b)));
1761}
1762
1763static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b)
1764{
1765  return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1766}
1767
1768static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b)
1769{
1770  return ((vec_int4)(vec_sr((vec_uint4)(a), b)));
1771}
1772
1773
1774/* vec_sra (vector shift right algebraic)
1775 * =======
1776 */
1777static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b)
1778{
1779  vec_short8 hi, lo;
1780
1781  lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF);
1782  hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1783
1784  return ((vec_char16)(spu_or(hi, lo)));
1785}
1786
1787static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b)
1788{
1789  return ((vec_uchar16)(vec_sra((vec_char16)(a), b)));
1790}
1791
1792static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b)
1793{
1794  return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1795}
1796
1797static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b)
1798{
1799  return ((vec_ushort8)(vec_sra((vec_short8)(a), b)));
1800}
1801
1802static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b)
1803{
1804  return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1805}
1806
1807static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b)
1808{
1809  return ((vec_uint4)(vec_sra((vec_int4)(a), b)));
1810}
1811
1812
1813/* vec_srl (vector shift right long)
1814 * =======
1815 */
1816#define vec_srl(_a, _b)		spu_rlmaskqw(_a, 0-spu_extract((vec_int4)(_b), 3))
1817
1818
1819/* vec_sro (vector shift right by octet)
1820 * =======
1821 */
1822#define vec_sro(_a, _b)		spu_rlmaskqwbyte(_a, 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF))
1823
1824/* vec_st (vector store indexed)
1825 * ======
1826 */
1827static inline void vec_st(vec_uchar16 a, int b, unsigned char *c)
1828{
1829  *((vec_uchar16 *)(c+b)) = a;
1830}
1831
1832static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c)
1833{
1834  *((vec_uchar16 *)((unsigned char *)(c)+b)) = a;
1835}
1836
1837static inline void vec_st(vec_char16 a, int b, signed char *c)
1838{
1839  *((vec_char16 *)(c+b)) = a;
1840}
1841
1842static inline void vec_st(vec_char16 a, int b, vec_char16 *c)
1843{
1844  *((vec_char16 *)((signed char *)(c)+b)) = a;
1845}
1846
1847static inline void vec_st(vec_bchar16 a, int b, signed char *c)
1848{
1849  *((vec_bchar16 *)((signed char *)(c)+b)) = a;
1850}
1851
1852static inline void vec_st(vec_ushort8 a, int b, unsigned short *c)
1853{
1854  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1855}
1856
1857static inline void vec_st(vec_ushort8 a, int b, vec_ushort8 *c)
1858{
1859  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1860}
1861
1862static inline void vec_st(vec_short8 a, int b, signed short *c)
1863{
1864  *((vec_short8 *)((unsigned char *)(c)+b)) = a;
1865}
1866
1867static inline void vec_st(vec_short8 a, int b, vec_short8 *c)
1868{
1869  *((vec_short8 *)((signed char *)(c)+b)) = a;
1870}
1871
1872static inline void vec_st(vec_bshort8 a, int b, signed short *c)
1873{
1874  *((vec_bshort8 *)((signed char *)(c)+b)) = a;
1875}
1876
1877static inline void vec_st(vec_uint4 a, int b, unsigned int *c)
1878{
1879  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1880}
1881
1882static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c)
1883{
1884  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1885}
1886
1887static inline void vec_st(vec_int4 a, int b, signed int *c)
1888{
1889  *((vec_int4 *)((unsigned char *)(c)+b)) = a;
1890}
1891
1892static inline void vec_st(vec_int4 a, int b, vec_int4 *c)
1893{
1894  *((vec_int4 *)((signed char *)(c)+b)) = a;
1895}
1896
1897static inline void vec_st(vec_bint4 a, int b, signed int *c)
1898{
1899  *((vec_bint4 *)((signed char *)(c)+b)) = a;
1900}
1901
1902static inline void vec_st(vec_float4 a, int b, float *c)
1903{
1904  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1905}
1906
1907static inline void vec_st(vec_float4 a, int b, vec_float4 *c)
1908{
1909  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1910}
1911
1912
1913/* vec_ste (vector store element indexed)
1914 * =======
1915 */
1916static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c)
1917{
1918  unsigned char *ptr;
1919
1920  ptr = c + b;
1921  *ptr = spu_extract(a, (int)(ptr) & 15);
1922}
1923
1924static inline void vec_ste(vec_char16 a, int b, signed char *c)
1925{
1926  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1927}
1928
1929static inline void vec_ste(vec_bchar16 a, int b, signed char *c)
1930{
1931  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1932}
1933
1934static inline void vec_ste(vec_ushort8 a, int b, unsigned short *c)
1935{
1936  unsigned short *ptr;
1937
1938  ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1);
1939  *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7);
1940}
1941
1942static inline void vec_ste(vec_short8 a, int b, signed short *c)
1943{
1944  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1945}
1946
1947static inline void vec_ste(vec_bshort8 a, int b, signed short *c)
1948{
1949  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1950}
1951
1952static inline void vec_ste(vec_uint4 a, int b, unsigned int *c)
1953{
1954  unsigned int *ptr;
1955
1956  ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3);
1957  *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3);
1958}
1959
1960static inline void vec_ste(vec_int4 a, int b, signed int *c)
1961{
1962  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1963}
1964
1965static inline void vec_ste(vec_bint4 a, int b, signed int *c)
1966{
1967  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1968}
1969
1970static inline void vec_ste(vec_float4 a, int b, float *c)
1971{
1972  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1973}
1974
1975
1976/* vec_stl (vector store indexed LRU)
1977 * =======
1978 */
1979#define vec_stl(_a, _b, _c)		vec_st(_a, _b, _c)
1980
1981
1982/* vec_sub (vector subtract)
1983 * =======
1984 */
1985static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b)
1986{
1987  return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)),
1988				spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)),
1989				spu_splats((unsigned short)0xFF00))));
1990}
1991
1992static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b)
1993{
1994  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
1995}
1996
1997static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b)
1998{
1999  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2000}
2001
2002static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b)
2003{
2004  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2005}
2006
2007static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b)
2008{
2009  return (spu_sub(a, b));
2010}
2011
2012static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b)
2013{
2014  return (spu_sub(a, b));
2015}
2016
2017static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b)
2018{
2019  return (spu_sub((vec_short8)(a), b));
2020}
2021
2022static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b)
2023{
2024  return (spu_sub(a, (vec_short8)(b)));
2025}
2026
2027static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b)
2028{
2029  return (spu_sub(a, b));
2030}
2031
2032static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b)
2033{
2034  return (spu_sub(a, b));
2035}
2036
2037static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b)
2038{
2039  return (spu_sub((vec_int4)(a), b));
2040}
2041
2042static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b)
2043{
2044  return (spu_sub(a, (vec_int4)(b)));
2045}
2046
2047static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b)
2048{
2049  return (spu_sub(a, b));
2050}
2051
2052
2053/* vec_subc (vector subtract carryout)
2054 * ========
2055 */
2056#define vec_subc(_a, _b)	spu_genb(_a, _b)
2057
2058
2059/* vec_subs (vector subtract saturate)
2060 * ========
2061 */
2062static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b)
2063{
2064  vec_ushort8 s1, s2;
2065  vec_uchar16 s, d;
2066
2067  s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2068  s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2069  s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16,  2, 18,  4, 20,  6, 22,
2070					                8, 24, 10, 26, 12, 28, 14, 30})));
2071  d  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
2072					                9, 25, 11, 27, 13, 29, 15, 31})));
2073  return (spu_andc(d, s));
2074}
2075
2076static inline vec_char16 vec_subs(vec_char16 a, vec_char16 b)
2077{
2078  vec_ushort8 s1, s2;
2079  vec_uchar16 s, d;
2080
2081  s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2082  s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2083  s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
2084					                9, 25, 11, 27, 13, 29, 15, 31})));
2085  d  = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F));
2086  d  = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F));
2087
2088  return ((vec_char16)(d));
2089}
2090
2091static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b)
2092{
2093  return (vec_subs((vec_char16)(a), b));
2094}
2095
2096static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b)
2097{
2098  return (vec_subs(a, (vec_char16)(b)));
2099}
2100
2101static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b)
2102{
2103  return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2104}
2105
2106static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b)
2107{
2108  vec_short8 s;
2109  vec_short8 d;
2110
2111  s = spu_sub(a, b);
2112  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15)));
2113  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15)));
2114
2115  return (d);
2116}
2117
2118static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b)
2119{
2120  return ((vec_short8)(vec_subs((vec_short8)(a), b)));
2121}
2122
2123static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b)
2124{
2125  return ((vec_short8)(vec_subs(a, (vec_short8)(b))));
2126}
2127
2128static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b)
2129{
2130  return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2131}
2132
2133static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b)
2134{
2135  vec_int4 s;
2136  vec_int4 d;
2137
2138  s = spu_sub(a, b);
2139  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31)));
2140  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31)));
2141
2142  return (d);
2143}
2144
2145static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b)
2146{
2147  return ((vec_int4)(vec_subs((vec_int4)(a), b)));
2148}
2149
2150static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b)
2151{
2152  return ((vec_int4)(vec_subs(a, (vec_int4)(b))));
2153}
2154
2155
2156/* vec_sum4s (vector sum across partial (1/4) saturated)
2157 * =========
2158 */
2159static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b)
2160{
2161  vec_uint4 a01_23, a0123;
2162
2163  a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8),
2164			       spu_and((vec_ushort8)(a), 0xFF)));
2165  a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF));
2166  return (vec_adds(a0123, b));
2167}
2168
2169static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b)
2170{
2171  vec_int4 a01_23, a0123;
2172
2173  a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8),
2174			      spu_extend(a)));
2175  a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23)));
2176  return (vec_adds(a0123, b));
2177}
2178
2179static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b)
2180{
2181  vec_int4 a0123;
2182
2183  a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a));
2184  return (vec_adds(a0123, b));
2185}
2186
2187
2188/* vec_sum2s (vector sum across partial (1/2) saturated)
2189 * =========
2190 */
2191static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b)
2192{
2193  vec_int4 c, d;
2194  vec_int4 sign1, sign2, sign3;
2195  vec_int4 carry, sum_l, sum_h, sat, sat_val;
2196
2197  sign1 = spu_rlmaska(a, -31);
2198  sign2 = spu_rlmaska(b, -31);
2199
2200  c = spu_rlqwbyte(a, -4);
2201  sign3 = spu_rlqwbyte(sign1, -4);
2202
2203  carry = spu_genc(a, b);
2204  sum_l = spu_add(a, b);
2205  sum_h = spu_addx(sign1, sign2, carry);
2206
2207  carry = spu_genc(sum_l, c);
2208  sum_l = spu_add(sum_l, c);
2209  sum_h = spu_addx(sum_h, sign3, carry);
2210
2211  sign1 = spu_rlmaska(sum_l, -31);
2212  sign2 = spu_rlmaska(sum_h, -31);
2213
2214  sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF));
2215
2216  sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2));
2217
2218  d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1});
2219
2220  return (d);
2221}
2222
2223
2224/* vec_sums (vector sum saturated)
2225 * ========
2226 */
2227static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b)
2228{
2229  vec_int4 a0, a1, a2, c0, c1, c2, d;
2230  vec_int4 sign_a, sign_b, sign_l, sign_h;
2231  vec_int4 sum_l, sum_h, sat, sat_val;
2232
2233  sign_a = spu_rlmaska(a, -31);
2234  sign_b = spu_rlmaska(b, -31);
2235
2236  a0 = spu_rlqwbyte(a, -12);
2237  a1 = spu_rlqwbyte(a, -8);
2238  a2 = spu_rlqwbyte(a, -4);
2239
2240  sum_l = spu_add(a, b);
2241  sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b));
2242
2243  c2 = spu_genc(sum_l, a2);
2244  sum_l = spu_add(sum_l, a2);
2245  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2);
2246
2247  c1 = spu_genc(sum_l, a1);
2248  sum_l = spu_add(sum_l, a1);
2249  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1);
2250
2251  c0 = spu_genc(sum_l, a0);
2252  sum_l = spu_add(sum_l, a0);
2253  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0);
2254
2255  sign_l = spu_rlmaska(sum_l, -31);
2256  sign_h = spu_rlmaska(sum_h, -31);
2257
2258  sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF));
2259
2260  sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h));
2261
2262  d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1}));
2263
2264  return (d);
2265}
2266
2267
2268/* vec_trunc (vector truncate)
2269 * =========
2270 */
2271static inline vec_float4 vec_trunc(vec_float4 a)
2272{
2273  vec_int4 exp;
2274  vec_uint4 mask;
2275
2276  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
2277  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
2278  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
2279  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
2280  return (spu_andc(a, (vec_float4)(mask)));
2281}
2282
2283/* vec_unpackh (vector unpack high element)
2284 * ===========
2285 */
2286static inline vec_short8 vec_unpackh(vec_char16 a)
2287{
2288  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3,
2289					              4, 4, 5, 5, 6, 6, 7, 7}))));
2290}
2291
2292static inline vec_bshort8 vec_unpackh(vec_bchar16 a)
2293{
2294  return ((vec_bshort8)(vec_unpackh((vec_char16)(a))));
2295}
2296
2297static inline vec_int4 vec_unpackh(vec_short8 a)
2298{
2299  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3,
2300					              0, 0, 4, 5, 0, 0, 6, 7}))));
2301}
2302
2303#ifdef SUPPORT_UNPACK_PIXEL
/* Due to type conflicts, unpacking of pixel types and boolean shorts
 * cannot both be supported at the same time.  The boolean short form is
 * provided by default; define SUPPORT_UNPACK_PIXEL to select the pixel
 * form instead.
 */
2308static inline vec_uint4 vec_unpackh(vec_pixel8 a)
2309{
2310  vec_ushort8 p1, p2;
2311
2312  p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2313		   spu_and((vec_ushort8)(a.p), 0x1F),
2314		   ((vec_uchar16){ 0, 128, 128, 17,  2, 128, 128, 19,
2315			           4, 128, 128, 21,  6, 128, 128, 23}));
2316  p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2317		   spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2318		   ((vec_uchar16){ 128,  17, 1, 128, 128,  19, 3, 128,
2319			           128,  21, 5, 128, 128,  23, 7, 128}));
2320  return ((vec_uint4)(spu_or(p1, p2)));
2321}
2322
2323#else
2324
2325static inline vec_bint4 vec_unpackh(vec_bshort8 a)
2326{
2327  return ((vec_bint4)(vec_unpackh((vec_short8)(a))));
2328}
#endif


/* vec_unpackl (vector unpack low element)
2336 * ===========
2337 */
2338static inline vec_short8 vec_unpackl(vec_char16 a)
2339{
2340  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){8, 8, 9, 9, 10, 10, 11, 11,
2341					              12, 12, 13, 13, 14, 14, 15, 15}))));
2342}
2343
2344static inline vec_bshort8 vec_unpackl(vec_bchar16 a)
2345{
2346  return ((vec_bshort8)(vec_unpackl((vec_char16)(a))));
2347}
2348
2349
2350static inline vec_int4 vec_unpackl(vec_short8 a)
2351{
2352  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 8, 9, 0, 0, 10, 11,
2353					              0, 0,12,13, 0, 0, 14, 15}))));
2354}
2355
2356
2357#ifdef SUPPORT_UNPACK_PIXEL
/* Due to type conflicts, unpacking of pixel types and boolean shorts
 * cannot both be supported at the same time.  The boolean short form is
 * provided by default; define SUPPORT_UNPACK_PIXEL to select the pixel
 * form instead.
 */
2362static inline vec_uint4 vec_unpackl(vec_pixel8 a)
2363{
2364  vec_ushort8 p1, p2;
2365
  p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
		   spu_and((vec_ushort8)(a.p), 0x1F),
		   ((vec_uchar16){ 8, 128, 128, 25,  10, 128, 128, 27,
			          12, 128, 128, 29,  14, 128, 128, 31}));
  p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
		   spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2372		   ((vec_uchar16){ 128, 25,  9, 128, 128, 27, 11, 128,
2373			           128, 29, 13, 128, 128, 31, 15, 128}));
2374  return ((vec_uint4)(spu_or(p1, p2)));
2375}
2376
2377#else
2378
2379static inline vec_bint4 vec_unpackl(vec_bshort8 a)
2380{
  return ((vec_bint4)(vec_unpackl((vec_short8)(a))));
}
2384#endif
2385
2386
2387
2388/* vec_xor (vector logical xor)
2389 * ======
2390 */
2391static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b)
2392{
2393  return (spu_xor(a, b));
2394}
2395
2396static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b)
2397{
2398  return (spu_xor(a, b));
2399}
2400
2401static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b)
2402{
2403  return (spu_xor((vec_char16)(a), b));
2404}
2405
2406static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b)
2407{
2408  return (spu_xor(a, (vec_char16)(b)));
2409}
2410
2411static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b)
2412{
2413  return (spu_xor(a, b));
2414}
2415
2416static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b)
2417{
2418  return (spu_xor(a, b));
2419}
2420
2421static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b)
2422{
2423  return (spu_xor((vec_short8)(a), b));
2424}
2425
2426static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b)
2427{
2428  return (spu_xor(a, (vec_short8)(b)));
2429}
2430
2431static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b)
2432{
2433  return (spu_xor(a, b));
2434}
2435
2436static inline vec_int4 vec_xor(vec_int4 a, vec_int4 b)
2437{
2438  return (spu_xor(a, b));
2439}
2440
2441static inline vec_int4 vec_xor(vec_bint4 a, vec_int4 b)
2442{
2443  return (spu_xor((vec_int4)(a), b));
2444}
2445
2446static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b)
2447{
2448  return (spu_xor(a, (vec_int4)(b)));
2449}
2450
2451static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b)
2452{
2453  return (spu_xor(a, b));
2454}
2455
2456static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b)
2457{
2458  return (spu_xor((vec_float4)(a),b));
2459}
2460
2461static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b)
2462{
2463  return (spu_xor(a, (vec_float4)(b)));
2464}
2465
2466/************************************************************************
2467 *                        PREDICATES
2468 ************************************************************************/
2469
2470/* vec_all_eq (all elements equal)
2471 * ==========
2472 */
2473static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b)
2474{
2475  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2476}
2477
2478static inline int vec_all_eq(vec_char16 a, vec_char16 b)
2479{
2480  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2481}
2482
2483static inline int vec_all_eq(vec_bchar16 a, vec_char16 b)
2484{
2485  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF));
2486}
2487
2488static inline int vec_all_eq(vec_char16 a, vec_bchar16 b)
2489{
2490  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF));
2491}
2492
2493static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b)
2494{
2495  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2496}
2497
2498static inline int vec_all_eq(vec_short8 a, vec_short8 b)
2499{
2500  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2501}
2502
2503static inline int vec_all_eq(vec_bshort8 a, vec_short8 b)
2504{
2505  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF));
2506}
2507
2508static inline int vec_all_eq(vec_short8 a, vec_bshort8 b)
2509{
2510  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF));
2511}
2512
2513static inline int vec_all_eq(vec_uint4 a, vec_uint4 b)
2514{
2515  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2516}
2517
2518static inline int vec_all_eq(vec_int4 a, vec_int4 b)
2519{
2520  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2521}
2522
2523static inline int vec_all_eq(vec_bint4 a, vec_int4 b)
2524{
2525  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF));
2526}
2527
2528static inline int vec_all_eq(vec_int4 a, vec_bint4 b)
2529{
2530  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF));
2531}
2532
2533static inline int vec_all_eq(vec_float4 a, vec_float4 b)
2534{
2535  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2536}
2537
2538
2539/* vec_all_ge (all elements greater than or equal)
2540 * ==========
2541 */
2542static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b)
2543{
2544  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2545}
2546
2547static inline int vec_all_ge(vec_char16 a, vec_char16 b)
2548{
2549  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2550}
2551
static inline int vec_all_ge(vec_bchar16 a, vec_char16 b)
2553{
2554  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0));
2555}
2556
2557static inline int vec_all_ge(vec_char16 a, vec_bchar16 b)
2558{
2559  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0));
2560}
2561
2562static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b)
2563{
2564  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2565}
2566
2567static inline int vec_all_ge(vec_short8 a, vec_short8 b)
2568{
2569  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2570}
2571
2572static inline int vec_all_ge(vec_bshort8 a, vec_short8 b)
2573{
2574  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0));
2575}
2576
2577static inline int vec_all_ge(vec_short8 a, vec_bshort8 b)
2578{
2579  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0));
2580}
2581
2582static inline int vec_all_ge(vec_uint4 a, vec_uint4 b)
2583{
2584  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2585}
2586
2587static inline int vec_all_ge(vec_int4 a, vec_int4 b)
2588{
2589  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2590}
2591
2592static inline int vec_all_ge(vec_bint4 a, vec_int4 b)
2593{
2594  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0));
2595}
2596
2597static inline int vec_all_ge(vec_int4 a, vec_bint4 b)
2598{
2599  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0));
2600}
2601
2602static inline int vec_all_ge(vec_float4 a, vec_float4 b)
2603{
2604  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2605}
2606
2607
2608/* vec_all_gt (all elements greater than)
2609 * ==========
2610 */
2611static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b)
2612{
2613  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2614}
2615
2616static inline int vec_all_gt(vec_char16 a, vec_char16 b)
2617{
2618  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2619}
2620
2621static inline int vec_all_gt(vec_bchar16 a, vec_char16 b)
2622{
2623  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF));
2624}
2625
2626static inline int vec_all_gt(vec_char16 a, vec_bchar16 b)
2627{
2628  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF));
2629}
2630
2631static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b)
2632{
2633  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2634}
2635
2636static inline int vec_all_gt(vec_short8 a, vec_short8 b)
2637{
2638  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2639}
2640
2641static inline int vec_all_gt(vec_bshort8 a, vec_short8 b)
2642{
2643  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF));
2644}
2645
2646static inline int vec_all_gt(vec_short8 a, vec_bshort8 b)
2647{
2648  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF));
2649}
2650
2651static inline int vec_all_gt(vec_uint4 a, vec_uint4 b)
2652{
2653  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2654}
2655
2656static inline int vec_all_gt(vec_int4 a, vec_int4 b)
2657{
2658  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2659}
2660
2661static inline int vec_all_gt(vec_bint4 a, vec_int4 b)
2662{
2663  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF));
2664}
2665
2666static inline int vec_all_gt(vec_int4 a, vec_bint4 b)
2667{
2668  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF));
2669}
2670
2671static inline int vec_all_gt(vec_float4 a, vec_float4 b)
2672{
2673  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2674}
2675
2676
2677/* vec_all_in (all elements in bounds)
2678 * ==========
2679 */
2680static inline int vec_all_in(vec_float4 a, vec_float4 b)
2681{
2682  return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF);
2683}
2684
2685
2686/* vec_all_le (all elements less than or equal)
2687 * ==========
2688 */
2689static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b)
2690{
2691  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2692}
2693
2694static inline int vec_all_le(vec_char16 a, vec_char16 b)
2695{
2696  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2697}
2698
2699static inline int vec_all_le(vec_bchar16 a, vec_char16 b)
2700{
2701  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0));
2702}
2703
2704static inline int vec_all_le(vec_char16 a, vec_bchar16 b)
2705{
2706  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0));
2707}
2708
2709static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b)
2710{
2711  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2712}
2713
2714static inline int vec_all_le(vec_short8 a, vec_short8 b)
2715{
2716  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2717}
2718
2719static inline int vec_all_le(vec_bshort8 a, vec_short8 b)
2720{
2721  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0));
2722}
2723
2724static inline int vec_all_le(vec_short8 a, vec_bshort8 b)
2725{
2726  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0));
2727}
2728
2729static inline int vec_all_le(vec_uint4 a, vec_uint4 b)
2730{
2731  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2732}
2733
2734static inline int vec_all_le(vec_int4 a, vec_int4 b)
2735{
2736  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2737}
2738
2739static inline int vec_all_le(vec_bint4 a, vec_int4 b)
2740{
2741  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0));
2742}
2743
2744static inline int vec_all_le(vec_int4 a, vec_bint4 b)
2745{
2746  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0));
2747}
2748
2749static inline int vec_all_le(vec_float4 a, vec_float4 b)
2750{
2751  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2752}
2753
2754
2755/* vec_all_lt (all elements less than)
2756 * ==========
2757 */
2758static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b)
2759{
2760  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2761}
2762
2763static inline int vec_all_lt(vec_char16 a, vec_char16 b)
2764{
2765  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2766}
2767
2768static inline int vec_all_lt(vec_bchar16 a, vec_char16 b)
2769{
2770  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF));
2771}
2772
2773static inline int vec_all_lt(vec_char16 a, vec_bchar16 b)
2774{
2775  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF));
2776}
2777
2778static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b)
2779{
2780  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2781}
2782
2783static inline int vec_all_lt(vec_short8 a, vec_short8 b)
2784{
2785  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2786}
2787
2788static inline int vec_all_lt(vec_bshort8 a, vec_short8 b)
2789{
2790  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF));
2791}
2792
2793static inline int vec_all_lt(vec_short8 a, vec_bshort8 b)
2794{
2795  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF));
2796}
2797
2798static inline int vec_all_lt(vec_uint4 a, vec_uint4 b)
2799{
2800  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2801}
2802
2803static inline int vec_all_lt(vec_int4 a, vec_int4 b)
2804{
2805  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2806}
2807
2808static inline int vec_all_lt(vec_bint4 a, vec_int4 b)
2809{
2810  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF));
2811}
2812
2813static inline int vec_all_lt(vec_int4 a, vec_bint4 b)
2814{
2815  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF));
2816}
2817
2818static inline int vec_all_lt(vec_float4 a, vec_float4 b)
2819{
2820  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2821}
2822
2823
2824/* vec_all_nan (all elements not a number)
2825 * ===========
2826 */
2827static inline int vec_all_nan(vec_float4 a)
2828{
2829  vec_uint4 exp, man;
2830  vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
2831
2832  exp = spu_and((vec_uint4)(a), exp_mask);
2833  man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
2834  return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
2835						spu_cmpeq(man, 0))), 0) == 0xF));
2836}
2837
2840
2841/* vec_all_ne (all elements not equal)
2842 * ==========
2843 */
2844static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b)
2845{
2846  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2847}
2848
2849static inline int vec_all_ne(vec_char16 a, vec_char16 b)
2850{
2851  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2852}
2853
2854static inline int vec_all_ne(vec_bchar16 a, vec_char16 b)
2855{
2856  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0));
2857}
2858
2859static inline int vec_all_ne(vec_char16 a, vec_bchar16 b)
2860{
2861  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0));
2862}
2863
2864static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b)
2865{
2866  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2867}
2868
2869static inline int vec_all_ne(vec_short8 a, vec_short8 b)
2870{
2871  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2872}
2873
2874static inline int vec_all_ne(vec_bshort8 a, vec_short8 b)
2875{
2876  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0));
2877}
2878
2879static inline int vec_all_ne(vec_short8 a, vec_bshort8 b)
2880{
2881  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0));
2882}
2883
2884static inline int vec_all_ne(vec_uint4 a, vec_uint4 b)
2885{
2886  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2887}
2888
2889static inline int vec_all_ne(vec_int4 a, vec_int4 b)
2890{
2891  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2892}
2893
2894static inline int vec_all_ne(vec_bint4 a, vec_int4 b)
2895{
2896  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0));
2897}
2898
2899static inline int vec_all_ne(vec_int4 a, vec_bint4 b)
2900{
2901  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0));
2902}
2903
2904static inline int vec_all_ne(vec_float4 a, vec_float4 b)
2905{
2906  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2907}
2908
2909
2910/* vec_all_nge (all elements not greater than or equal)
2911 * ===========
2912 */
2913static inline int vec_all_nge(vec_float4 a, vec_float4 b)
2914{
2915  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2916}
2917
2918
2919/* vec_all_ngt (all elements not greater than)
2920 * ===========
2921 */
2922static inline int vec_all_ngt(vec_float4 a, vec_float4 b)
2923{
2924  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2925}
2926
2927
2928/* vec_all_nle (all elements not less than or equal)
2929 * ===========
2930 */
2931static inline int vec_all_nle(vec_float4 a, vec_float4 b)
2932{
2933  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2934}
2935
2936
2937/* vec_all_nlt (all elements not less than)
2938 * ===========
2939 */
2940static inline int vec_all_nlt(vec_float4 a, vec_float4 b)
2941{
2942  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2943}
2944
2945
2946/* vec_all_numeric (all elements numeric)
2947 * ===========
2948 */
2949static inline int vec_all_numeric(vec_float4 a)
2950{
2951  vec_uint4 exp;
2952
2953  exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
2954  return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0));
2955}
2956
2957
2958
2959/* vec_any_eq (any elements equal)
2960 * ==========
2961 */
2962static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b)
2963{
2964  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2965}
2966
2967static inline int vec_any_eq(vec_char16 a, vec_char16 b)
2968{
2969  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2970}
2971
2972static inline int vec_any_eq(vec_bchar16 a, vec_char16 b)
2973{
2974  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0));
2975}
2976
2977static inline int vec_any_eq(vec_char16 a, vec_bchar16 b)
2978{
2979  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0));
2980}
2981
2982static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b)
2983{
2984  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2985}
2986
2987static inline int vec_any_eq(vec_short8 a, vec_short8 b)
2988{
2989  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2990}
2991
2992static inline int vec_any_eq(vec_bshort8 a, vec_short8 b)
2993{
2994  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0));
2995}
2996
2997static inline int vec_any_eq(vec_short8 a, vec_bshort8 b)
2998{
2999  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0));
3000}
3001
3002static inline int vec_any_eq(vec_uint4 a, vec_uint4 b)
3003{
3004  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3005}
3006
3007static inline int vec_any_eq(vec_int4 a, vec_int4 b)
3008{
3009  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3010}
3011
3012static inline int vec_any_eq(vec_bint4 a, vec_int4 b)
3013{
3014  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0)));
3015}
3016
3017static inline int vec_any_eq(vec_int4 a, vec_bint4 b)
3018{
3019  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0)));
3020}
3021
3022static inline int vec_any_eq(vec_float4 a, vec_float4 b)
3023{
3024  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3025}
3026
3027/* vec_any_ge (any elements greater than or equal)
3028 * ==========
3029 */
3030static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b)
3031{
3032  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3033}
3034
3035static inline int vec_any_ge(vec_char16 a, vec_char16 b)
3036{
3037  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3038}
3039
3040static inline int vec_any_ge(vec_bchar16 a, vec_char16 b)
3041{
3042  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF));
3043}
3044
3045static inline int vec_any_ge(vec_char16 a, vec_bchar16 b)
3046{
3047  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF));
3048}
3049
3050static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b)
3051{
3052  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3053}
3054
3055static inline int vec_any_ge(vec_short8 a, vec_short8 b)
3056{
3057  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3058}
3059
3060static inline int vec_any_ge(vec_bshort8 a, vec_short8 b)
3061{
3062  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF));
3063}
3064
3065static inline int vec_any_ge(vec_short8 a, vec_bshort8 b)
3066{
3067  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF));
3068}
3069
3070static inline int vec_any_ge(vec_uint4 a, vec_uint4 b)
3071{
3072  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3073}
3074
3075static inline int vec_any_ge(vec_int4 a, vec_int4 b)
3076{
3077  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3078}
3079
3080static inline int vec_any_ge(vec_bint4 a, vec_int4 b)
3081{
3082  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF));
3083}
3084
3085static inline int vec_any_ge(vec_int4 a, vec_bint4 b)
3086{
3087  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF));
3088}
3089
3090static inline int vec_any_ge(vec_float4 a, vec_float4 b)
3091{
3092  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3093}
3094
3095
3096/* vec_any_gt (any elements greater than)
3097 * ==========
3098 */
3099static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b)
3100{
3101  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3102}
3103
3104static inline int vec_any_gt(vec_char16 a, vec_char16 b)
3105{
3106  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3107}
3108
3109static inline int vec_any_gt(vec_bchar16 a, vec_char16 b)
3110{
3111  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0));
3112}
3113
3114static inline int vec_any_gt(vec_char16 a, vec_bchar16 b)
3115{
3116  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0));
3117}
3118
3119static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b)
3120{
3121  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3122}
3123
3124static inline int vec_any_gt(vec_short8 a, vec_short8 b)
3125{
3126  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3127}
3128
3129static inline int vec_any_gt(vec_bshort8 a, vec_short8 b)
3130{
3131  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0));
3132}
3133
3134static inline int vec_any_gt(vec_short8 a, vec_bshort8 b)
3135{
3136  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0));
3137}
3138
3139
3140static inline int vec_any_gt(vec_uint4 a, vec_uint4 b)
3141{
3142  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3143}
3144
3145static inline int vec_any_gt(vec_int4 a, vec_int4 b)
3146{
3147  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3148}
3149
3150static inline int vec_any_gt(vec_bint4 a, vec_int4 b)
3151{
3152  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0)));
3153}
3154
3155static inline int vec_any_gt(vec_int4 a, vec_bint4 b)
3156{
3157  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0)));
3158}
3159
3160static inline int vec_any_gt(vec_float4 a, vec_float4 b)
3161{
3162  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3163}
3164
3165/* vec_any_le (any elements less than or equal)
3166 * ==========
3167 */
3168static inline int vec_any_le(vec_uchar16 a, vec_uchar16 b)
3169{
3170  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3171}
3172
3173static inline int vec_any_le(vec_char16 a, vec_char16 b)
3174{
3175  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3176}
3177
3178static inline int vec_any_le(vec_bchar16 a, vec_char16 b)
3179{
3180  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0xFFFF));
3181}
3182
3183static inline int vec_any_le(vec_char16 a, vec_bchar16 b)
3184{
3185  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0xFFFF));
3186}
3187
3188static inline int vec_any_le(vec_ushort8 a, vec_ushort8 b)
3189{
3190  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3191}
3192
3193static inline int vec_any_le(vec_short8 a, vec_short8 b)
3194{
3195  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3196}
3197
3198static inline int vec_any_le(vec_bshort8 a, vec_short8 b)
3199{
3200  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0xFF));
3201}
3202
3203static inline int vec_any_le(vec_short8 a, vec_bshort8 b)
3204{
3205  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0xFF));
3206}
3207
3208static inline int vec_any_le(vec_uint4 a, vec_uint4 b)
3209{
3210  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3211}
3212
3213static inline int vec_any_le(vec_int4 a, vec_int4 b)
3214{
3215  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3216}
3217
3218static inline int vec_any_le(vec_bint4 a, vec_int4 b)
3219{
3220  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) != 0xF));
3221}
3222
3223static inline int vec_any_le(vec_int4 a, vec_bint4 b)
3224{
3225  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) != 0xF));
3226}
3227
3228static inline int vec_any_le(vec_float4 a, vec_float4 b)
3229{
3230  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3231}
3232
3233
3234/* vec_any_lt (any elements less than)
3235 * ==========
3236 */
3237static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b)
3238{
3239  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3240}
3241
3242static inline int vec_any_lt(vec_char16 a, vec_char16 b)
3243{
3244  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3245}
3246
3247static inline int vec_any_lt(vec_bchar16 a, vec_char16 b)
3248{
3249  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0));
3250}
3251
3252static inline int vec_any_lt(vec_char16 a, vec_bchar16 b)
3253{
3254  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0));
3255}
3256
3257static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b)
3258{
3259  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3260}
3261
3262static inline int vec_any_lt(vec_short8 a, vec_short8 b)
3263{
3264  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3265}
3266
3267static inline int vec_any_lt(vec_bshort8 a, vec_short8 b)
3268{
3269  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0));
3270}
3271
3272static inline int vec_any_lt(vec_short8 a, vec_bshort8 b)
3273{
3274  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0));
3275}
3276
3277static inline int vec_any_lt(vec_uint4 a, vec_uint4 b)
3278{
3279  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3280}
3281
3282static inline int vec_any_lt(vec_int4 a, vec_int4 b)
3283{
3284  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3285}
3286
3287static inline int vec_any_lt(vec_bint4 a, vec_int4 b)
3288{
3289  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0)));
3290}
3291
3292static inline int vec_any_lt(vec_int4 a, vec_bint4 b)
3293{
3294  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0)));
3295}
3296
3297static inline int vec_any_lt(vec_float4 a, vec_float4 b)
3298{
3299  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3300}
3301
3302/* vec_any_nan (any elements not a number)
3303 * ===========
3304 */
3305static inline int vec_any_nan(vec_float4 a)
3306{
3307  vec_uint4 exp, man;
3308  vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
3309
3310  exp = spu_and((vec_uint4)(a), exp_mask);
3311  man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
3312  return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
3313						spu_cmpeq(man, 0))), 0) != 0));
3314}
3315
3316
3317/* vec_any_ne (any elements not equal)
3318 * ==========
3319 */
3320static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b)
3321{
3322  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3323}
3324
3325static inline int vec_any_ne(vec_char16 a, vec_char16 b)
3326{
3327  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3328}
3329
3330static inline int vec_any_ne(vec_bchar16 a, vec_char16 b)
3331{
3332  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF));
3333}
3334
3335static inline int vec_any_ne(vec_char16 a, vec_bchar16 b)
3336{
3337  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF));
3338}
3339
3340static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b)
3341{
3342  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3343}
3344
3345static inline int vec_any_ne(vec_short8 a, vec_short8 b)
3346{
3347  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3348}
3349
3350static inline int vec_any_ne(vec_bshort8 a, vec_short8 b)
3351{
3352  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0xFF));
3353}
3354
3355static inline int vec_any_ne(vec_short8 a, vec_bshort8 b)
3356{
3357  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF));
3358}
3359
3360static inline int vec_any_ne(vec_uint4 a, vec_uint4 b)
3361{
3362  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3363}
3364
3365static inline int vec_any_ne(vec_int4 a, vec_int4 b)
3366{
3367  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3368}
3369
3370static inline int vec_any_ne(vec_bint4 a, vec_int4 b)
3371{
3372  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF));
3373}
3374
3375static inline int vec_any_ne(vec_int4 a, vec_bint4 b)
3376{
3377  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF));
3378}
3379
3380static inline int vec_any_ne(vec_float4 a, vec_float4 b)
3381{
3382  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3383}
3384
3385
3386/* vec_any_nge (any elements not greater than or equal)
3387 * ===========
3388 */
3389static inline int vec_any_nge(vec_float4 a, vec_float4 b)
3390{
3391  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3392}
3393
3394/* vec_any_ngt (any elements not greater than)
3395 * ===========
3396 */
3397static inline int vec_any_ngt(vec_float4 a, vec_float4 b)
3398{
3399  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3400}
3401
3402
3403/* vec_any_nle (any elements not less than or equal)
3404 * ===========
3405 */
3406static inline int vec_any_nle(vec_float4 a, vec_float4 b)
3407{
3408  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3409}
3410
3411
3412/* vec_any_nlt (any elements not less than)
3413 * ===========
3414 */
3415static inline int vec_any_nlt(vec_float4 a, vec_float4 b)
3416{
3417  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3418}
3419
3420
3421/* vec_any_numeric (any elements numeric)
3422 * ===============
3423 */
3424static inline int vec_any_numeric(vec_float4 a)
3425{
3426  vec_uint4 exp;
3427
3428  exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
3429  return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF));
3430}
3431
3432
3433/* vec_any_out (any elements out of bounds)
3434 * ===========
3435 */
3436static inline int vec_any_out(vec_float4 a, vec_float4 b)
3437{
3438  return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF);
3439}
3440
3441
3442/* CBE Language Extension Intrinsics
3443 */
3444
3445/* vec_extract (extract element from vector)
3446 * ===========
3447 */
3448#define vec_extract(_a, _element)	spu_extract(_a, _element)
3449
3450
3451/* vec_insert (insert scalar into specified vector element)
3452 * ==========
3453 */
3454#define vec_insert(_a, _b, _element)	spu_insert(_a, _b, _element)
3455
3456/* vec_lvlx (load vector left indexed)
3457 * ========
3458 */
3459static inline vec_uchar16 vec_lvlx(int a, unsigned char *b)
3460{
3461  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3462  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3463}
3464
3465static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b)
3466{
3467  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3468  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3469}
3470
3471static inline vec_char16 vec_lvlx(int a, signed char *b)
3472{
3473  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3474  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3475}
3476
3477static inline vec_char16 vec_lvlx(int a, vec_char16 *b)
3478{
3479  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3480  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3481}
3482
3483static inline vec_ushort8 vec_lvlx(int a, unsigned short *b)
3484{
3485  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3486  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3487}
3488
3489static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b)
3490{
3491  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3492  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3493}
3494
3495static inline vec_short8 vec_lvlx(int a, signed short *b)
3496{
3497  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3498  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3499}
3500
3501static inline vec_short8 vec_lvlx(int a, vec_short8 *b)
3502{
3503  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3504  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3505}
3506
3507static inline vec_uint4 vec_lvlx(int a, unsigned int *b)
3508{
3509  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3510  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3511}
3512
3513static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b)
3514{
3515  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3516  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3517}
3518
3519static inline vec_int4 vec_lvlx(int a, signed int *b)
3520{
3521  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3522  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3523}
3524
3525static inline vec_int4 vec_lvlx(int a, vec_int4 *b)
3526{
3527  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3528  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3529}
3530
3531static inline vec_float4 vec_lvlx(int a, float *b)
3532{
3533  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3534  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3535}
3536
3537static inline vec_float4 vec_lvlx(int a, vec_float4 *b)
3538{
3539  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3540  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3541}
3542
3543
3544/* vec_lvlxl (load vector left indexed last)
3545 * =========
3546 */
3547#define vec_lvlxl(_a, _b)	vec_lvlx(_a, _b)
3548
3549
3550/* vec_lvrx (load vector right indexed)
3551 * ========
3552 */
3553static inline vec_uchar16 vec_lvrx(int a, unsigned char *b)
3554{
3555  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3556  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3557}
3558
3559static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b)
3560{
3561  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3562  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3563}
3564
3565static inline vec_char16 vec_lvrx(int a, signed char *b)
3566{
3567  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3568  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3569}
3570
3571static inline vec_char16 vec_lvrx(int a, vec_char16 *b)
3572{
3573  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3574  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3575}
3576
3577static inline vec_ushort8 vec_lvrx(int a, unsigned short *b)
3578{
3579  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3580  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3581}
3582
3583static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b)
3584{
3585  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3586  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3587}
3588
3589static inline vec_short8 vec_lvrx(int a, signed short *b)
3590{
3591  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3592  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3593}
3594
3595static inline vec_short8 vec_lvrx(int a, vec_short8 *b)
3596{
3597  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3598  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3599}
3600
3601static inline vec_uint4 vec_lvrx(int a, unsigned int *b)
3602{
3603  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3604  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3605}
3606
3607static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b)
3608{
3609  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3610  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3611}
3612
3613static inline vec_int4 vec_lvrx(int a, signed int *b)
3614{
3615  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3616  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3617}
3618
3619static inline vec_int4 vec_lvrx(int a, vec_int4 *b)
3620{
3621  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3622  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3623}
3624
3625static inline vec_float4 vec_lvrx(int a, float *b)
3626{
3627  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3628  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3629}
3630
3631static inline vec_float4 vec_lvrx(int a, vec_float4 *b)
3632{
3633  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3634  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
3635}
3636
3637
3638
3639/* vec_lvrxl (load vector right indexed last)
3640 * =========
3641 */
3642#define vec_lvrxl(_a, _b)	vec_lvrx(_a, _b)
3643
3644
3645/* vec_promote (promote scalar to a vector)
3646 * ===========
3647 */
3648#define vec_promote(_a, _element)	spu_promote(_a, _element)
3649
3650
3651/* vec_splats (splat scalar to a vector)
3652 * ==========
3653 */
3654#define vec_splats(_a)	spu_splats(_a)
3655
3656
3657/* vec_stvlx (store vector left indexed)
3658 * =========
3659 */
3660static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c)
3661{
3662  int shift;
3663  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3664
3665  shift = -((int)p & 0xF);
3666  *p = spu_sel(*p,
3667	       spu_rlmaskqwbyte(a, shift),
3668	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3669}
3670
3671static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c)
3672{
3673  int shift;
3674  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
3675
3676  shift = -((int)p & 0xF);
3677  *p = spu_sel(*p,
3678	       spu_rlmaskqwbyte(a, shift),
3679	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3680}
3681
3682static inline void vec_stvlx(vec_char16 a, int b, signed char *c)
3683{
3684  int shift;
3685  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3686
3687  shift = -((int)p & 0xF);
3688  *p = spu_sel(*p,
3689	       spu_rlmaskqwbyte(a, shift),
3690	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3691}
3692
3693static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c)
3694{
3695  int shift;
3696  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
3697
3698  shift = -((int)p & 0xF);
3699  *p = spu_sel(*p,
3700	       spu_rlmaskqwbyte(a, shift),
3701	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
3702}
3703
3704static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c)
3705{
3706  int shift;
3707  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3708
3709  shift = -((int)p & 0xF);
3710  *p = spu_sel(*p,
3711	       spu_rlmaskqwbyte(a, shift),
3712	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3713}
3714
3715static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c)
3716{
3717  int shift;
3718  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
3719
3720  shift = -((int)p & 0xF);
3721  *p = spu_sel(*p,
3722	       spu_rlmaskqwbyte(a, shift),
3723	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3724}
3725
3726static inline void vec_stvlx(vec_short8 a, int b, signed short *c)
3727{
3728  int shift;
3729  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3730
3731  shift = -((int)p & 0xF);
3732  *p = spu_sel(*p,
3733	       spu_rlmaskqwbyte(a, shift),
3734	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3735}
3736
3737static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c)
3738{
3739  int shift;
3740  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);
3741
3742  shift = -((int)p & 0xF);
3743  *p = spu_sel(*p,
3744	       spu_rlmaskqwbyte(a, shift),
3745	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
3746}
3747
3748static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c)
3749{
3750  int shift;
3751  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3752
3753  shift = -((int)p & 0xF);
3754  *p = spu_sel(*p,
3755	       spu_rlmaskqwbyte(a, shift),
3756	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3757}
3758
3759static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c)
3760{
3761  int shift;
3762  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);
3763
3764  shift = -((int)p & 0xF);
3765  *p = spu_sel(*p,
3766	       spu_rlmaskqwbyte(a, shift),
3767	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3768}
3769
3770static inline void vec_stvlx(vec_int4 a, int b, signed int *c)
3771{
3772  int shift;
3773  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3774
3775  shift = -((int)p & 0xF);
3776  *p = spu_sel(*p,
3777	       spu_rlmaskqwbyte(a, shift),
3778	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3779}
3780
3781static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c)
3782{
3783  int shift;
3784  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);
3785
3786  shift = -((int)p & 0xF);
3787  *p = spu_sel(*p,
3788	       spu_rlmaskqwbyte(a, shift),
3789	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3790}
3791
3792static inline void vec_stvlx(vec_float4 a, int b, float *c)
3793{
3794  int shift;
3795  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3796
3797  shift = -((int)p & 0xF);
3798  *p = spu_sel(*p,
3799	       spu_rlmaskqwbyte(a, shift),
3800	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3801}
3802
3803static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c)
3804{
3805  int shift;
3806  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);
3807
3808  shift = -((int)p & 0xF);
3809  *p = spu_sel(*p,
3810	       spu_rlmaskqwbyte(a, shift),
3811	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
3812}
3813
3814/* vec_stvlxl (store vector left indexed last)
3815 * ==========
3816 */
3817#define vec_stvlxl(_a, _b, _c)	vec_stvlx(_a, _b, _c)
3818
3819
3820/* vec_stvrx (store vector right indexed)
3821 * =========
3822 */
static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

/* vec_stvrxl (store vector right indexed last)
 * ==========
 */
#define vec_stvrxl(_a, _b, _c)	vec_stvrx(_a, _b, _c)

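/* Illustrative sketch (not part of the original interface; the helper name
 * is hypothetical): on VMX the left/right indexed stores are typically
 * paired to write one vector to a possibly unaligned address, and the same
 * idiom works with the mappings above because each call merges only its own
 * bytes into the containing quadword:
 *
 *   static inline void store_unaligned(vec_uchar16 v, unsigned char *dst)
 *   {
 *     vec_stvlx(v, 0, dst);    // bytes of v up to the end of dst's quadword
 *     vec_stvrx(v, 16, dst);   // remaining bytes into the following quadword
 *   }
 */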

#endif /* __SPU__ */
#endif /* __cplusplus */
#endif /* !_VMX2SPU_H_ */