1/* Copyright (C) 2006-2015 Free Software Foundation, Inc. 2 3 This file is free software; you can redistribute it and/or modify it under 4 the terms of the GNU General Public License as published by the Free 5 Software Foundation; either version 3 of the License, or (at your option) 6 any later version. 7 8 This file is distributed in the hope that it will be useful, but WITHOUT 9 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 11 for more details. 12 13 Under Section 7 of GPL version 3, you are granted additional 14 permissions described in the GCC Runtime Library Exception, version 15 3.1, as published by the Free Software Foundation. 16 17 You should have received a copy of the GNU General Public License and 18 a copy of the GCC Runtime Library Exception along with this program; 19 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 20 <http://www.gnu.org/licenses/>. */ 21 22#ifndef _VMX2SPU_H_ 23#define _VMX2SPU_H_ 1 24 25#ifdef __cplusplus 26 27#ifdef __SPU__ 28 29#include <spu_intrinsics.h> 30#include <vec_types.h> 31 32/* This file maps generic VMX intrinsics and predicates to the SPU using 33 * overloaded C++ functions. 
34 */ 35 36/************************************************************************ 37 * INTRINSICS 38 ************************************************************************/ 39 40/* vec_abs (vector absolute value) 41 * ======= 42 */ 43static inline vec_char16 vec_abs(vec_char16 a) 44{ 45 vec_char16 minus_a; 46 47 minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101)); 48 return (spu_sel(minus_a, a, spu_cmpgt(a, -1))); 49} 50 51static inline vec_short8 vec_abs(vec_short8 a) 52{ 53 return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1))); 54} 55 56static inline vec_int4 vec_abs(vec_int4 a) 57{ 58 return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1))); 59} 60 61static inline vec_float4 vec_abs(vec_float4 a) 62{ 63 return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1))); 64} 65 66/* vec_abss (vector absolute value saturate) 67 * ======== 68 */ 69static inline vec_char16 vec_abss(vec_char16 a) 70{ 71 vec_char16 minus_a; 72 73 minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)), 74 (vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1))); 75 return (spu_sel(minus_a, a, spu_cmpgt(a, -1))); 76} 77 78static inline vec_short8 vec_abss(vec_short8 a) 79{ 80 vec_short8 minus_a; 81 82 minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000})))); 83 return (spu_sel(minus_a, a, spu_cmpgt(a, -1))); 84} 85 86static inline vec_int4 vec_abss(vec_int4 a) 87{ 88 vec_int4 minus_a; 89 90 minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000})))); 91 return (spu_sel(minus_a, a, spu_cmpgt(a, -1))); 92} 93 94 95/* vec_add (vector add) 96 * ======= 97 */ 98static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b) 99{ 100 return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)), 101 spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)), 102 spu_splats((unsigned 
short)(0xFF00))))); 103} 104 105static inline vec_char16 vec_add(vec_char16 a, vec_char16 b) 106{ 107 return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b))); 108} 109 110static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b) 111{ 112 return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b))); 113} 114 115static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b) 116{ 117 return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b))); 118} 119 120static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b) 121{ 122 return (spu_add(a, b)); 123} 124 125static inline vec_short8 vec_add(vec_short8 a, vec_short8 b) 126{ 127 return (spu_add(a, b)); 128} 129 130static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b) 131{ 132 return (spu_add((vec_short8)(a), b)); 133} 134 135static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b) 136{ 137 return (spu_add(a, (vec_short8)(b))); 138} 139 140static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b) 141{ 142 return (spu_add(a, b)); 143} 144 145static inline vec_int4 vec_add(vec_int4 a, vec_int4 b) 146{ 147 return (spu_add(a, b)); 148} 149 150static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b) 151{ 152 return (spu_add((vec_int4)(a), b)); 153} 154 155static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b) 156{ 157 return (spu_add(a, (vec_int4)(b))); 158} 159 160static inline vec_float4 vec_add(vec_float4 a, vec_float4 b) 161{ 162 return (spu_add(a, b)); 163} 164 165/* vec_addc (vector add carryout unsigned word) 166 * ======== 167 */ 168#define vec_addc(_a, _b) spu_genc(_a, _b) 169 170/* vec_adds (vector add saturated) 171 * ======== 172 */ 173static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b) 174{ 175 vec_uchar16 s1, s2, s, d; 176 177 s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8))); 178 s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF))); 179 s = spu_shuffle(s1, s2, 
((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22, 180 8, 24, 10, 26, 12, 28, 14, 30})); 181 d = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23, 182 9, 25, 11, 27, 13, 29, 15, 31})); 183 return (spu_or(d, spu_cmpeq(s, 1))); 184} 185 186static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b) 187{ 188 vec_uchar16 s1, s2, s, d; 189 190 s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8))); 191 s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF))); 192 s = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23, 193 9, 25, 11, 27, 13, 29, 15, 31})); 194 d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F)); 195 d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F)); 196 return ((vec_char16)(d)); 197} 198 199static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b) 200{ 201 return (vec_adds((vec_char16)(a), b)); 202} 203 204static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b) 205{ 206 return (vec_adds(a, (vec_char16)(b))); 207} 208 209static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b) 210{ 211 vec_ushort8 s, d; 212 213 s = spu_add(a, b); 214 d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15)); 215 return (d); 216} 217 218static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b) 219{ 220 vec_short8 s, d; 221 222 s = spu_add(a, b); 223 d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15))); 224 d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15))); 225 return (d); 226} 227 228static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b) 229{ 230 return (vec_adds((vec_short8)(a), b)); 231} 232 233static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b) 234{ 235 return (vec_adds(a, (vec_short8)(b))); 
236} 237 238static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b) 239{ 240 return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31))); 241} 242 243static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b) 244{ 245 vec_int4 s, d; 246 247 s = spu_add(a, b); 248 d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31)); 249 d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31)); 250 return (d); 251} 252 253static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b) 254{ 255 return (vec_adds((vec_int4)(a), b)); 256} 257 258static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b) 259{ 260 return (vec_adds(a, (vec_int4)(b))); 261} 262 263/* vec_and (vector logical and) 264 * ======= 265 */ 266static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b) 267{ 268 return (spu_and(a, b)); 269} 270 271static inline vec_char16 vec_and(vec_char16 a, vec_char16 b) 272{ 273 return (spu_and(a, b)); 274} 275 276static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b) 277{ 278 return (spu_and((vec_char16)(a), b)); 279} 280 281static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b) 282{ 283 return (spu_and(a, (vec_char16)(b))); 284} 285 286static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b) 287{ 288 return (spu_and(a, b)); 289} 290 291static inline vec_short8 vec_and(vec_short8 a, vec_short8 b) 292{ 293 return (spu_and(a, b)); 294} 295 296static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b) 297{ 298 return (spu_and((vec_short8)(a), b)); 299} 300 301static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b) 302{ 303 return (spu_and(a, (vec_short8)(b))); 304} 305 306static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b) 307{ 308 return (spu_and(a, b)); 309} 310 311static inline vec_int4 vec_and(vec_int4 a, vec_int4 b) 312{ 313 return (spu_and(a, b)); 314} 315 316static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b) 317{ 318 
return (spu_and((vec_int4)(a), b)); 319} 320 321static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b) 322{ 323 return (spu_and(a, (vec_int4)(b))); 324} 325 326static inline vec_float4 vec_and(vec_float4 a, vec_float4 b) 327{ 328 return (spu_and(a, b)); 329} 330 331static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b) 332{ 333 return (spu_and((vec_float4)(a),b)); 334} 335 336static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b) 337{ 338 return (spu_and(a, (vec_float4)(b))); 339} 340 341 342/* vec_andc (vector logical and with complement) 343 * ======== 344 */ 345static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b) 346{ 347 return (spu_andc(a, b)); 348} 349 350static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b) 351{ 352 return (spu_andc(a, b)); 353} 354 355static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b) 356{ 357 return (spu_andc((vec_char16)(a), b)); 358} 359 360static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b) 361{ 362 return (spu_andc(a, (vec_char16)(b))); 363} 364 365static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b) 366{ 367 return (spu_andc(a, b)); 368} 369 370static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b) 371{ 372 return (spu_andc(a, b)); 373} 374 375static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b) 376{ 377 return (spu_andc((vec_short8)(a), b)); 378} 379 380static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b) 381{ 382 return (spu_andc(a, (vec_short8)(b))); 383} 384 385static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b) 386{ 387 return (spu_andc(a, b)); 388} 389 390static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b) 391{ 392 return (spu_andc(a, b)); 393} 394 395static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b) 396{ 397 return (spu_andc((vec_int4)(a), b)); 398} 399 400static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b) 401{ 402 return (spu_andc(a, (vec_int4)(b))); 403} 404 405static inline vec_float4 vec_andc(vec_float4 a, 
vec_float4 b) 406{ 407 return (spu_andc(a,b)); 408} 409 410static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b) 411{ 412 return (spu_andc((vec_float4)(a),b)); 413} 414 415static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b) 416{ 417 return (spu_andc(a, (vec_float4)(b))); 418} 419 420/* vec_avg (vector average) 421 * ======= 422 */ 423static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b) 424{ 425 return (spu_avg(a, b)); 426} 427 428static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b) 429{ 430 return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)), 431 (vec_uchar16)(spu_and(spu_xor(a,b), 0x80))))); 432} 433 434static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b) 435{ 436 return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)), 437 spu_and(spu_or(a, b), 1))); 438} 439 440static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b) 441{ 442 return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)), 443 spu_and(spu_or(a, b), 1))); 444} 445 446static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b) 447{ 448 return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)), 449 spu_and(spu_or(a, b), 1))); 450} 451 452static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b) 453{ 454 return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)), 455 spu_and(spu_or(a, b), 1))); 456} 457 458 459/* vec_ceil (vector ceiling) 460 * ======== 461 */ 462static inline vec_float4 vec_ceil(vec_float4 a) 463{ 464 vec_int4 exp; 465 vec_uint4 mask; 466 467 a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF)))); 468 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF))); 469 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp); 470 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31)); 471 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1)); 472 473 return 
((vec_float4)(spu_andc((vec_uint4)(a), mask))); 474} 475 476 477/* vec_cmpb (vector compare bounds floating-point) 478 * ======== 479 */ 480static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b) 481{ 482 vec_int4 b0 = (vec_int4)spu_splats(0x80000000); 483 vec_int4 b1 = (vec_int4)spu_splats(0x40000000); 484 485 return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0), 486 spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1))); 487} 488 489/* vec_cmpeq (vector compare equal) 490 * ========= 491 */ 492#define vec_cmpeq(_a, _b) spu_cmpeq(_a, _b) 493 494 495/* vec_cmpge (vector compare greater than or equal) 496 * ========= 497 */ 498static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b) 499{ 500 return (spu_xor(spu_cmpgt(b, a), -1)); 501} 502 503 504/* vec_cmpgt (vector compare greater than) 505 * ========= 506 */ 507#define vec_cmpgt(_a, _b) spu_cmpgt(_a, _b) 508 509 510/* vec_cmple (vector compare less than or equal) 511 * ========= 512 */ 513static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b) 514{ 515 return (spu_xor(spu_cmpgt(a, b), -1)); 516} 517 518 519/* vec_cmplt (vector compare less than) 520 * ========= 521 */ 522#define vec_cmplt(_a, _b) spu_cmpgt(_b, _a) 523 524 525/* vec_ctf (vector convert from fixed-point word) 526 * ======= 527 */ 528#define vec_ctf(_a, _b) spu_convtf(_a, _b) 529 530 531/* vec_cts (vector convert to signed fixed-point word saturate) 532 * ======= 533 */ 534#define vec_cts(_a, _b) spu_convts(_a, _b) 535 536 537/* vec_ctu (vector convert to unsigned fixed-point word saturate) 538 * ======= 539 */ 540#define vec_ctu(_a, _b) spu_convtu(_a, _b) 541 542 543/* vec_dss (vector data stream stop) 544 * ======= 545 */ 546#define vec_dss(_a) 547 548 549/* vec_dssall (vector data stream stop all) 550 * ========== 551 */ 552#define vec_dssall() 553 554 555/* vec_dst (vector data stream touch) 556 * ======= 557 */ 558#define vec_dst(_a, _b, _c) 559 560 561/* vec_dstst (vector data stream touch for store) 562 * ========= 
563 */ 564#define vec_dstst(_a, _b, _c) 565 566 567/* vec_dststt (vector data stream touch for store transient) 568 * ========== 569 */ 570#define vec_dststt(_a, _b, _c) 571 572 573/* vec_dstt (vector data stream touch transient) 574 * ======== 575 */ 576#define vec_dstt(_a, _b, _c) 577 578 579/* vec_expte (vector is 2 raised tp the exponent estimate floating-point) 580 * ========= 581 */ 582static inline vec_float4 vec_expte(vec_float4 a) 583{ 584 vec_float4 bias, frac, exp; 585 vec_int4 ia; 586 587 bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31))); 588 ia = spu_convts(spu_add(a, bias), 0); 589 frac = spu_sub(spu_convtf(ia, 0), a); 590 exp = (vec_float4)(spu_sl(spu_add(ia, 127), 23)); 591 592 return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)), 593 frac, spu_splats(1.0f)), exp)); 594} 595 596 597/* vec_floor (vector floor) 598 * ========= 599 */ 600static inline vec_float4 vec_floor(vec_float4 a) 601{ 602 vec_int4 exp; 603 vec_uint4 mask; 604 605 a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF)))); 606 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF))); 607 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp); 608 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31)); 609 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1)); 610 611 return ((vec_float4)(spu_andc((vec_uint4)(a), mask))); 612} 613 614 615/* vec_ld (vector load indexed) 616 * ====== 617 */ 618static inline vec_uchar16 vec_ld(int a, unsigned char *b) 619{ 620 return (*((vec_uchar16 *)(b+a))); 621} 622 623static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b) 624{ 625 return (*((vec_uchar16 *)((unsigned char *)(b)+a))); 626} 627 628static inline vec_char16 vec_ld(int a, signed char *b) 629{ 630 return (*((vec_char16 *)(b+a))); 631} 632 633static inline vec_char16 vec_ld(int a, 
vec_char16 *b) 634{ 635 return (*((vec_char16 *)((signed char *)(b)+a))); 636} 637 638static inline vec_ushort8 vec_ld(int a, unsigned short *b) 639{ 640 return (*((vec_ushort8 *)((unsigned char *)(b)+a))); 641} 642 643static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b) 644{ 645 return (*((vec_ushort8 *)((unsigned char *)(b)+a))); 646} 647 648static inline vec_short8 vec_ld(int a, signed short *b) 649{ 650 return (*((vec_short8 *)((unsigned char *)(b)+a))); 651} 652 653static inline vec_short8 vec_ld(int a, vec_short8 *b) 654{ 655 return (*((vec_short8 *)((signed char *)(b)+a))); 656} 657 658static inline vec_uint4 vec_ld(int a, unsigned int *b) 659{ 660 return (*((vec_uint4 *)((unsigned char *)(b)+a))); 661} 662 663static inline vec_uint4 vec_ld(int a, vec_uint4 *b) 664{ 665 return (*((vec_uint4 *)((unsigned char *)(b)+a))); 666} 667 668static inline vec_int4 vec_ld(int a, signed int *b) 669{ 670 return (*((vec_int4 *)((unsigned char *)(b)+a))); 671} 672 673static inline vec_int4 vec_ld(int a, vec_int4 *b) 674{ 675 return (*((vec_int4 *)((signed char *)(b)+a))); 676} 677 678static inline vec_float4 vec_ld(int a, float *b) 679{ 680 return (*((vec_float4 *)((unsigned char *)(b)+a))); 681} 682 683static inline vec_float4 vec_ld(int a, vec_float4 *b) 684{ 685 return (*((vec_float4 *)((unsigned char *)(b)+a))); 686} 687 688/* vec_lde (vector load element indexed) 689 * ======= 690 */ 691static inline vec_uchar16 vec_lde(int a, unsigned char *b) 692{ 693 return (*((vec_uchar16 *)(b+a))); 694} 695 696static inline vec_char16 vec_lde(int a, signed char *b) 697{ 698 return (*((vec_char16 *)(b+a))); 699} 700 701static inline vec_ushort8 vec_lde(int a, unsigned short *b) 702{ 703 return (*((vec_ushort8 *)((unsigned char *)(b)+a))); 704} 705 706static inline vec_short8 vec_lde(int a, signed short *b) 707{ 708 return (*((vec_short8 *)((unsigned char *)(b)+a))); 709} 710 711 712static inline vec_uint4 vec_lde(int a, unsigned int *b) 713{ 714 return (*((vec_uint4 
*)((unsigned char *)(b)+a))); 715} 716 717static inline vec_int4 vec_lde(int a, signed int *b) 718{ 719 return (*((vec_int4 *)((unsigned char *)(b)+a))); 720} 721 722 723static inline vec_float4 vec_lde(int a, float *b) 724{ 725 return (*((vec_float4 *)((unsigned char *)(b)+a))); 726} 727 728/* vec_ldl (vector load indexed LRU) 729 * ======= 730 */ 731#define vec_ldl(_a, _b) vec_ld(_a, _b) 732 733 734/* vec_loge (vector log2 estimate floating-point) 735 * ======== 736 */ 737static inline vec_float4 vec_loge(vec_float4 a) 738{ 739 vec_int4 exp; 740 vec_float4 frac; 741 742 exp = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127); 743 frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23))); 744 745 return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)), 746 frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f)))); 747} 748 749 750/* vec_lvsl (vector load for shift left) 751 * ======== 752 */ 753static inline vec_uchar16 vec_lvsl(int a, unsigned char *b) 754{ 755 return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))), 756 ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607, 757 0x0809, 0x0A0B, 0x0C0D, 0x0E0F}))); 758} 759 760static inline vec_uchar16 vec_lvsl(int a, signed char *b) 761{ 762 return (vec_lvsl(a, (unsigned char *)b)); 763} 764 765static inline vec_uchar16 vec_lvsl(int a, unsigned short *b) 766{ 767 return (vec_lvsl(a, (unsigned char *)b)); 768} 769 770static inline vec_uchar16 vec_lvsl(int a, short *b) 771{ 772 return (vec_lvsl(a, (unsigned char *)b)); 773} 774 775static inline vec_uchar16 vec_lvsl(int a, unsigned int *b) 776{ 777 return (vec_lvsl(a, (unsigned char *)b)); 778} 779 780static inline vec_uchar16 vec_lvsl(int a, int *b) 781{ 782 return (vec_lvsl(a, (unsigned char *)b)); 783} 784 785static inline vec_uchar16 vec_lvsl(int a, float *b) 786{ 787 return (vec_lvsl(a, (unsigned char *)b)); 788} 789 790 791/* vec_lvsr (vector load for shift right) 792 * ======== 793 
*/ 794static inline vec_uchar16 vec_lvsr(int a, unsigned char *b) 795{ 796 return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617, 797 0x1819, 0x1A1B, 0x1C1D, 0x1E1F}), 798 (vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF)))))); 799} 800 801static inline vec_uchar16 vec_lvsr(int a, signed char *b) 802{ 803 return (vec_lvsr(a, (unsigned char *)b)); 804} 805 806static inline vec_uchar16 vec_lvsr(int a, unsigned short *b) 807{ 808 return (vec_lvsr(a, (unsigned char *)b)); 809} 810 811static inline vec_uchar16 vec_lvsr(int a, short *b) 812{ 813 return (vec_lvsr(a, (unsigned char *)b)); 814} 815 816static inline vec_uchar16 vec_lvsr(int a, unsigned int *b) 817{ 818 return (vec_lvsr(a, (unsigned char *)b)); 819} 820 821static inline vec_uchar16 vec_lvsr(int a, int *b) 822{ 823 return (vec_lvsr(a, (unsigned char *)b)); 824} 825 826static inline vec_uchar16 vec_lvsr(int a, float *b) 827{ 828 return (vec_lvsr(a, (unsigned char *)b)); 829} 830 831/* vec_madd (vector multiply add) 832 * ======== 833 */ 834#define vec_madd(_a, _b, _c) spu_madd(_a, _b, _c) 835 836 837 838/* vec_madds (vector multiply add saturate) 839 * ========= 840 */ 841static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c) 842{ 843 return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)), 844 (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)), 845 ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})))); 846} 847 848/* vec_max (vector maximum) 849 * ======= 850 */ 851static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b) 852{ 853 return (spu_sel(b, a, spu_cmpgt(a, b))); 854} 855 856static inline vec_char16 vec_max(vec_char16 a, vec_char16 b) 857{ 858 return (spu_sel(b, a, spu_cmpgt(a, b))); 859} 860 861static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b) 862{ 863 return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b))); 864} 865 866static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b) 867{ 868 return 
(spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b)))); 869} 870 871static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b) 872{ 873 return (spu_sel(b, a, spu_cmpgt(a, b))); 874} 875 876static inline vec_short8 vec_max(vec_short8 a, vec_short8 b) 877{ 878 return (spu_sel(b, a, spu_cmpgt(a, b))); 879} 880 881static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b) 882{ 883 return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b))); 884} 885 886static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b) 887{ 888 return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b)))); 889} 890 891static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b) 892{ 893 return (spu_sel(b, a, spu_cmpgt(a, b))); 894} 895 896static inline vec_int4 vec_max(vec_int4 a, vec_int4 b) 897{ 898 return (spu_sel(b, a, spu_cmpgt(a, b))); 899} 900 901static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b) 902{ 903 return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b))); 904} 905 906static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b) 907{ 908 return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b)))); 909} 910 911static inline vec_float4 vec_max(vec_float4 a, vec_float4 b) 912{ 913 return (spu_sel(b, a, spu_cmpgt(a, b))); 914} 915 916 917/* vec_mergeh (vector merge high) 918 * ========== 919 */ 920static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b) 921{ 922 return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19, 923 4, 20, 5, 21, 6, 22, 7, 23}))); 924} 925 926static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b) 927{ 928 return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19, 929 4, 20, 5, 21, 6, 22, 7, 23}))); 930} 931 932static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b) 933{ 934 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19, 935 4, 5, 20, 21, 6, 7, 22, 23}))); 936} 937 938static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b) 939{ 940 return 
(spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19, 941 4, 5, 20, 21, 6, 7, 22, 23}))); 942} 943 944static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b) 945{ 946 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19, 947 4, 5, 6, 7, 20, 21, 22, 23}))); 948} 949 950static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b) 951{ 952 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19, 953 4, 5, 6, 7, 20, 21, 22, 23}))); 954} 955 956static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b) 957{ 958 return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19, 959 4, 5, 6, 7, 20, 21, 22, 23}))); 960} 961 962/* vec_mergel (vector merge low) 963 * ========== 964 */ 965static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b) 966{ 967 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24, 9, 25, 10, 26, 11, 27, 968 12, 28, 13, 29, 14, 30, 15, 31}))); 969} 970 971static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b) 972{ 973 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24, 9, 25, 10, 26, 11, 27, 974 12, 28, 13, 29, 14, 30, 15, 31}))); 975} 976 977static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b) 978{ 979 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 24, 25, 10, 11, 26, 27, 980 12, 13, 28, 29, 14, 15, 30, 31}))); 981} 982 983static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b) 984{ 985 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 24, 25, 10, 11, 26, 27, 986 12, 13, 28, 29, 14, 15, 30, 31}))); 987} 988 989static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b) 990{ 991 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 10, 11, 24, 25, 26, 27, 992 12, 13, 14, 15, 28, 29, 30, 31}))); 993} 994 995static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b) 996{ 997 return (spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 10, 11, 24, 25, 26, 27, 998 12, 13, 14, 15, 28, 29, 30, 31}))); 999} 1000 1001static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b) 1002{ 1003 return 
(spu_shuffle(a, b, ((vec_uchar16){ 8, 9, 10, 11, 24, 25, 26, 27, 1004 12, 13, 14, 15, 28, 29, 30, 31}))); 1005} 1006 1007/* vec_mfvscr (vector move from vector status and control register) 1008 * ========== 1009 */ 1010static inline vec_ushort8 vec_mfvscr() 1011{ 1012 return ((vec_ushort8)spu_splats(0)); /* not supported */ 1013} 1014 1015 1016/* vec_min (vector minimum) 1017 * ======= 1018 */ 1019static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b) 1020{ 1021 return (spu_sel(a, b, spu_cmpgt(a, b))); 1022} 1023 1024static inline vec_char16 vec_min(vec_char16 a, vec_char16 b) 1025{ 1026 return (spu_sel(a, b, spu_cmpgt(a, b))); 1027} 1028 1029static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b) 1030{ 1031 return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b))); 1032} 1033 1034static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b) 1035{ 1036 return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b)))); 1037} 1038 1039static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b) 1040{ 1041 return (spu_sel(a, b, spu_cmpgt(a, b))); 1042} 1043 1044static inline vec_short8 vec_min(vec_short8 a, vec_short8 b) 1045{ 1046 return (spu_sel(a, b, spu_cmpgt(a, b))); 1047} 1048 1049static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b) 1050{ 1051 return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b))); 1052} 1053 1054static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b) 1055{ 1056 return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b)))); 1057} 1058 1059static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b) 1060{ 1061 return (spu_sel(a, b, spu_cmpgt(a, b))); 1062} 1063 1064static inline vec_int4 vec_min(vec_int4 a, vec_int4 b) 1065{ 1066 return (spu_sel(a, b, spu_cmpgt(a, b))); 1067} 1068 1069static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b) 1070{ 1071 return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b))); 1072} 1073 1074static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b) 
1075{ 1076 return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b)))); 1077} 1078 1079static inline vec_float4 vec_min(vec_float4 a, vec_float4 b) 1080{ 1081 return (spu_sel(a, b, spu_cmpgt(a, b))); 1082} 1083 1084/* vec_mladd (vector multiply low and add unsigned half word) 1085 * ========= 1086 */ 1087static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c) 1088{ 1089 return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)), 1090 (vec_short8)(spu_rl((vec_uint4)(b), -16)), 1091 (vec_int4)(spu_rl((vec_uint4)(c), -16))), 1092 spu_madd(a, b, spu_extend(c)), 1093 ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23, 1094 10, 11, 26, 27, 14, 15, 30, 31})))); 1095} 1096 1097 1098static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c) 1099{ 1100 return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c)))); 1101} 1102 1103static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c) 1104{ 1105 return (vec_mladd((vec_short8)(a), b, c)); 1106} 1107 1108static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c) 1109{ 1110 return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c))); 1111} 1112 1113 1114/* vec_mradds (vector multiply round and add saturate) 1115 * ========== 1116 */ 1117static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c) 1118{ 1119 vec_int4 round = (vec_int4)spu_splats(0x4000); 1120 vec_short8 hi, lo; 1121 1122 hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1)); 1123 lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15)); 1124 1125 return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c)); 1126} 1127 1128 1129/* vec_msum (vector multiply sum) 1130 * ======== 1131 */ 1132static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c) 1133{ 1134 vec_ushort8 a1, a2, b1, b2; 1135 vec_uint4 p1, p2; 1136 1137 a1 = spu_and((vec_ushort8)(a), 0xFF); 
1138 a2 = spu_rlmask((vec_ushort8)(a), -8); 1139 b1 = spu_and((vec_ushort8)(b), 0xFF); 1140 b2 = spu_rlmask((vec_ushort8)(b), -8); 1141 1142 p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2))); 1143 p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2))); 1144 return (spu_add(p2, spu_add(p1, c))); 1145} 1146 1147static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c) 1148{ 1149 vec_short8 a1, a2, b1, b2; 1150 vec_int4 p1, p2; 1151 1152 a1 = (vec_short8)(spu_extend(a)); 1153 a2 = spu_rlmaska((vec_short8)(a), -8); 1154 b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF)); 1155 b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8); 1156 1157 p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2))); 1158 p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2))); 1159 return (spu_add(p2, spu_add(p1, c))); 1160} 1161 1162static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c) 1163{ 1164 return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c)); 1165} 1166 1167static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c) 1168{ 1169 return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c)); 1170} 1171 1172 1173/* vec_msums (vector multiply sum saturate) 1174 * ======== 1175 */ 1176static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c) 1177{ 1178 vec_uint4 p1, p2; 1179 1180 p1 = spu_mulo(a, b); 1181 p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2)); 1182 1183 return (vec_adds(p2, vec_adds(p1, c))); 1184} 1185 1186static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c) 1187{ 1188 return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c)); 1189} 1190 1191/* vec_mtvscr (vector move to vector status and control register) 1192 * ========== 1193 */ 1194#define 
vec_mtvscr(_a) /* not supported */ 1195 1196 1197/* vec_mule (vector multiply even) 1198 * ======== 1199 */ 1200static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b) 1201{ 1202 vec_ushort8 hi, lo; 1203 1204 hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)), 1205 (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24))); 1206 lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)), 1207 (vec_ushort8)(spu_rlmask((vec_short8)(b), -8))); 1208 1209 return (spu_shuffle(hi, lo, ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23, 1210 10, 11, 26, 27, 14, 15, 30, 31}))); 1211} 1212 1213static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b) 1214{ 1215 vec_short8 hi, lo; 1216 1217 hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)), 1218 (vec_short8)(spu_rlmaska((vec_uint4)(b), -24))); 1219 lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)), 1220 (vec_short8)(spu_rlmaska((vec_short8)(b), -8))); 1221 1222 return (spu_shuffle(hi, lo, ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23, 1223 10, 11, 26, 27, 14, 15, 30, 31}))); 1224} 1225 1226static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b) 1227{ 1228 return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16), 1229 (vec_ushort8)spu_rlmask((vec_uint4)(b), -16))); 1230} 1231 1232 1233static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b) 1234{ 1235 return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16), 1236 (vec_short8)spu_rlmaska((vec_int4)(b), -16))); 1237} 1238 1239 1240/* vec_mulo (vector multiply odd) 1241 * ======== 1242 */ 1243static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b) 1244{ 1245 vec_ushort8 hi, lo; 1246 1247 hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)), 1248 (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF))); 1249 lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)); 1250 1251 return (spu_shuffle(hi, lo, 
((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23, 1252 10, 11, 26, 27, 14, 15, 30, 31}))); 1253} 1254 1255static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b) 1256{ 1257 vec_short8 aa, bb, hi, lo; 1258 1259 aa = spu_extend(a); 1260 bb = spu_extend(b); 1261 1262 hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)), 1263 (vec_short8)(spu_rlmaska((vec_uint4)(bb), -16))); 1264 lo = (vec_short8)spu_mulo(aa, bb); 1265 return (spu_shuffle(hi, lo, ((vec_uchar16){ 2, 3, 18, 19, 6, 7, 22, 23, 1266 10, 11, 26, 27, 14, 15, 30, 31}))); 1267} 1268 1269static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b) 1270{ 1271 return (spu_mulo(a, b)); 1272} 1273 1274 1275static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b) 1276{ 1277 return (spu_mulo(a, b)); 1278} 1279 1280 1281/* vec_nmsub (vector negative multiply subtract) 1282 * ========= 1283 */ 1284#define vec_nmsub(_a, _b, _c) spu_nmsub(_a, _b, _c) 1285 1286 1287/* vec_nor (vector logical nor) 1288 * ======= 1289 */ 1290#define vec_nor(_a, _b) spu_nor(_a, _b) 1291 1292 1293/* vec_or (vector logical or) 1294 * ====== 1295 */ 1296static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b) 1297{ 1298 return (spu_or(a, b)); 1299} 1300 1301static inline vec_char16 vec_or(vec_char16 a, vec_char16 b) 1302{ 1303 return (spu_or(a, b)); 1304} 1305 1306static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b) 1307{ 1308 return (spu_or((vec_char16)(a), b)); 1309} 1310 1311static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b) 1312{ 1313 return (spu_or(a, (vec_char16)(b))); 1314} 1315 1316static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b) 1317{ 1318 return (spu_or(a, b)); 1319} 1320 1321static inline vec_short8 vec_or(vec_short8 a, vec_short8 b) 1322{ 1323 return (spu_or(a, b)); 1324} 1325 1326static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b) 1327{ 1328 return (spu_or((vec_short8)(a), b)); 1329} 1330 1331static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b) 1332{ 
1333 return (spu_or(a, (vec_short8)(b))); 1334} 1335 1336static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b) 1337{ 1338 return (spu_or(a, b)); 1339} 1340 1341static inline vec_int4 vec_or(vec_int4 a, vec_int4 b) 1342{ 1343 return (spu_or(a, b)); 1344} 1345 1346static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b) 1347{ 1348 return (spu_or((vec_int4)(a), b)); 1349} 1350 1351static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b) 1352{ 1353 return (spu_or(a, (vec_int4)(b))); 1354} 1355 1356static inline vec_float4 vec_or(vec_float4 a, vec_float4 b) 1357{ 1358 return (spu_or(a, b)); 1359} 1360 1361static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b) 1362{ 1363 return (spu_or((vec_float4)(a),b)); 1364} 1365 1366static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b) 1367{ 1368 return (spu_or(a, (vec_float4)(b))); 1369} 1370 1371 1372/* vec_pack (vector pack) 1373 * ======== 1374 */ 1375static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b) 1376{ 1377 return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15, 1378 17, 19, 21, 23, 25, 27, 29, 31}))); 1379} 1380 1381static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b) 1382{ 1383 return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15, 1384 17, 19, 21, 23, 25, 27, 29, 31}))); 1385} 1386 1387static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b) 1388{ 1389 return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15, 1390 18, 19, 22, 23, 26, 27, 30, 31}))); 1391} 1392 1393static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b) 1394{ 1395 return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15, 1396 18, 19, 22, 23, 26, 27, 30, 31}))); 1397} 1398 1399 1400/* vec_packpx (vector pack pixel) 1401 * ========== 1402 */ 1403static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b) 1404{ 1405 vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF)); 1406 vec_uint4 x001F = 
(vec_uint4)(spu_splats((unsigned short)0x001F)); 1407 1408 return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF), 1409 spu_sl(a, 13), x001F), 1410 spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF), 1411 spu_sl(b, 13), x001F), 1412 ((vec_uchar16){ 0, 1, 4, 5, 8, 9, 12, 13, 1413 16, 17, 20, 21, 24, 25, 28, 29})))); 1414} 1415 1416 1417/* vec_packs (vector pack saturate) 1418 * ========= 1419 */ 1420static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b) 1421{ 1422 vec_ushort8 max = spu_splats((unsigned short)0x00FF); 1423 1424 return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)), 1425 spu_sel(b, max, spu_cmpgt(b, 255)), 1426 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15, 1427 17, 19, 21, 23, 25, 27, 29, 31})))); 1428} 1429 1430static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b) 1431{ 1432 vec_short8 max = spu_splats((signed short)0x007F); 1433 vec_short8 min = spu_splats((signed short)0xFF80); 1434 1435 return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)), 1436 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)), 1437 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15, 1438 17, 19, 21, 23, 25, 27, 29, 31})))); 1439} 1440 1441static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b) 1442{ 1443 vec_uint4 max = spu_splats((unsigned int)0x0000FFFF); 1444 1445 return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)), 1446 spu_sel(b, max, spu_cmpgt(b, max)), 1447 ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15, 1448 18, 19, 22, 23, 26, 27, 30, 31})))); 1449} 1450 1451static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b) 1452{ 1453 vec_int4 max = spu_splats((signed int)0x00007FFF); 1454 vec_int4 min = spu_splats((signed int)0xFFFF8000); 1455 1456 return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)), 1457 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)), 1458 
((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15, 1459 18, 19, 22, 23, 26, 27, 30, 31})))); 1460} 1461 1462 1463/* vec_packsu (vector pack saturate unsigned) 1464 * ========== 1465 */ 1466static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b) 1467{ 1468 return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))), 1469 spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))), 1470 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15, 1471 17, 19, 21, 23, 25, 27, 29, 31}))); 1472} 1473 1474static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b) 1475{ 1476 vec_short8 max = spu_splats((signed short)0x00FF); 1477 vec_short8 min = spu_splats((signed short)0x0000); 1478 1479 return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)), 1480 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)), 1481 ((vec_uchar16){ 1, 3, 5, 7, 9, 11, 13, 15, 1482 17, 19, 21, 23, 25, 27, 29, 31})))); 1483 1484 return (vec_packsu((vec_ushort8)(a), (vec_ushort8)(b))); 1485} 1486 1487static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b) 1488{ 1489 vec_uint4 max = spu_splats((unsigned int)0xFFFF); 1490 1491 return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))), 1492 spu_or(b, (vec_uint4)(spu_cmpgt(b, max))), 1493 ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15, 1494 18, 19, 22, 23, 26, 27, 30, 31}))); 1495} 1496 1497static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b) 1498{ 1499 vec_int4 max = spu_splats((signed int)0x0000FFFF); 1500 vec_int4 min = spu_splats((signed int)0x00000000); 1501 1502 return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)), 1503 spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)), 1504 ((vec_uchar16){ 2, 3, 6, 7, 10, 11, 14, 15, 1505 18, 19, 22, 23, 26, 27, 30, 31})))); 1506} 1507 1508 1509/* vec_perm (vector permute) 1510 * ======== 1511 */ 1512static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, 
vec_uchar16 c) 1513{ 1514 return (spu_shuffle(a, b, spu_and(c, 0x1F))); 1515} 1516 1517static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c) 1518{ 1519 return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c))); 1520} 1521 1522static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c) 1523{ 1524 return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c))); 1525} 1526 1527static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c) 1528{ 1529 return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c))); 1530} 1531 1532static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c) 1533{ 1534 return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c))); 1535} 1536 1537static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c) 1538{ 1539 return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c))); 1540} 1541 1542static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c) 1543{ 1544 return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c))); 1545} 1546 1547 1548/* vec_re (vector reciprocal estimate) 1549 * ====== 1550 */ 1551#define vec_re(_a) spu_re(_a) 1552 1553 1554/* vec_rl (vector rotate left) 1555 * ====== 1556 */ 1557static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b) 1558{ 1559 vec_ushort8 r1, r2; 1560 1561 r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7)); 1562 r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)); 1563 return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)), spu_splats((unsigned short)0xFF)))); 1564} 1565 1566static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b) 1567{ 1568 return ((vec_char16)(vec_rl((vec_uchar16)(a), b))); 1569} 1570 1571static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b) 1572{ 1573 return (spu_rl(a, (vec_short8)(b))); 
1574} 1575 1576static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b) 1577{ 1578 return (spu_rl(a, (vec_short8)(b))); 1579} 1580 1581static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b) 1582{ 1583 return (spu_rl(a, (vec_int4)(b))); 1584} 1585 1586static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b) 1587{ 1588 return (spu_rl(a, (vec_int4)(b))); 1589} 1590 1591 1592/* vec_round (vector round) 1593 * ========= 1594 */ 1595static inline vec_float4 vec_round(vec_float4 a) 1596{ 1597 vec_float4 s_half, s_one, d; 1598 vec_uint4 odd; 1599 vec_uint4 msb = spu_splats((unsigned int)0x80000000); 1600 vec_float4 half = spu_splats(0.5f); 1601 vec_int4 exp; 1602 vec_uint4 mask; 1603 1604 s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb)); 1605 a = spu_add(a, s_half); 1606 s_one = spu_add(s_half, s_half); 1607 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF))); 1608 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp); 1609 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31)); 1610 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1)); 1611 1612 odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1); 1613 s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0)); 1614 s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0), 1615 (vec_float4)spu_cmpeq(odd, 1))); 1616 d = spu_andc(a, (vec_float4)(mask)); 1617 d = spu_sub(d, s_one); 1618 return (d); 1619} 1620 1621/* vec_rsqrte (vector reciprocal square root estimate) 1622 * ========== 1623 */ 1624#define vec_rsqrte(_a) spu_rsqrte(_a) 1625 1626 1627/* vec_sel (vector select) 1628 * ======= 1629 */ 1630#define vec_sel(_a, _b, _c) spu_sel(_a, _b, _c) 1631 1632 1633/* vec_sl (vector shift left) 1634 * ====== 1635 */ 1636static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b) 1637{ 1638 vec_ushort8 hi, lo; 1639 1640 lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF); 1641 hi 
= spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)); 1642 1643 return ((vec_uchar16)(spu_or(hi, lo))); 1644} 1645 1646static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b) 1647{ 1648 return ((vec_char16)(vec_sl((vec_uchar16)(a), b))); 1649} 1650 1651static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b) 1652{ 1653 return (spu_sl(a, spu_and(b, 15))); 1654} 1655 1656static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b) 1657{ 1658 return (spu_sl(a, spu_and((vec_ushort8)(b), 15))); 1659} 1660 1661static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b) 1662{ 1663 return (spu_sl(a, spu_and(b, 31))); 1664} 1665 1666static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b) 1667{ 1668 return (spu_sl(a, spu_and(b, 31))); 1669} 1670 1671 1672/* vec_sld (vector shift left double) 1673 * ======= 1674 */ 1675#define vec_sld(_a, _b, _c) spu_shuffle(_a, _b, ((vec_uchar16){ 0+(_c), 1+(_c), 2+(_c), 3+(_c), \ 1676 4+(_c), 5+(_c), 6+(_c), 7+(_c), \ 1677 8+(_c), 9+(_c), 10+(_c), 11+(_c), \ 1678 12+(_c), 13+(_c), 14+(_c), 15+(_c)})) 1679 1680 1681/* vec_sll (vector shift left long) 1682 * ======= 1683 */ 1684#define vec_sll(_a, _b) spu_slqw(_a, spu_extract((vec_uint4)(_b), 0)) 1685 1686 1687/* vec_slo (vector shift left by octet) 1688 * ======= 1689 */ 1690#define vec_slo(_a, _b) spu_slqwbytebc(_a, spu_extract((vec_uint4)(_b), 3) & 0x7F) 1691 1692 1693/* vec_splat (vector splat) 1694 * ========= 1695 */ 1696#define vec_splat(_a, _b) spu_splats(spu_extract(_a, _b)) 1697 1698 1699/* vec_splat_s8 (vector splat signed byte) 1700 * ============ 1701 */ 1702#define vec_splat_s8(_a) spu_splats((signed char)(_a)) 1703 1704 1705/* vec_splat_s16 (vector splat signed half-word) 1706 * ============= 1707 */ 1708#define vec_splat_s16(_a) spu_splats((signed short)(_a)) 1709 1710 1711/* vec_splat_s32 (vector splat signed word) 1712 * ============= 1713 */ 1714#define vec_splat_s32(_a) spu_splats((signed int)(_a)) 1715 1716 1717/* vec_splat_u8 
(vector splat unsigned byte) 1718 * ============ 1719 */ 1720#define vec_splat_u8(_a) spu_splats((unsigned char)(_a)) 1721 1722 1723/* vec_splat_u16 (vector splat unsigned half-word) 1724 * ============= 1725 */ 1726#define vec_splat_u16(_a) spu_splats((unsigned short)(_a)) 1727 1728 1729/* vec_splat_u32 (vector splat unsigned word) 1730 * ============= 1731 */ 1732#define vec_splat_u32(_a) spu_splats((unsigned int)(_a)) 1733 1734 1735/* vec_sr (vector shift right) 1736 * ====== 1737 */ 1738static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b) 1739{ 1740 vec_ushort8 hi, lo; 1741 1742 lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))); 1743 hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256); 1744 1745 return ((vec_uchar16)(spu_or(hi, lo))); 1746} 1747 1748static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b) 1749{ 1750 return ((vec_char16)(vec_sr((vec_uchar16)(a), b))); 1751} 1752 1753static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b) 1754{ 1755 return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15))))); 1756} 1757 1758static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b) 1759{ 1760 return ((vec_short8)(vec_sr((vec_ushort8)(a), b))); 1761} 1762 1763static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b) 1764{ 1765 return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31))))); 1766} 1767 1768static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b) 1769{ 1770 return ((vec_int4)(vec_sr((vec_uint4)(a), b))); 1771} 1772 1773 1774/* vec_sra (vector shift right algebraic) 1775 * ======= 1776 */ 1777static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b) 1778{ 1779 vec_short8 hi, lo; 1780 1781 lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF); 1782 hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 
7)))), -256); 1783 1784 return ((vec_char16)(spu_or(hi, lo))); 1785} 1786 1787static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b) 1788{ 1789 return ((vec_uchar16)(vec_sra((vec_char16)(a), b))); 1790} 1791 1792static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b) 1793{ 1794 return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15))))); 1795} 1796 1797static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b) 1798{ 1799 return ((vec_ushort8)(vec_sra((vec_short8)(a), b))); 1800} 1801 1802static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b) 1803{ 1804 return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31))))); 1805} 1806 1807static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b) 1808{ 1809 return ((vec_uint4)(vec_sra((vec_int4)(a), b))); 1810} 1811 1812 1813/* vec_srl (vector shift right long) 1814 * ======= 1815 */ 1816#define vec_srl(_a, _b) spu_rlmaskqw(_a, 0-spu_extract((vec_int4)(_b), 3)) 1817 1818 1819/* vec_sro (vector shift right by octet) 1820 * ======= 1821 */ 1822#define vec_sro(_a, _b) spu_rlmaskqwbyte(_a, 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF)) 1823 1824/* vec_st (vector store indexed) 1825 * ====== 1826 */ 1827static inline void vec_st(vec_uchar16 a, int b, unsigned char *c) 1828{ 1829 *((vec_uchar16 *)(c+b)) = a; 1830} 1831 1832static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c) 1833{ 1834 *((vec_uchar16 *)((unsigned char *)(c)+b)) = a; 1835} 1836 1837static inline void vec_st(vec_char16 a, int b, signed char *c) 1838{ 1839 *((vec_char16 *)(c+b)) = a; 1840} 1841 1842static inline void vec_st(vec_char16 a, int b, vec_char16 *c) 1843{ 1844 *((vec_char16 *)((signed char *)(c)+b)) = a; 1845} 1846 1847static inline void vec_st(vec_bchar16 a, int b, signed char *c) 1848{ 1849 *((vec_bchar16 *)((signed char *)(c)+b)) = a; 1850} 1851 1852static inline void vec_st(vec_ushort8 a, int b, unsigned short *c) 1853{ 1854 *((vec_ushort8 *)((unsigned char *)(c)+b)) = a; 1855} 1856 1857static inline void 
vec_st(vec_ushort8 a, int b, vec_ushort8 *c) 1858{ 1859 *((vec_ushort8 *)((unsigned char *)(c)+b)) = a; 1860} 1861 1862static inline void vec_st(vec_short8 a, int b, signed short *c) 1863{ 1864 *((vec_short8 *)((unsigned char *)(c)+b)) = a; 1865} 1866 1867static inline void vec_st(vec_short8 a, int b, vec_short8 *c) 1868{ 1869 *((vec_short8 *)((signed char *)(c)+b)) = a; 1870} 1871 1872static inline void vec_st(vec_bshort8 a, int b, signed short *c) 1873{ 1874 *((vec_bshort8 *)((signed char *)(c)+b)) = a; 1875} 1876 1877static inline void vec_st(vec_uint4 a, int b, unsigned int *c) 1878{ 1879 *((vec_uint4 *)((unsigned char *)(c)+b)) = a; 1880} 1881 1882static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c) 1883{ 1884 *((vec_uint4 *)((unsigned char *)(c)+b)) = a; 1885} 1886 1887static inline void vec_st(vec_int4 a, int b, signed int *c) 1888{ 1889 *((vec_int4 *)((unsigned char *)(c)+b)) = a; 1890} 1891 1892static inline void vec_st(vec_int4 a, int b, vec_int4 *c) 1893{ 1894 *((vec_int4 *)((signed char *)(c)+b)) = a; 1895} 1896 1897static inline void vec_st(vec_bint4 a, int b, signed int *c) 1898{ 1899 *((vec_bint4 *)((signed char *)(c)+b)) = a; 1900} 1901 1902static inline void vec_st(vec_float4 a, int b, float *c) 1903{ 1904 *((vec_float4 *)((unsigned char *)(c)+b)) = a; 1905} 1906 1907static inline void vec_st(vec_float4 a, int b, vec_float4 *c) 1908{ 1909 *((vec_float4 *)((unsigned char *)(c)+b)) = a; 1910} 1911 1912 1913/* vec_ste (vector store element indexed) 1914 * ======= 1915 */ 1916static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c) 1917{ 1918 unsigned char *ptr; 1919 1920 ptr = c + b; 1921 *ptr = spu_extract(a, (int)(ptr) & 15); 1922} 1923 1924static inline void vec_ste(vec_char16 a, int b, signed char *c) 1925{ 1926 vec_ste((vec_uchar16)(a), b, (unsigned char *)(c)); 1927} 1928 1929static inline void vec_ste(vec_bchar16 a, int b, signed char *c) 1930{ 1931 vec_ste((vec_uchar16)(a), b, (unsigned char *)(c)); 1932} 1933 1934static 
inline void vec_ste(vec_ushort8 a, int b, unsigned short *c) 1935{ 1936 unsigned short *ptr; 1937 1938 ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1); 1939 *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7); 1940} 1941 1942static inline void vec_ste(vec_short8 a, int b, signed short *c) 1943{ 1944 vec_ste((vec_ushort8)(a), b, (unsigned short *)(c)); 1945} 1946 1947static inline void vec_ste(vec_bshort8 a, int b, signed short *c) 1948{ 1949 vec_ste((vec_ushort8)(a), b, (unsigned short *)(c)); 1950} 1951 1952static inline void vec_ste(vec_uint4 a, int b, unsigned int *c) 1953{ 1954 unsigned int *ptr; 1955 1956 ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3); 1957 *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3); 1958} 1959 1960static inline void vec_ste(vec_int4 a, int b, signed int *c) 1961{ 1962 vec_ste((vec_uint4)(a), b, (unsigned int *)(c)); 1963} 1964 1965static inline void vec_ste(vec_bint4 a, int b, signed int *c) 1966{ 1967 vec_ste((vec_uint4)(a), b, (unsigned int *)(c)); 1968} 1969 1970static inline void vec_ste(vec_float4 a, int b, float *c) 1971{ 1972 vec_ste((vec_uint4)(a), b, (unsigned int *)(c)); 1973} 1974 1975 1976/* vec_stl (vector store indexed LRU) 1977 * ======= 1978 */ 1979#define vec_stl(_a, _b, _c) vec_st(_a, _b, _c) 1980 1981 1982/* vec_sub (vector subtract) 1983 * ======= 1984 */ 1985static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b) 1986{ 1987 return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)), 1988 spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)), 1989 spu_splats((unsigned short)0xFF00)))); 1990} 1991 1992static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b) 1993{ 1994 return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b)))); 1995} 1996 1997static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b) 1998{ 1999 return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b)))); 2000} 2001 2002static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b) 2003{ 
2004 return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b)))); 2005} 2006 2007static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b) 2008{ 2009 return (spu_sub(a, b)); 2010} 2011 2012static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b) 2013{ 2014 return (spu_sub(a, b)); 2015} 2016 2017static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b) 2018{ 2019 return (spu_sub((vec_short8)(a), b)); 2020} 2021 2022static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b) 2023{ 2024 return (spu_sub(a, (vec_short8)(b))); 2025} 2026 2027static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b) 2028{ 2029 return (spu_sub(a, b)); 2030} 2031 2032static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b) 2033{ 2034 return (spu_sub(a, b)); 2035} 2036 2037static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b) 2038{ 2039 return (spu_sub((vec_int4)(a), b)); 2040} 2041 2042static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b) 2043{ 2044 return (spu_sub(a, (vec_int4)(b))); 2045} 2046 2047static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b) 2048{ 2049 return (spu_sub(a, b)); 2050} 2051 2052 2053/* vec_subc (vector subtract carryout) 2054 * ======== 2055 */ 2056#define vec_subc(_a, _b) spu_genb(_a, _b) 2057 2058 2059/* vec_subs (vector subtract saturate) 2060 * ======== 2061 */ 2062static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b) 2063{ 2064 vec_ushort8 s1, s2; 2065 vec_uchar16 s, d; 2066 2067 s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)); 2068 s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)); 2069 s = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22, 2070 8, 24, 10, 26, 12, 28, 14, 30}))); 2071 d = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23, 2072 9, 25, 11, 27, 13, 29, 15, 31}))); 2073 return (spu_andc(d, s)); 2074} 2075 2076static inline vec_char16 vec_subs(vec_char16 a, vec_char16 b) 2077{ 2078 vec_ushort8 
s1, s2; 2079 vec_uchar16 s, d; 2080 2081 s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)); 2082 s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)); 2083 s = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23, 2084 9, 25, 11, 27, 13, 29, 15, 31}))); 2085 d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F)); 2086 d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F)); 2087 2088 return ((vec_char16)(d)); 2089} 2090 2091static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b) 2092{ 2093 return (vec_subs((vec_char16)(a), b)); 2094} 2095 2096static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b) 2097{ 2098 return (vec_subs(a, (vec_char16)(b))); 2099} 2100 2101static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b) 2102{ 2103 return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a))); 2104} 2105 2106static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b) 2107{ 2108 vec_short8 s; 2109 vec_short8 d; 2110 2111 s = spu_sub(a, b); 2112 d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15))); 2113 d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15))); 2114 2115 return (d); 2116} 2117 2118static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b) 2119{ 2120 return ((vec_short8)(vec_subs((vec_short8)(a), b))); 2121} 2122 2123static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b) 2124{ 2125 return ((vec_short8)(vec_subs(a, (vec_short8)(b)))); 2126} 2127 2128static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b) 2129{ 2130 return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a))); 2131} 2132 2133static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b) 2134{ 2135 vec_int4 s; 2136 vec_int4 d; 2137 2138 s = spu_sub(a, b); 2139 d = 
spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31))); 2140 d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31))); 2141 2142 return (d); 2143} 2144 2145static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b) 2146{ 2147 return ((vec_int4)(vec_subs((vec_int4)(a), b))); 2148} 2149 2150static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b) 2151{ 2152 return ((vec_int4)(vec_subs(a, (vec_int4)(b)))); 2153} 2154 2155 2156/* vec_sum4s (vector sum across partial (1/4) saturated) 2157 * ========= 2158 */ 2159static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b) 2160{ 2161 vec_uint4 a01_23, a0123; 2162 2163 a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8), 2164 spu_and((vec_ushort8)(a), 0xFF))); 2165 a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF)); 2166 return (vec_adds(a0123, b)); 2167} 2168 2169static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b) 2170{ 2171 vec_int4 a01_23, a0123; 2172 2173 a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8), 2174 spu_extend(a))); 2175 a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23))); 2176 return (vec_adds(a0123, b)); 2177} 2178 2179static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b) 2180{ 2181 vec_int4 a0123; 2182 2183 a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a)); 2184 return (vec_adds(a0123, b)); 2185} 2186 2187 2188/* vec_sum2s (vector sum across partial (1/2) saturated) 2189 * ========= 2190 */ 2191static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b) 2192{ 2193 vec_int4 c, d; 2194 vec_int4 sign1, sign2, sign3; 2195 vec_int4 carry, sum_l, sum_h, sat, sat_val; 2196 2197 sign1 = spu_rlmaska(a, -31); 2198 sign2 = spu_rlmaska(b, -31); 2199 2200 c = spu_rlqwbyte(a, -4); 2201 sign3 = spu_rlqwbyte(sign1, -4); 2202 2203 carry = spu_genc(a, b); 2204 sum_l = spu_add(a, b); 2205 sum_h = spu_addx(sign1, sign2, carry); 2206 2207 carry 
= spu_genc(sum_l, c); 2208 sum_l = spu_add(sum_l, c); 2209 sum_h = spu_addx(sum_h, sign3, carry); 2210 2211 sign1 = spu_rlmaska(sum_l, -31); 2212 sign2 = spu_rlmaska(sum_h, -31); 2213 2214 sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF)); 2215 2216 sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2)); 2217 2218 d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1}); 2219 2220 return (d); 2221} 2222 2223 2224/* vec_sums (vector sum saturated) 2225 * ======== 2226 */ 2227static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b) 2228{ 2229 vec_int4 a0, a1, a2, c0, c1, c2, d; 2230 vec_int4 sign_a, sign_b, sign_l, sign_h; 2231 vec_int4 sum_l, sum_h, sat, sat_val; 2232 2233 sign_a = spu_rlmaska(a, -31); 2234 sign_b = spu_rlmaska(b, -31); 2235 2236 a0 = spu_rlqwbyte(a, -12); 2237 a1 = spu_rlqwbyte(a, -8); 2238 a2 = spu_rlqwbyte(a, -4); 2239 2240 sum_l = spu_add(a, b); 2241 sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b)); 2242 2243 c2 = spu_genc(sum_l, a2); 2244 sum_l = spu_add(sum_l, a2); 2245 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2); 2246 2247 c1 = spu_genc(sum_l, a1); 2248 sum_l = spu_add(sum_l, a1); 2249 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1); 2250 2251 c0 = spu_genc(sum_l, a0); 2252 sum_l = spu_add(sum_l, a0); 2253 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0); 2254 2255 sign_l = spu_rlmaska(sum_l, -31); 2256 sign_h = spu_rlmaska(sum_h, -31); 2257 2258 sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF)); 2259 2260 sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h)); 2261 2262 d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1})); 2263 2264 return (d); 2265} 2266 2267 2268/* vec_trunc (vector truncate) 2269 * ========= 2270 */ 2271static inline vec_float4 vec_trunc(vec_float4 a) 2272{ 2273 vec_int4 exp; 2274 vec_uint4 mask; 2275 2276 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF))); 2277 
mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp); 2278 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31)); 2279 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1)); 2280 return (spu_andc(a, (vec_float4)(mask))); 2281} 2282 2283/* vec_unpackh (vector unpack high element) 2284 * =========== 2285 */ 2286static inline vec_short8 vec_unpackh(vec_char16 a) 2287{ 2288 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3, 2289 4, 4, 5, 5, 6, 6, 7, 7})))); 2290} 2291 2292static inline vec_bshort8 vec_unpackh(vec_bchar16 a) 2293{ 2294 return ((vec_bshort8)(vec_unpackh((vec_char16)(a)))); 2295} 2296 2297static inline vec_int4 vec_unpackh(vec_short8 a) 2298{ 2299 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3, 2300 0, 0, 4, 5, 0, 0, 6, 7})))); 2301} 2302 2303#ifdef SUPPORT_UNPACK_PIXEL 2304/* Due to type conflicts, unpacking of pixel types and boolean shorts 2305 * can not simultaneously be supported. By default, the boolean short is 2306 * supported. 
2307 */ 2308static inline vec_uint4 vec_unpackh(vec_pixel8 a) 2309{ 2310 vec_ushort8 p1, p2; 2311 2312 p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)), 2313 spu_and((vec_ushort8)(a.p), 0x1F), 2314 ((vec_uchar16){ 0, 128, 128, 17, 2, 128, 128, 19, 2315 4, 128, 128, 21, 6, 128, 128, 23})); 2316 p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F), 2317 spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F), 2318 ((vec_uchar16){ 128, 17, 1, 128, 128, 19, 3, 128, 2319 128, 21, 5, 128, 128, 23, 7, 128})); 2320 return ((vec_uint4)(spu_or(p1, p2))); 2321} 2322 2323#else 2324 2325static inline vec_bint4 vec_unpackh(vec_bshort8 a) 2326{ 2327 return ((vec_bint4)(vec_unpackh((vec_short8)(a)))); 2328} 2329#endif 2330 2331 2332 2333 2334 2335/* vec_unpackl (vector unpack low element) 2336 * =========== 2337 */ 2338static inline vec_short8 vec_unpackl(vec_char16 a) 2339{ 2340 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){8, 8, 9, 9, 10, 10, 11, 11, 2341 12, 12, 13, 13, 14, 14, 15, 15})))); 2342} 2343 2344static inline vec_bshort8 vec_unpackl(vec_bchar16 a) 2345{ 2346 return ((vec_bshort8)(vec_unpackl((vec_char16)(a)))); 2347} 2348 2349 2350static inline vec_int4 vec_unpackl(vec_short8 a) 2351{ 2352 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 8, 9, 0, 0, 10, 11, 2353 0, 0,12,13, 0, 0, 14, 15})))); 2354} 2355 2356 2357#ifdef SUPPORT_UNPACK_PIXEL 2358/* Due to type conflicts, unpacking of pixel types and boolean shorts 2359 * can not simultaneously be supported. By default, the boolean short is 2360 * supported. 
2361 */ 2362static inline vec_uint4 vec_unpackl(vec_pixel8 a) 2363{ 2364 vec_ushort8 p1, p2; 2365 2366 p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a), -7)), 2367 spu_and((vec_ushort8)(a), 0x1F), 2368 ((vec_uchar16){ 8, 128, 128, 25, 10, 128, 128, 27, 2369 12, 128, 128, 29, 14, 128, 128, 31})); 2370 p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a), -5), 0x1F), 2371 spu_and(spu_rlmask((vec_ushort8)(a), -10), 0x1F), 2372 ((vec_uchar16){ 128, 25, 9, 128, 128, 27, 11, 128, 2373 128, 29, 13, 128, 128, 31, 15, 128})); 2374 return ((vec_uint4)(spu_or(p1, p2))); 2375} 2376 2377#else 2378 2379static inline vec_bint4 vec_unpackl(vec_bshort8 a) 2380{ 2381 return ((vec_bint4)(vec_unpackl((vec_short8)(a)))); 2382 2383} 2384#endif 2385 2386 2387 2388/* vec_xor (vector logical xor) 2389 * ====== 2390 */ 2391static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b) 2392{ 2393 return (spu_xor(a, b)); 2394} 2395 2396static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b) 2397{ 2398 return (spu_xor(a, b)); 2399} 2400 2401static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b) 2402{ 2403 return (spu_xor((vec_char16)(a), b)); 2404} 2405 2406static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b) 2407{ 2408 return (spu_xor(a, (vec_char16)(b))); 2409} 2410 2411static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b) 2412{ 2413 return (spu_xor(a, b)); 2414} 2415 2416static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b) 2417{ 2418 return (spu_xor(a, b)); 2419} 2420 2421static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b) 2422{ 2423 return (spu_xor((vec_short8)(a), b)); 2424} 2425 2426static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b) 2427{ 2428 return (spu_xor(a, (vec_short8)(b))); 2429} 2430 2431static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b) 2432{ 2433 return (spu_xor(a, b)); 2434} 2435 2436static inline vec_int4 vec_xor(vec_int4 a, vec_int4 b) 2437{ 2438 return (spu_xor(a, b)); 2439} 2440 2441static inline 
vec_int4 vec_xor(vec_bint4 a, vec_int4 b) 2442{ 2443 return (spu_xor((vec_int4)(a), b)); 2444} 2445 2446static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b) 2447{ 2448 return (spu_xor(a, (vec_int4)(b))); 2449} 2450 2451static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b) 2452{ 2453 return (spu_xor(a, b)); 2454} 2455 2456static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b) 2457{ 2458 return (spu_xor((vec_float4)(a),b)); 2459} 2460 2461static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b) 2462{ 2463 return (spu_xor(a, (vec_float4)(b))); 2464} 2465 2466/************************************************************************ 2467 * PREDICATES 2468 ************************************************************************/ 2469 2470/* vec_all_eq (all elements equal) 2471 * ========== 2472 */ 2473static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b) 2474{ 2475 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF)); 2476} 2477 2478static inline int vec_all_eq(vec_char16 a, vec_char16 b) 2479{ 2480 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF)); 2481} 2482 2483static inline int vec_all_eq(vec_bchar16 a, vec_char16 b) 2484{ 2485 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF)); 2486} 2487 2488static inline int vec_all_eq(vec_char16 a, vec_bchar16 b) 2489{ 2490 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF)); 2491} 2492 2493static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b) 2494{ 2495 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF)); 2496} 2497 2498static inline int vec_all_eq(vec_short8 a, vec_short8 b) 2499{ 2500 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF)); 2501} 2502 2503static inline int vec_all_eq(vec_bshort8 a, vec_short8 b) 2504{ 2505 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF)); 2506} 2507 2508static inline int vec_all_eq(vec_short8 a, 
vec_bshort8 b) 2509{ 2510 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF)); 2511} 2512 2513static inline int vec_all_eq(vec_uint4 a, vec_uint4 b) 2514{ 2515 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF)); 2516} 2517 2518static inline int vec_all_eq(vec_int4 a, vec_int4 b) 2519{ 2520 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF)); 2521} 2522 2523static inline int vec_all_eq(vec_bint4 a, vec_int4 b) 2524{ 2525 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF)); 2526} 2527 2528static inline int vec_all_eq(vec_int4 a, vec_bint4 b) 2529{ 2530 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF)); 2531} 2532 2533static inline int vec_all_eq(vec_float4 a, vec_float4 b) 2534{ 2535 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF)); 2536} 2537 2538 2539/* vec_all_ge (all elements greater than or equal) 2540 * ========== 2541 */ 2542static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b) 2543{ 2544 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2545} 2546 2547static inline int vec_all_ge(vec_char16 a, vec_char16 b) 2548{ 2549 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2550} 2551 2552static inline int vec_all_ge(vec_bchar16 a, vec_char16 b) 2553{ 2554 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0)); 2555} 2556 2557static inline int vec_all_ge(vec_char16 a, vec_bchar16 b) 2558{ 2559 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0)); 2560} 2561 2562static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b) 2563{ 2564 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2565} 2566 2567static inline int vec_all_ge(vec_short8 a, vec_short8 b) 2568{ 2569 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2570} 2571 2572static inline int vec_all_ge(vec_bshort8 a, vec_short8 b) 2573{ 2574 return 
((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0)); 2575} 2576 2577static inline int vec_all_ge(vec_short8 a, vec_bshort8 b) 2578{ 2579 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0)); 2580} 2581 2582static inline int vec_all_ge(vec_uint4 a, vec_uint4 b) 2583{ 2584 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2585} 2586 2587static inline int vec_all_ge(vec_int4 a, vec_int4 b) 2588{ 2589 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2590} 2591 2592static inline int vec_all_ge(vec_bint4 a, vec_int4 b) 2593{ 2594 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0)); 2595} 2596 2597static inline int vec_all_ge(vec_int4 a, vec_bint4 b) 2598{ 2599 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0)); 2600} 2601 2602static inline int vec_all_ge(vec_float4 a, vec_float4 b) 2603{ 2604 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2605} 2606 2607 2608/* vec_all_gt (all elements greater than) 2609 * ========== 2610 */ 2611static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b) 2612{ 2613 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF)); 2614} 2615 2616static inline int vec_all_gt(vec_char16 a, vec_char16 b) 2617{ 2618 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF)); 2619} 2620 2621static inline int vec_all_gt(vec_bchar16 a, vec_char16 b) 2622{ 2623 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF)); 2624} 2625 2626static inline int vec_all_gt(vec_char16 a, vec_bchar16 b) 2627{ 2628 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF)); 2629} 2630 2631static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b) 2632{ 2633 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF)); 2634} 2635 2636static inline int vec_all_gt(vec_short8 a, vec_short8 b) 2637{ 2638 return 
((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF)); 2639} 2640 2641static inline int vec_all_gt(vec_bshort8 a, vec_short8 b) 2642{ 2643 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF)); 2644} 2645 2646static inline int vec_all_gt(vec_short8 a, vec_bshort8 b) 2647{ 2648 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF)); 2649} 2650 2651static inline int vec_all_gt(vec_uint4 a, vec_uint4 b) 2652{ 2653 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF)); 2654} 2655 2656static inline int vec_all_gt(vec_int4 a, vec_int4 b) 2657{ 2658 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF)); 2659} 2660 2661static inline int vec_all_gt(vec_bint4 a, vec_int4 b) 2662{ 2663 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF)); 2664} 2665 2666static inline int vec_all_gt(vec_int4 a, vec_bint4 b) 2667{ 2668 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF)); 2669} 2670 2671static inline int vec_all_gt(vec_float4 a, vec_float4 b) 2672{ 2673 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF)); 2674} 2675 2676 2677/* vec_all_in (all elements in bounds) 2678 * ========== 2679 */ 2680static inline int vec_all_in(vec_float4 a, vec_float4 b) 2681{ 2682 return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF); 2683} 2684 2685 2686/* vec_all_le (all elements less than or equal) 2687 * ========== 2688 */ 2689static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b) 2690{ 2691 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2692} 2693 2694static inline int vec_all_le(vec_char16 a, vec_char16 b) 2695{ 2696 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2697} 2698 2699static inline int vec_all_le(vec_bchar16 a, vec_char16 b) 2700{ 2701 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0)); 2702} 2703 
2704static inline int vec_all_le(vec_char16 a, vec_bchar16 b) 2705{ 2706 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0)); 2707} 2708 2709static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b) 2710{ 2711 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2712} 2713 2714static inline int vec_all_le(vec_short8 a, vec_short8 b) 2715{ 2716 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2717} 2718 2719static inline int vec_all_le(vec_bshort8 a, vec_short8 b) 2720{ 2721 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0)); 2722} 2723 2724static inline int vec_all_le(vec_short8 a, vec_bshort8 b) 2725{ 2726 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0)); 2727} 2728 2729static inline int vec_all_le(vec_uint4 a, vec_uint4 b) 2730{ 2731 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2732} 2733 2734static inline int vec_all_le(vec_int4 a, vec_int4 b) 2735{ 2736 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2737} 2738 2739static inline int vec_all_le(vec_bint4 a, vec_int4 b) 2740{ 2741 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0)); 2742} 2743 2744static inline int vec_all_le(vec_int4 a, vec_bint4 b) 2745{ 2746 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0)); 2747} 2748 2749static inline int vec_all_le(vec_float4 a, vec_float4 b) 2750{ 2751 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2752} 2753 2754 2755/* vec_all_lt (all elements less than) 2756 * ========== 2757 */ 2758static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b) 2759{ 2760 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF)); 2761} 2762 2763static inline int vec_all_lt(vec_char16 a, vec_char16 b) 2764{ 2765 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF)); 2766} 2767 2768static inline int vec_all_lt(vec_bchar16 a, vec_char16 b) 
2769{ 2770 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF)); 2771} 2772 2773static inline int vec_all_lt(vec_char16 a, vec_bchar16 b) 2774{ 2775 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF)); 2776} 2777 2778static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b) 2779{ 2780 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF)); 2781} 2782 2783static inline int vec_all_lt(vec_short8 a, vec_short8 b) 2784{ 2785 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF)); 2786} 2787 2788static inline int vec_all_lt(vec_bshort8 a, vec_short8 b) 2789{ 2790 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF)); 2791} 2792 2793static inline int vec_all_lt(vec_short8 a, vec_bshort8 b) 2794{ 2795 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF)); 2796} 2797 2798static inline int vec_all_lt(vec_uint4 a, vec_uint4 b) 2799{ 2800 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF)); 2801} 2802 2803static inline int vec_all_lt(vec_int4 a, vec_int4 b) 2804{ 2805 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF)); 2806} 2807 2808static inline int vec_all_lt(vec_bint4 a, vec_int4 b) 2809{ 2810 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF)); 2811} 2812 2813static inline int vec_all_lt(vec_int4 a, vec_bint4 b) 2814{ 2815 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF)); 2816} 2817 2818static inline int vec_all_lt(vec_float4 a, vec_float4 b) 2819{ 2820 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF)); 2821} 2822 2823 2824/* vec_all_nan (all elements not a number) 2825 * =========== 2826 */ 2827static inline int vec_all_nan(vec_float4 a) 2828{ 2829 vec_uint4 exp, man; 2830 vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000); 2831 2832 exp = spu_and((vec_uint4)(a), exp_mask); 2833 man = spu_and((vec_uint4)(a), 
spu_splats((unsigned int)0x007FFFFF)); 2834 return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask), 2835 spu_cmpeq(man, 0))), 0) == 0xF)); 2836} 2837 2838#define vec_all_nan(_a) (0) 2839 2840 2841/* vec_all_ne (all elements not equal) 2842 * ========== 2843 */ 2844static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b) 2845{ 2846 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2847} 2848 2849static inline int vec_all_ne(vec_char16 a, vec_char16 b) 2850{ 2851 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2852} 2853 2854static inline int vec_all_ne(vec_bchar16 a, vec_char16 b) 2855{ 2856 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0)); 2857} 2858 2859static inline int vec_all_ne(vec_char16 a, vec_bchar16 b) 2860{ 2861 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0)); 2862} 2863 2864static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b) 2865{ 2866 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2867} 2868 2869static inline int vec_all_ne(vec_short8 a, vec_short8 b) 2870{ 2871 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2872} 2873 2874static inline int vec_all_ne(vec_bshort8 a, vec_short8 b) 2875{ 2876 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0)); 2877} 2878 2879static inline int vec_all_ne(vec_short8 a, vec_bshort8 b) 2880{ 2881 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0)); 2882} 2883 2884static inline int vec_all_ne(vec_uint4 a, vec_uint4 b) 2885{ 2886 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2887} 2888 2889static inline int vec_all_ne(vec_int4 a, vec_int4 b) 2890{ 2891 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2892} 2893 2894static inline int vec_all_ne(vec_bint4 a, vec_int4 b) 2895{ 2896 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0)); 2897} 2898 2899static 
inline int vec_all_ne(vec_int4 a, vec_bint4 b) 2900{ 2901 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0)); 2902} 2903 2904static inline int vec_all_ne(vec_float4 a, vec_float4 b) 2905{ 2906 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2907} 2908 2909 2910/* vec_all_nge (all elements not greater than or equal) 2911 * =========== 2912 */ 2913static inline int vec_all_nge(vec_float4 a, vec_float4 b) 2914{ 2915 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF)); 2916} 2917 2918 2919/* vec_all_ngt (all elements not greater than) 2920 * =========== 2921 */ 2922static inline int vec_all_ngt(vec_float4 a, vec_float4 b) 2923{ 2924 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2925} 2926 2927 2928/* vec_all_nle (all elements not less than or equal) 2929 * =========== 2930 */ 2931static inline int vec_all_nle(vec_float4 a, vec_float4 b) 2932{ 2933 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF)); 2934} 2935 2936 2937/* vec_all_nlt (all elements not less than) 2938 * =========== 2939 */ 2940static inline int vec_all_nlt(vec_float4 a, vec_float4 b) 2941{ 2942 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2943} 2944 2945 2946/* vec_all_numeric (all elements numeric) 2947 * =========== 2948 */ 2949static inline int vec_all_numeric(vec_float4 a) 2950{ 2951 vec_uint4 exp; 2952 2953 exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF); 2954 return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0)); 2955} 2956 2957 2958 2959/* vec_any_eq (any elements equal) 2960 * ========== 2961 */ 2962static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b) 2963{ 2964 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0)); 2965} 2966 2967static inline int vec_any_eq(vec_char16 a, vec_char16 b) 2968{ 2969 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0)); 2970} 2971 2972static inline int vec_any_eq(vec_bchar16 a, vec_char16 b) 2973{ 2974 
return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0)); 2975} 2976 2977static inline int vec_any_eq(vec_char16 a, vec_bchar16 b) 2978{ 2979 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0)); 2980} 2981 2982static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b) 2983{ 2984 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0)); 2985} 2986 2987static inline int vec_any_eq(vec_short8 a, vec_short8 b) 2988{ 2989 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0)); 2990} 2991 2992static inline int vec_any_eq(vec_bshort8 a, vec_short8 b) 2993{ 2994 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0)); 2995} 2996 2997static inline int vec_any_eq(vec_short8 a, vec_bshort8 b) 2998{ 2999 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0)); 3000} 3001 3002static inline int vec_any_eq(vec_uint4 a, vec_uint4 b) 3003{ 3004 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0))); 3005} 3006 3007static inline int vec_any_eq(vec_int4 a, vec_int4 b) 3008{ 3009 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0))); 3010} 3011 3012static inline int vec_any_eq(vec_bint4 a, vec_int4 b) 3013{ 3014 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0))); 3015} 3016 3017static inline int vec_any_eq(vec_int4 a, vec_bint4 b) 3018{ 3019 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0))); 3020} 3021 3022static inline int vec_any_eq(vec_float4 a, vec_float4 b) 3023{ 3024 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0))); 3025} 3026 3027/* vec_any_ge (any elements greater than or equal) 3028 * ========== 3029 */ 3030static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b) 3031{ 3032 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF)); 3033} 3034 3035static inline int vec_any_ge(vec_char16 a, vec_char16 b) 3036{ 3037 
return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF)); 3038} 3039 3040static inline int vec_any_ge(vec_bchar16 a, vec_char16 b) 3041{ 3042 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF)); 3043} 3044 3045static inline int vec_any_ge(vec_char16 a, vec_bchar16 b) 3046{ 3047 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF)); 3048} 3049 3050static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b) 3051{ 3052 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF)); 3053} 3054 3055static inline int vec_any_ge(vec_short8 a, vec_short8 b) 3056{ 3057 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF)); 3058} 3059 3060static inline int vec_any_ge(vec_bshort8 a, vec_short8 b) 3061{ 3062 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF)); 3063} 3064 3065static inline int vec_any_ge(vec_short8 a, vec_bshort8 b) 3066{ 3067 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF)); 3068} 3069 3070static inline int vec_any_ge(vec_uint4 a, vec_uint4 b) 3071{ 3072 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF)); 3073} 3074 3075static inline int vec_any_ge(vec_int4 a, vec_int4 b) 3076{ 3077 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF)); 3078} 3079 3080static inline int vec_any_ge(vec_bint4 a, vec_int4 b) 3081{ 3082 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF)); 3083} 3084 3085static inline int vec_any_ge(vec_int4 a, vec_bint4 b) 3086{ 3087 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF)); 3088} 3089 3090static inline int vec_any_ge(vec_float4 a, vec_float4 b) 3091{ 3092 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF)); 3093} 3094 3095 3096/* vec_any_gt (any elements greater than) 3097 * ========== 3098 */ 3099static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b) 3100{ 3101 return 
((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0)); 3102} 3103 3104static inline int vec_any_gt(vec_char16 a, vec_char16 b) 3105{ 3106 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0)); 3107} 3108 3109static inline int vec_any_gt(vec_bchar16 a, vec_char16 b) 3110{ 3111 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0)); 3112} 3113 3114static inline int vec_any_gt(vec_char16 a, vec_bchar16 b) 3115{ 3116 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0)); 3117} 3118 3119static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b) 3120{ 3121 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0)); 3122} 3123 3124static inline int vec_any_gt(vec_short8 a, vec_short8 b) 3125{ 3126 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0)); 3127} 3128 3129static inline int vec_any_gt(vec_bshort8 a, vec_short8 b) 3130{ 3131 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0)); 3132} 3133 3134static inline int vec_any_gt(vec_short8 a, vec_bshort8 b) 3135{ 3136 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0)); 3137} 3138 3139 3140static inline int vec_any_gt(vec_uint4 a, vec_uint4 b) 3141{ 3142 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0))); 3143} 3144 3145static inline int vec_any_gt(vec_int4 a, vec_int4 b) 3146{ 3147 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0))); 3148} 3149 3150static inline int vec_any_gt(vec_bint4 a, vec_int4 b) 3151{ 3152 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0))); 3153} 3154 3155static inline int vec_any_gt(vec_int4 a, vec_bint4 b) 3156{ 3157 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0))); 3158} 3159 3160static inline int vec_any_gt(vec_float4 a, vec_float4 b) 3161{ 3162 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0))); 3163} 3164 3165/* vec_any_le 
(any elements less than or equal) 3166 * ========== 3167 */ 3168static inline int vec_any_le(vec_uchar16 a, vec_uchar16 b) 3169{ 3170 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF)); 3171} 3172 3173static inline int vec_any_le(vec_char16 a, vec_char16 b) 3174{ 3175 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF)); 3176} 3177 3178static inline int vec_any_le(vec_bchar16 a, vec_char16 b) 3179{ 3180 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0xFFFF)); 3181} 3182 3183static inline int vec_any_le(vec_char16 a, vec_bchar16 b) 3184{ 3185 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0xFFFF)); 3186} 3187 3188static inline int vec_any_le(vec_ushort8 a, vec_ushort8 b) 3189{ 3190 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF)); 3191} 3192 3193static inline int vec_any_le(vec_short8 a, vec_short8 b) 3194{ 3195 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF)); 3196} 3197 3198static inline int vec_any_le(vec_bshort8 a, vec_short8 b) 3199{ 3200 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0xFF)); 3201} 3202 3203static inline int vec_any_le(vec_short8 a, vec_bshort8 b) 3204{ 3205 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0xFF)); 3206} 3207 3208static inline int vec_any_le(vec_uint4 a, vec_uint4 b) 3209{ 3210 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF)); 3211} 3212 3213static inline int vec_any_le(vec_int4 a, vec_int4 b) 3214{ 3215 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF)); 3216} 3217 3218static inline int vec_any_le(vec_bint4 a, vec_int4 b) 3219{ 3220 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) != 0xF)); 3221} 3222 3223static inline int vec_any_le(vec_int4 a, vec_bint4 b) 3224{ 3225 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) != 0xF)); 3226} 3227 3228static inline int 
vec_any_le(vec_float4 a, vec_float4 b) 3229{ 3230 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF)); 3231} 3232 3233 3234/* vec_any_lt (any elements less than) 3235 * ========== 3236 */ 3237static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b) 3238{ 3239 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0)); 3240} 3241 3242static inline int vec_any_lt(vec_char16 a, vec_char16 b) 3243{ 3244 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0)); 3245} 3246 3247static inline int vec_any_lt(vec_bchar16 a, vec_char16 b) 3248{ 3249 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0)); 3250} 3251 3252static inline int vec_any_lt(vec_char16 a, vec_bchar16 b) 3253{ 3254 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0)); 3255} 3256 3257static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b) 3258{ 3259 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0)); 3260} 3261 3262static inline int vec_any_lt(vec_short8 a, vec_short8 b) 3263{ 3264 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0)); 3265} 3266 3267static inline int vec_any_lt(vec_bshort8 a, vec_short8 b) 3268{ 3269 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0)); 3270} 3271 3272static inline int vec_any_lt(vec_short8 a, vec_bshort8 b) 3273{ 3274 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0)); 3275} 3276 3277static inline int vec_any_lt(vec_uint4 a, vec_uint4 b) 3278{ 3279 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0))); 3280} 3281 3282static inline int vec_any_lt(vec_int4 a, vec_int4 b) 3283{ 3284 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0))); 3285} 3286 3287static inline int vec_any_lt(vec_bint4 a, vec_int4 b) 3288{ 3289 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0))); 3290} 3291 3292static inline int vec_any_lt(vec_int4 a, vec_bint4 b) 3293{ 
  /* Reduce the per-element "b > a" results with spu_orx; rlmask by -31
     keeps only each result word's sign bit, so the extracted scalar is
     nonzero iff any element satisfied a < b.  */
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0)));
}

static inline int vec_any_lt(vec_float4 a, vec_float4 b)
{
  /* Same orx/rlmask reduction as the integer variants: nonzero iff any
     element of a compares less than the corresponding element of b.  */
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

/* vec_any_nan (any elements not a number)
 * ===========
 *
 * True iff any element has an all-ones exponent field and a nonzero
 * mantissa (the IEEE-754 single-precision NaN bit pattern).
 */
static inline int vec_any_nan(vec_float4 a)
{
  vec_uint4 exp, man;
  vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);

  exp = spu_and((vec_uint4)(a), exp_mask);
  man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
  /* andc: exponent all ones AND mantissa not zero; gather packs one bit
     per word, so a nonzero gather means at least one NaN element.  */
  return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
                                                spu_cmpeq(man, 0))), 0) != 0));
}


/* vec_any_ne (any elements not equal)
 * ==========
 *
 * spu_gather packs one equality bit per element; the result differs
 * from the all-equal mask (0xFFFF for 16 bytes, 0xFF for 8 halfwords,
 * 0xF for 4 words) iff some element pair compared unequal.
 */
static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF));
}

static inline int vec_any_ne(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}

static inline int vec_any_ne(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}

static inline int vec_any_ne(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF));
}

static inline int vec_any_ne(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF));
}

static inline int vec_any_ne(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}


/* vec_any_nge (any elements not greater than or equal)
 * ===========
 *
 * Nonzero iff some element satisfies b > a, i.e. !(a >= b).
 */
static inline int vec_any_nge(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

/* vec_any_ngt (any elements not greater than)
 * ===========
 *
 * Gather != 0xF means not every element satisfied a > b, so at least
 * one element is "not greater than".
 */
static inline int vec_any_ngt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
}


/* vec_any_nle (any elements not less than or equal)
 * ===========
 *
 * Nonzero iff some element satisfies a > b, i.e. !(a <= b).
 */
static inline int vec_any_nle(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}


/* vec_any_nlt (any elements not less than)
 * ===========
 *
 * Gather != 0xF means not every element satisfied a < b.
 */
static inline int vec_any_nlt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
}


/* vec_any_numeric (any elements numeric)
 * ===============
 *
 * An element is non-numeric when its 8-bit exponent field (bits shifted
 * down by 23) is all ones; gather != 0xF means at least one element's
 * exponent is NOT all ones, i.e. at least one element is numeric.
 */
static inline int vec_any_numeric(vec_float4 a)
{
  vec_uint4 exp;

  exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
  return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF));
}


/* vec_any_out (any elements out of bounds)
 * ===========
 *
 * An element is in bounds when |a| <= b and b is non-negative; the nor
 * of "|a| > b" with b's sign mask is true exactly for in-bounds
 * elements, so gather != 0xF means some element is out of bounds.
 */
static inline int vec_any_out(vec_float4 a, vec_float4 b)
{
  return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF);
}


/* CBE Language Extension Intrinsics
 */

/* vec_extract (extract element from vector)
 * ===========
 */
#define vec_extract(_a, _element)	spu_extract(_a, _element)


/* vec_insert (insert scalar into specified vector element)
 * ==========
 */
#define vec_insert(_a, _b, _element)	spu_insert(_a, _b, _element)

/* vec_lvlx (load vector left indexed)
 * ========
 *
 * Load the quadword containing address (b + a) and shift it left by the
 * address's offset within the quadword, so the byte AT the (possibly
 * unaligned) address lands in element 0 and the bytes from the next
 * quadword are left as zero fill.  Casting the pointer to unsigned int
 * is valid here: this header is SPU-only (32-bit local-store pointers).
 */
static inline vec_uchar16 vec_lvlx(int a, unsigned char *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_char16 vec_lvlx(int a, signed char *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_char16 vec_lvlx(int a, vec_char16 *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_ushort8 vec_lvlx(int a, unsigned short *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_short8 vec_lvlx(int a, signed short *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_short8 vec_lvlx(int a, vec_short8 *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uint4 vec_lvlx(int a, unsigned int *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_int4 vec_lvlx(int a, signed int *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_int4 vec_lvlx(int a, vec_int4 *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_float4 vec_lvlx(int a, float *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_float4 vec_lvlx(int a, vec_float4 *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}


/* vec_lvlxl (load vector left indexed last)
 * =========
 *
 * On SPU there is no cache to bypass, so "last" behaves like vec_lvlx.
 */
#define vec_lvlxl(_a, _b)	vec_lvlx(_a, _b)


/* vec_lvrx (load vector right indexed)
 * ========
 *
 * Complement of vec_lvlx: load the quadword containing (b + a) and
 * shift it right so the bytes BEFORE the unaligned address fill the
 * tail of the result, with zero fill in front.  The shift count
 * (offset - 16) is negative, selecting a right shift by (16 - offset)
 * bytes with spu_rlmaskqwbyte.
 */
static inline vec_uchar16 vec_lvrx(int a, unsigned char *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_char16 vec_lvrx(int a, signed char *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_char16 vec_lvrx(int a, vec_char16 *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_ushort8 vec_lvrx(int a, unsigned short *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_short8 vec_lvrx(int a, signed short *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_short8 vec_lvrx(int a, vec_short8 *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uint4 vec_lvrx(int a, unsigned int *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_int4 vec_lvrx(int a, signed int *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_int4 vec_lvrx(int a, vec_int4 *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_float4 vec_lvrx(int a, float *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_float4 vec_lvrx(int a, vec_float4 *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}



/* vec_lvrxl (load vector right indexed last)
 * =========
 *
 * No cache on SPU, so "last" behaves like vec_lvrx.
 */
#define vec_lvrxl(_a, _b)	vec_lvrx(_a, _b)


/* vec_promote (promote scalar to a vector)
 * ===========
 */
#define vec_promote(_a, _element)	spu_promote(_a, _element)


/* vec_splats (splat scalar to a vector)
 * ==========
 */
#define vec_splats(_a)	spu_splats(_a)


/* vec_stvlx (store vector left indexed)
 * =========
 *
 * Store the left portion of vector a to the (possibly unaligned)
 * address (c + b): shift a right by the address's quadword offset,
 * build a byte mask of all-ones shifted the same way, and use spu_sel
 * to merge only the covered bytes into the destination quadword.  This
 * is a read-modify-write of the containing quadword (not atomic).
 */
static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  /* Negative count selects a right shift of (offset) bytes.  */
  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

/* vec_stvlxl (store vector left indexed last)
 * ==========
 *
 * No cache on SPU, so "last" behaves like vec_stvlx.
 */
#define vec_stvlxl(_a, _b, _c)	vec_stvlx(_a, _b, _c)


/* vec_stvrx (store vector right indexed)
 * =========
 *
 * Store the right portion of vector a to the bytes preceding the
 * (possibly unaligned) address (c + b): shift a left by
 * (16 - offset) bytes, build a matching left-shifted all-ones byte
 * mask, and spu_sel-merge only those bytes into the destination
 * quadword.  Read-modify-write of the containing quadword.
 */
static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

/* vec_stvrxl (store vector right indexed last)
 * ==========
 *
 * No cache on SPU, so "last" behaves like vec_stvrx.
 */
#define vec_stvrxl(_a, _b, _c)	vec_stvrx(_a, _b, _c)


#endif /* __SPU__ */
#endif /* __cplusplus */
#endif /* !_VMX2SPU_H_ */