//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

TODO:
* gpr0 allocation
* lmw/stmw pass a la arm load store optimizer for prolog/epilog

===-------------------------------------------------------------------------===

On PPC64, this:

long f2 (long x) { return 0xfffffff000000000UL; }
long f3 (long x) { return 0x1ffffffffUL; }

could compile into:

_f2:
        li r3,-1
        rldicr r3,r3,0,27
        blr
_f3:
        li r3,-1
        rldicl r3,r3,0,31
        blr

we produce:

_f2:
        lis r2, 4095
        ori r2, r2, 65535
        sldi r3, r2, 36
        blr
_f3:
        li r2, 1
        sldi r2, r2, 32
        oris r2, r2, 65535
        ori r3, r2, 65535
        blr

===-------------------------------------------------------------------------===

This code:

unsigned add32carry(unsigned sum, unsigned x) {
  unsigned z = sum + x;
  if (sum + x < x)
    z++;
  return z;
}

Should compile to something like:

        addc r3,r3,r4
        addze r3,r3

instead we get:

        add r3, r4, r3
        cmplw cr7, r3, r4
        mfcr r4                 ; 1
        rlwinm r4, r4, 29, 31, 31
        add r3, r3, r4

Ick.

===-------------------------------------------------------------------------===

Support 'update' load/store instructions.  These are cracked on the G5, but are
still a codesize win.

With preinc enabled, this:

long *%test4(long *%X, long *%dest) {
        %Y = getelementptr long* %X, int 4
        %A = load long* %Y
        store long %A, long* %dest
        ret long* %Y
}

compiles to:

_test4:
        mr r2, r3
        lwzu r5, 32(r2)
        lwz r3, 36(r3)
        stw r5, 0(r4)
        stw r3, 4(r4)
        mr r3, r2
        blr

with -sched=list-burr, I get:

_test4:
        lwz r2, 36(r3)
        lwzu r5, 32(r3)
        stw r2, 4(r4)
        stw r5, 0(r4)
        blr

===-------------------------------------------------------------------------===

We compile the hottest inner loop of viterbi to:

        li r6, 0
        b LBB1_84       ;bb432.i
LBB1_83:        ;bb420.i
        lbzx r8, r5, r7
        addi r6, r7, 1
        stbx r8, r4, r7
LBB1_84:        ;bb432.i
        mr r7, r6
        cmplwi cr0, r7, 143
        bne cr0, LBB1_83        ;bb420.i

The CBE manages to produce:

        li r0, 143
        mtctr r0
loop:
        lbzx r2, r2, r11
        stbx r0, r2, r9
        addi r2, r2, 1
        bdz later
        b loop

This could be much better (bdnz instead of bdz), but it still beats us.  If we
produced this with bdnz, the loop would be a single dispatch group.

===-------------------------------------------------------------------------===

Compile:

void foo(int *P) {
  if (P)  *P = 0;
}

into:

_foo:
        cmpwi cr0,r3,0
        beqlr cr0
        li r0,0
        stw r0,0(r3)
        blr

This is effectively a simple form of predication.

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (obviously
contrived to have lots of constants):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of that register to avoid the lis's.  This is even more important in PIC
mode.

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

Here's another example (the sgn function):
double testf(double a) {
  return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

it produces a BB like this:
LBB1_1: ; cond_true
        lis r2, ha16(LCPI1_0)
        lfs f0, lo16(LCPI1_0)(r2)
        lis r2, ha16(LCPI1_1)
        lis r3, ha16(LCPI1_2)
        lfs f2, lo16(LCPI1_2)(r3)
        lfs f3, lo16(LCPI1_1)(r2)
        fsub f0, f0, f1
        fsel f1, f0, f2, f3
        blr
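
For the first example, materializing the pool base once might look something
like this (a sketch; it assumes the four pool entries are laid out contiguously
so they can be reached at small fixed offsets from .CPI_X_0):

_X:
        lis r2, ha16(.CPI_X_0)
        la r2, lo16(.CPI_X_0)(r2)
        lfd f0, 0(r2)
        lfd f2, 8(r2)
        fmadd f0, f1, f0, f2
        lfd f1, 16(r2)
        lfd f2, 24(r2)
        fmadd f1, f0, f1, f2
        blr

===-------------------------------------------------------------------------===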

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for refining estimate instructions to the
required accuracy, and implement divide as multiply by reciprocal when the
divisor has more than one use.  Itanium would want this too.
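
For reference, one refinement step for a reciprocal estimate (e.g. the result
of fres) roughly doubles the number of correct bits; repeat until the required
accuracy is reached.  A minimal sketch in C:

double refine_recip(double d, double e) {
  /* Newton-Raphson step for f(x) = 1/x - d:  e' = e * (2 - d * e). */
  return e * (2.0 - d * e);
}

Once e is accurate enough, each a/d with the same divisor becomes a * e.

===-------------------------------------------------------------------------===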

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub removal:

We still generate calls to foo$stub, and stubs, on Darwin.  This is not
necessary when building with the Leopard (10.5) or later linker, as stubs are
generated by ld when necessary.  Parameterizing this based on the deployment
target (-mmacosx-version-min) is probably enough.  x86-32 does this right; see
its logic.

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10.  That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64-bit double ate up the
argument bytes for r4 and r5.  The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted with the CC,
including making this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BITCAST to LLVM, then do an i-p transformation that
percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;     // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but reversed) exists.  In this case, it wouldn't save us
anything though, because the compares still wouldn't be shared.
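
For reference, the shared-compare version might look something like this (a
sketch; the second compare is replaced by branching on the inverted condition
of the first):

_foo:
        cmpwi cr0, r3, 8
        mfcr r2
        rlwinm r2, r2, 1, 31, 31        ; extract cr0.lt = (a < 8)
        bge cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

===-------------------------------------------------------------------------===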

We should custom expand setcc instead of pretending that we have it.  That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops.  A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses so that
this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

which currently compiles to:

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

becomes:

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

int f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b)  t = *a;
  if (c)  *a = t;
}

===-------------------------------------------------------------------------===

This:
int test(unsigned *P) { return *P >> 24; }

Should compile to:

_test:
        lbz r3,0(r3)
        blr

not:

_test:
        lwz r2, 0(r3)
        srwi r3, r2, 24
        blr

===-------------------------------------------------------------------------===

On the G5, logical CR operations are more expensive in their three-address
form: ops that read/write the same register are half as expensive as those
that read from two registers that are different from their destination.

We should model this with two separate instructions.  The isel should generate
the "two address" form of the instructions.  When the register allocator
detects that it needs to insert a copy due to the two-addressness of the CR
logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
we can convert to the "three address" instruction, to save code space.

This only matters when we start generating cr logical ops.

===-------------------------------------------------------------------------===

We should compile these two functions to the same thing:

#include <stdlib.h>
void f(int a, int b, int *P) {
  *P = (a-b)>=0?(a-b):(b-a);
}
void g(int a, int b, int *P) {
  *P = abs(a-b);
}

Further, they should compile to something better than:

_g:
        subf r2, r4, r3
        subfic r3, r2, 0
        cmpwi cr0, r2, -1
        bgt cr0, LBB2_2 ; entry
LBB2_1: ; entry
        mr r2, r3
LBB2_2: ; entry
        stw r2, 0(r5)
        blr

GCC produces:

_g:
        subf r4,r4,r3
        srawi r2,r4,31
        xor r0,r2,r4
        subf r0,r2,r0
        stw r0,0(r5)
        blr

... which is much nicer.

This theoretically may help improve twolf slightly (used in dimbox.c:142?).
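
GCC's sequence is the standard branchless abs: smear the sign bit across the
word, then use it to conditionally negate.  In C (a sketch; it relies on >> of
a negative int being an arithmetic shift, which is what srawi provides):

void g_branchless(int a, int b, int *P) {
  int d = a - b;        /* subf                                  */
  int m = d >> 31;      /* srawi: 0 if d >= 0, -1 if d < 0       */
  *P = (d ^ m) - m;     /* xor/subf: d if m == 0, -d if m == -1  */
}

===-------------------------------------------------------------------------===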

PR5945: This:
define i32 @clamp0g(i32 %a) {
entry:
        %cmp = icmp slt i32 %a, 0
        %sel = select i1 %cmp, i32 0, i32 %a
        ret i32 %sel
}

is compiled to this by the PowerPC (32-bit) backend:

_clamp0g:
        cmpwi cr0, r3, 0
        li r2, 0
        blt cr0, LBB1_2
; BB#1:                         ; %entry
        mr r2, r3
LBB1_2:                         ; %entry
        mr r3, r2
        blr

This could be reduced to the much simpler:

_clamp0g:
        srawi r2, r3, 31
        andc r3, r3, r2
        blr

===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
  int t, i;

  for (t = 0; t < N; ++t)
    for (i = 0; i < 4; ++i)
      W[t / X][i][t % X] = TK[i][t];

  return 5;
}

We generate relatively atrocious code for this loop compared to gcc.

We could also strength reduce the rem and the div:
http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
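
Since t advances by 1 each iteration, t / X and t % X can be maintained
incrementally instead of recomputed.  A sketch of the strength-reduced form
(assuming X > 0):

int foo(int N, int ***W, int **TK, int X) {
  int t, i, q = 0, r = 0;       /* invariant: q == t / X, r == t % X */

  for (t = 0; t < N; ++t) {
    for (i = 0; i < 4; ++i)
      W[q][i][r] = TK[i][t];
    if (++r == X) {             /* wrap the remainder, bump the quotient */
      r = 0;
      ++q;
    }
  }
  return 5;
}

===-------------------------------------------------------------------------===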

float foo(float X) { return (int)(X); }

Currently produces:

_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        extsw r2, r2
        std r2, -16(r1)
        lfd f0, -16(r1)
        fcfid f0, f0
        frsp f1, f0
        blr

We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
win only.

===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw)  code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw)  code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw)  code |= 32;
  *ret = code;
}

===-------------------------------------------------------------------------===

Complete the "signed i32 to FP conversion using 64-bit registers"
transformation; it is good for PI.  See PPCISelLowering.cpp, this comment:

  // FIXME: disable this lowered code.  This generates 64-bit register values,
  // and we don't model the fact that the top part is clobbered by calls.  We
  // need to flag these together so that the value isn't live across a call.
  //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);

Also, if the registers are spilled to the stack, we have to ensure that all
64 bits of them are saved/restored; otherwise we will miscompile the code.  It
sounds like we need to get the 64-bit register classes going.

===-------------------------------------------------------------------------===

%struct.B = type { i8, [3 x i8] }

define void @bar(%struct.B* %b) {
entry:
        %tmp = bitcast %struct.B* %b to i32*            ; <uint*> [#uses=1]
        %tmp = load i32* %tmp                           ; <uint> [#uses=1]
        %tmp3 = bitcast %struct.B* %b to i32*           ; <uint*> [#uses=1]
        %tmp4 = load i32* %tmp3                         ; <uint> [#uses=1]
        %tmp8 = bitcast %struct.B* %b to i32*           ; <uint*> [#uses=2]
        %tmp9 = load i32* %tmp8                         ; <uint> [#uses=1]
        %tmp4.mask17 = shl i32 %tmp4, i8 1              ; <uint> [#uses=1]
        %tmp1415 = and i32 %tmp4.mask17, 2147483648     ; <uint> [#uses=1]
        %tmp.masked = and i32 %tmp, 2147483648          ; <uint> [#uses=1]
        %tmp11 = or i32 %tmp1415, %tmp.masked           ; <uint> [#uses=1]
        %tmp12 = and i32 %tmp9, 2147483647              ; <uint> [#uses=1]
        %tmp13 = or i32 %tmp12, %tmp11                  ; <uint> [#uses=1]
        store i32 %tmp13, i32* %tmp8
        ret void
}

We emit:

_foo:
        lwz r2, 0(r3)
        slwi r4, r2, 1
        or r4, r4, r2
        rlwimi r2, r4, 0, 0, 0
        stw r2, 0(r3)
        blr

We could collapse a bunch of those ORs and ANDs and generate the following
equivalent code:

_foo:
        lwz r2, 0(r3)
        rlwinm r4, r2, 1, 0, 0
        or r2, r2, r4
        stw r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We compile:

unsigned test6(unsigned x) {
  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}

into:

_test6:
        lis r2, 255
        rlwinm r3, r3, 16, 0, 31
        ori r2, r2, 255
        and r3, r3, r2
        blr

GCC gets it down to:

_test6:
        rlwinm r0,r3,16,8,15
        rlwinm r3,r3,16,24,31
        or r3,r3,r0
        blr

===-------------------------------------------------------------------------===

Consider a function like this:

float foo(float X) { return X + 1234.4123f; }

The FP constant ends up in the constant pool, so we need to get the LR register.
This ends up producing code like this:

_foo:
.LBB_foo_0:     ; entry
        mflr r11
***     stw r11, 8(r1)
        bl "L00000$pb"
"L00000$pb":
        mflr r2
        addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
        lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
        fadds f1, f1, f0
***     lwz r11, 8(r1)
        mtlr r11
        blr

This is functional, but there is no reason to spill the LR register all the way
to the stack (the two marked instrs): spilling it to a GPR is quite enough.

Implementing this will require some codegen improvements.  Nate writes:

"So basically what we need to support the 'no stack frame save and restore' is
a generalization of the LR optimization to callee-save regs.

Currently, we have LR marked as a callee-save reg.  The register allocator sees
that it's callee save, and spills it directly to the stack.

Ideally, something like this would happen:

LR would be in a separate register class from the GPRs.  The class of LR would
be marked "unspillable".  When the register allocator came across an
unspillable reg, it would ask "what is the best class to copy this into that I
*can* spill?"  If it gets a class back, which it will in this case (the gprs),
it grabs a free register of that class.  If it is then later necessary to spill
that reg, so be it."

===-------------------------------------------------------------------------===

We compile this:
int test(_Bool X) {
  return X ? 524288 : 0;
}

to:
_test:
        cmplwi cr0, r3, 0
        lis r2, 8
        li r3, 0
        beq cr0, LBB1_2 ;entry
LBB1_1: ;entry
        mr r3, r2
LBB1_2: ;entry
        blr

instead of:
_test:
        addic r2,r3,-1
        subfe r0,r2,r3
        slwi r3,r0,19
        blr

This sort of thing occurs a lot due to globalopt.
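
For reference, the addic/subfe pair materializes (X != 0) branchlessly through
the carry bit.  A C model of the arithmetic (a sketch; CA is written out
explicitly, and unsigned wraparound is assumed):

unsigned test_model(unsigned x) {
  unsigned t  = x - 1;           /* addic r2,r3,-1 : carry out CA = (x != 0) */
  unsigned ca = (x != 0);
  unsigned b  = x - t - 1 + ca;  /* subfe r0,r2,r3 : always equals CA        */
  return b << 19;                /* slwi r3,r0,19  : 524288 or 0             */
}

===-------------------------------------------------------------------------===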

We compile:

define i32 @bar(i32 %x) nounwind readnone ssp {
entry:
  %0 = icmp eq i32 %x, 0                          ; <i1> [#uses=1]
  %neg = sext i1 %0 to i32                        ; <i32> [#uses=1]
  ret i32 %neg
}

to:

_bar:
        cntlzw r2, r3
        slwi r2, r2, 26
        srawi r3, r2, 31
        blr

it would be better to produce:

_bar:
        addic r3,r3,-1
        subfe r3,r3,r3
        blr

===-------------------------------------------------------------------------===

We currently compile 32-bit bswap:

declare i32 @llvm.bswap.i32(i32 %A)
define i32 @test(i32 %A) {
        %B = call i32 @llvm.bswap.i32(i32 %A)
        ret i32 %B
}

to:

_test:
        rlwinm r2, r3, 24, 16, 23
        slwi r4, r3, 24
        rlwimi r2, r3, 8, 24, 31
        rlwimi r4, r3, 8, 8, 15
        rlwimi r4, r2, 0, 16, 31
        mr r3, r4
        blr

it would be more efficient to produce:

_foo:   mr r0,r3
        rlwinm r3,r3,8,0xffffffff
        rlwimi r3,r0,24,0,7
        rlwimi r3,r0,24,16,23
        blr
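
For reference, a C model of what the shorter rotate-and-insert sequence
computes (a sketch; rlwimi inserts the rotated source under the given mask):

static unsigned rotl32(unsigned x, int n) {     /* valid for 0 < n < 32 */
  return (x << n) | (x >> (32 - n));
}

unsigned bswap_model(unsigned x) {
  unsigned r = rotl32(x, 8);                               /* rlwinm ...,8,0xffffffff */
  r = (r & ~0xFF000000u) | (rotl32(x, 24) & 0xFF000000u);  /* rlwimi ...,24,0,7       */
  r = (r & ~0x0000FF00u) | (rotl32(x, 24) & 0x0000FF00u);  /* rlwimi ...,24,16,23     */
  return r;
}

===-------------------------------------------------------------------------===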

test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:

__ZNK4llvm5APInt17countLeadingZerosEv:
        ld r2, 0(r3)
        cntlzd r2, r2
        or r2, r2, r2     <<-- silly.
        addi r3, r2, -64
        blr

The dead or is a 'truncate' from 64 to 32 bits.

===-------------------------------------------------------------------------===

We generate horrible ppc code for this:

#define N  2000000
double a[N], c[N];
void simpleloop() {
  int j;
  for (j=0; j<N; j++)
    c[j] = a[j];
}

LBB1_1: ;bb
        lfdx f0, r3, r4
        addi r5, r5, 1          ;; Extra IV for the exit value compare.
        stfdx f0, r2, r4
        addi r4, r4, 8

        xoris r6, r5, 30        ;; This is due to a large immediate.
        cmplwi cr0, r6, 33920
        bne cr0, LBB1_1

//===---------------------------------------------------------------------===//

This:
        #include <algorithm>
        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
        { return std::make_pair(a + b, a + b < a); }
        bool no_overflow(unsigned a, unsigned b)
        { return !full_add(a, b).second; }

Should compile to:

__Z11no_overflowjj:
        add r4,r3,r4
        subfc r3,r3,r4
        li r3,0
        adde r3,r3,r3
        blr

(or better) not:

__Z11no_overflowjj:
        add r2, r4, r3
        cmplw cr7, r2, r3
        mfcr r2
        rlwinm r2, r2, 29, 31, 31
        xori r3, r2, 1
        blr

//===---------------------------------------------------------------------===//

We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
example:
#include <math.h>
int test(double x, double y) { return islessequal(x, y); }
int test2(double x, double y) { return islessgreater(x, y); }
int test3(double x, double y) { return !islessequal(x, y); }

Compiles into (all three are similar, but the bits differ):

_test:
        fcmpu cr7, f1, f2
        mfcr r2
        rlwinm r3, r2, 29, 31, 31
        rlwinm r2, r2, 31, 31, 31
        or r3, r2, r3
        blr

GCC compiles this into:

_test:
        fcmpu cr7,f1,f2
        cror 30,28,30
        mfcr r3
        rlwinm r3,r3,31,1
        blr

which is more efficient and can use mfocrf.  See PR642 for some more context.

//===---------------------------------------------------------------------===//

void foo(float *data, float d) {
  long i;
  for (i = 0; i < 8000; i++)
    data[i] = d;
}
void foo2(float *data, float d) {
  long i;
  data--;
  for (i = 0; i < 8000; i++) {
    data[1] = d;
    data++;
  }
}

These compile to:

_foo:
        li r2, 0
LBB1_1: ; bb
        addi r4, r2, 4
        stfsx f1, r3, r2
        cmplwi cr0, r4, 32000
        mr r2, r4
        bne cr0, LBB1_1 ; bb
        blr
_foo2:
        li r2, 0
LBB2_1: ; bb
        addi r4, r2, 4
        stfsx f1, r3, r2
        cmplwi cr0, r4, 32000
        mr r2, r4
        bne cr0, LBB2_1 ; bb
        blr

The 'mr' could be eliminated by folding the add into the cmp better.

//===---------------------------------------------------------------------===//

Codegen for the following (low-probability) case deteriorated considerably
when the correctness fixes for unordered comparisons went in (PR 642, 58871).
It should be possible to recover the code quality described in the comments.

; RUN: llvm-as < %s | llc -march=ppc32 | grep or | count 3
; This should produce one 'or' or 'cror' instruction per function.

; RUN: llvm-as < %s | llc -march=ppc32 | grep mfcr | count 3
; PR2964

define i32 @test(double %x, double %y) nounwind {
entry:
        %tmp3 = fcmp ole double %x, %y          ; <i1> [#uses=1]
        %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
        ret i32 %tmp345
}

define i32 @test2(double %x, double %y) nounwind {
entry:
        %tmp3 = fcmp one double %x, %y          ; <i1> [#uses=1]
        %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
        ret i32 %tmp345
}

define i32 @test3(double %x, double %y) nounwind {
entry:
        %tmp3 = fcmp ugt double %x, %y          ; <i1> [#uses=1]
        %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
        ret i32 %tmp34
}

//===---------------------------------------------------------------------===//

; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg

; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
; should not be generated except with -enable-finite-only-fp-math or the like).
; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
; recognize a more elaborate tree than a simple SETxx.

define double @test_FNEG_sel(double %A, double %B, double %C) {
        %D = fsub double -0.000000e+00, %A              ; <double> [#uses=1]
        %Cond = fcmp ugt double %D, -0.000000e+00       ; <i1> [#uses=1]
        %E = select i1 %Cond, double %B, double %C      ; <double> [#uses=1]
        ret double %E
}
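
For reference, fsel FRT,FRA,FRC,FRB computes FRT = (FRA >= 0.0) ? FRC : FRB.
A C model of the semantics, and of why it is not IEEE-safe (a sketch):

double fsel_model(double a, double c, double b) {
  /* -0.0 counts as >= 0.0, and a NaN in a always selects b, so this does
     not match an ordinary fcmp+select on all inputs. */
  return a >= 0.0 ? c : b;
}

//===----------------------------------------------------------------------===//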

The save/restore sequence for CR in prolog/epilog is terrible:
- Each CR subreg is saved individually, rather than doing one save as a unit.
- On Darwin, the save is done after the decrement of SP, which means the offset
  from SP of the save slot can be too big for a store instruction, which means
  we need an additional register (currently hacked in 96015+96020; the solution
  there is correct, but poor).
- On SVR4 the same thing can happen, and I don't think saving before the SP
  decrement is safe on that target, as there is no red zone.  This is currently
  broken AFAIK, although it's not a target I can exercise.
The following demonstrates the problem:
extern void bar(char *p);
void foo() {
  char x[100000];
  bar(x);
  __asm__("" ::: "cr2");
}