InstCombineCalls.cpp revision 360784
//===- InstCombineCalls.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the visitCall, visitInvoke, and visitCallBr functions.
//
//===----------------------------------------------------------------------===//

#include "InstCombineInternal.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "instcombine"

STATISTIC(NumSimplified, "Number of library calls simplified");

static cl::opt<unsigned> GuardWideningWindow(
    "instcombine-guard-widening-window",
    cl::init(3),
    cl::desc("How wide an instruction window to bypass looking for "
             "another guard"));

/// Return the specified type promoted as it would be to pass through a va_arg
/// area.
static Type *getPromotedType(Type *Ty) {
  if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
    if (ITy->getBitWidth() < 32)
      return Type::getInt32Ty(Ty->getContext());
  }
  return Ty;
}

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
  SmallVector<Constant *, 32> BoolVec;
  IntegerType *BoolTy = Type::getInt1Ty(V->getContext());
  for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
    Constant *Elt = V->getElementAsConstant(I);
    assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) &&
           "Unexpected constant data vector element type");
    bool Sign = V->getElementType()->isIntegerTy()
                    ? cast<ConstantInt>(Elt)->isNegative()
                    : cast<ConstantFP>(Elt)->isNegative();
    BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
  }
  return ConstantVector::get(BoolVec);
}

Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
  unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
  unsigned CopyDstAlign = MI->getDestAlignment();
  if (CopyDstAlign < DstAlign){
    MI->setDestAlignment(DstAlign);
    return MI;
  }

  unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
  unsigned CopySrcAlign = MI->getSourceAlignment();
  if (CopySrcAlign < SrcAlign) {
    MI->setSourceAlignment(SrcAlign);
    return MI;
  }

  // If we have a store to a location which is known constant, we can conclude
  // that the store must be storing the constant value (else the memory
  // wouldn't be constant), and this must be a noop.
  if (AA->pointsToConstantMemory(MI->getDest())) {
    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
    return MI;
  }

  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
  // load/store.
  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
  if (!MemOpLength) return nullptr;

  // Source and destination pointer types are always "i8*" for intrinsic. See
  // if the size is something we can handle with a single primitive load/store.
  // A single load+store correctly handles overlapping memory in the memmove
  // case.
  uint64_t Size = MemOpLength->getLimitedValue();
  assert(Size && "0-sized memory transferring should be removed already.");

  if (Size > 8 || (Size&(Size-1)))
    return nullptr;  // If not 1/2/4/8 bytes, exit.

  // If it is an atomic and alignment is less than the size then we will
  // introduce the unaligned memory access which will be later transformed
  // into libcall in CodeGen. This is not an evident performance gain, so
  // disable it for now.
  if (isa<AtomicMemTransferInst>(MI))
    if (CopyDstAlign < Size || CopySrcAlign < Size)
      return nullptr;

  // Use an integer load+store unless we can find something better.
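  // For example, with a constant length of 4 this rewrites
  //   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 4, i1 false)
  // as an i32 load from %s and an i32 store to %d through bitcast pointers
  // (illustrative value names), carrying over the intrinsic's alignment and
  // TBAA metadata.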
  unsigned SrcAddrSp =
      cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
  unsigned DstAddrSp =
      cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();

  IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
  Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
  Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);

  // If the memcpy has metadata describing the members, see if we can get the
  // TBAA tag describing our copy.
  MDNode *CopyMD = nullptr;
  if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
    CopyMD = M;
  } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
    if (M->getNumOperands() == 3 && M->getOperand(0) &&
        mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
        mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
        M->getOperand(1) &&
        mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
        mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
            Size &&
        M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
      CopyMD = cast<MDNode>(M->getOperand(2));
  }

  Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
  Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
  LoadInst *L = Builder.CreateLoad(IntType, Src);
  // Alignment from the mem intrinsic will be better, so use it.
  L->setAlignment(
      MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead.
  if (CopyMD)
    L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  MDNode *LoopMemParallelMD =
      MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
  if (LoopMemParallelMD)
    L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
  MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group);
  if (AccessGroupMD)
    L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);

  StoreInst *S = Builder.CreateStore(L, Dest);
  // Alignment from the mem intrinsic will be better, so use it.
  S->setAlignment(
      MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead.
  if (CopyMD)
    S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  if (LoopMemParallelMD)
    S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
  if (AccessGroupMD)
    S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);

  if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
    // non-atomics can be volatile
    L->setVolatile(MT->isVolatile());
    S->setVolatile(MT->isVolatile());
  }
  if (isa<AtomicMemTransferInst>(MI)) {
    // atomics have to be unordered
    L->setOrdering(AtomicOrdering::Unordered);
    S->setOrdering(AtomicOrdering::Unordered);
  }

  // Set the size of the copy to 0, it will be deleted on the next iteration.
  MI->setLength(Constant::getNullValue(MemOpLength->getType()));
  return MI;
}

Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) {
  const unsigned KnownAlignment =
      getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
  if (MI->getDestAlignment() < KnownAlignment) {
    MI->setDestAlignment(KnownAlignment);
    return MI;
  }

  // If we have a store to a location which is known constant, we can conclude
  // that the store must be storing the constant value (else the memory
  // wouldn't be constant), and this must be a noop.
  if (AA->pointsToConstantMemory(MI->getDest())) {
    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
    return MI;
  }

  // Extract the length and alignment and fill if they are constant.
  ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
  ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
  if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
    return nullptr;
  const uint64_t Len = LenC->getLimitedValue();
  assert(Len && "0-sized memory setting should be removed already.");
  const Align Alignment = assumeAligned(MI->getDestAlignment());

  // If it is an atomic and alignment is less than the size then we will
  // introduce the unaligned memory access which will be later transformed
  // into libcall in CodeGen. This is not an evident performance gain, so
  // disable it for now.
  if (isa<AtomicMemSetInst>(MI))
    if (Alignment < Len)
      return nullptr;

  // memset(s,c,n) -> store s, c (for n=1,2,4,8)
  if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
    Type *ITy = IntegerType::get(MI->getContext(), Len*8);  // n=1 -> i8.

    Value *Dest = MI->getDest();
    unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
    Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
    Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);

    // Extract the fill value and store.
    uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
    StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
                                       MI->isVolatile());
    S->setAlignment(Alignment);
    if (isa<AtomicMemSetInst>(MI))
      S->setOrdering(AtomicOrdering::Unordered);

    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(LenC->getType()));
    return MI;
  }

  return nullptr;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default: llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    LogicalShift = false; ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    LogicalShift = true; ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    LogicalShift = true; ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  // Simplify if count is constant.
  auto Arg1 = II.getArgOperand(1);
  auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
  auto CDV = dyn_cast<ConstantDataVector>(Arg1);
  auto CInt = dyn_cast<ConstantInt>(Arg1);
  if (!CAZ && !CDV && !CInt)
    return nullptr;

  APInt Count(64, 0);
  if (CDV) {
    // SSE2/AVX2 use the first 64 bits of the 128-bit vector
    // operand to compute the shift amount.
    auto VT = cast<VectorType>(CDV->getType());
    unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits();
    assert((64 % BitWidth) == 0 && "Unexpected packed shift size");
    unsigned NumSubElts = 64 / BitWidth;

    // Concatenate the sub-elements to create the 64-bit value.
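    // For example, a <2 x i32> shift-amount constant <i32 5, i32 0> yields
    // Count == 5: sub-element 0 supplies the low 32 bits and sub-element 1
    // the high 32 bits of the 64-bit count.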
    for (unsigned i = 0; i != NumSubElts; ++i) {
      unsigned SubEltIdx = (NumSubElts - 1) - i;
      auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
      Count <<= BitWidth;
      Count |= SubElt->getValue().zextOrTrunc(64);
    }
  }
  else if (CInt)
    Count = CInt->getValue();

  auto Vec = II.getArgOperand(0);
  auto VT = cast<VectorType>(Vec->getType());
  auto SVT = VT->getElementType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If shift-by-zero then just return the original value.
  if (Count.isNullValue())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default: llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  // Simplify if all shift amounts are constant/undef.
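  // For example, a v4i32 psrlv by <i32 1, i32 2, i32 3, i32 4> can become a
  // plain lshr by that constant vector, whereas an amount of 32 or more must
  // produce a zero lane, which a generic lshr cannot express.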
  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
  if (!CShift)
    return nullptr;

  auto Vec = II.getArgOperand(0);
  auto VT = cast<VectorType>(II.getType());
  auto SVT = VT->getVectorElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (CElt && isa<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  Type *ArgTy = Arg0->getType();
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getVectorNumElements();
  assert(ResTy->getVectorNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
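  // For example, PACKSSWB clamps each i16 source element to [-128, 127],
  // while PACKUSWB clamps it to [0, 255]; the clamped value is then truncated
  // to i8 by the trunc created below.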
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getNullValue(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<unsigned, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();
  Type *ArgTy = Arg->getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  // We can't easily peek through x86_mmx types.
  if (!ArgTy->isVectorTy())
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  //   %cmp = icmp slt <16 x i8> %x, zeroinitializer
  //   %int = bitcast <16 x i1> %cmp to i16
  //   %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getVectorNumElements();
  Type *IntegerVecTy = VectorType::getInteger(cast<VectorType>(ArgTy));
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
  Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
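  // For example, the 64-bit variant with a zero carry-in can be rewritten in
  // terms of llvm.uadd.with.overflow.i64, zero-extending its i1 overflow bit
  // to the i8 carry-out that the x86 intrinsic returns.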
  if (match(CarryIn, m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          { Op1, Op2 });
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  VectorType *VecTy = cast<VectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //  [3:0] - zero mask for each 32-bit lane
  //  [5:4] - select one 32-bit destination lane
  //  [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  uint32_t ShuffleMask[4] = { 0, 1, 2, 3 };

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  ConstantInt *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      VectorType *ShufTy = VectorType::get(IntTy8, 16);

      SmallVector<Constant *, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(UndefValue::get(IntTy32));

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask));
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Type *IntTy32 = Type::getInt32Ty(II.getContext());
    VectorType *ShufTy = VectorType::get(IntTy8, 16);

    SmallVector<Constant *, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(
          Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(UndefValue::get(IntTy32));

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ConstantVector::get(ShuffleMask));
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  Constant *C1 = dyn_cast<Constant>(Op1);
  ConstantInt *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  ConstantInt *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
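/// For example, a v16i8 pshufb whose control bytes all have bit 7 set folds to
/// an all-zeros vector, while a constant control of <0, 1, ..., 15> selects
/// each source byte in place (an identity shuffle).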
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[64] = {nullptr};

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned NumElts = VecTy->getVectorNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[16] = {nullptr};

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
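    // For example, for an element in the second 128-bit lane of a v8f32
    // vpermilvar, mask values 0..3 become shuffle indices 4..7 once the lane
    // base offset is added below.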
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(V1->getType());
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[64] = {nullptr};

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size));
  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

// TODO, Obvious Missing Transforms:
// * Narrow width by halves excluding zero/undef lanes
Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) {
  Value *LoadPtr = II.getArgOperand(0);
  unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue();

  // If the mask is all ones or undefs, this is a plain vector load of the 1st
  // argument.
  if (maskIsAllOneOrUndef(II.getArgOperand(2)))
    return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
                                     "unmaskedload");

  // If we can unconditionally load from this address, replace with a
  // load/select idiom. TODO: use DT for context sensitive query
  if (isDereferenceableAndAlignedPointer(
          LoadPtr, II.getType(), MaybeAlign(Alignment),
          II.getModule()->getDataLayout(), &II, nullptr)) {
    Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
                                          "unmaskedload");
    return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
  }

  return nullptr;
}

// TODO, Obvious Missing Transforms:
// * Single constant active lane -> store
// * Narrow width by halves excluding zero/undef lanes
Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) {
  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
  if (!ConstMask)
    return nullptr;

  // If the mask is all zeros, this instruction does nothing.
  if (ConstMask->isNullValue())
    return eraseInstFromFunction(II);

  // If the mask is all ones, this is a plain vector store of the 1st argument.
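  // For example, a masked store whose mask is a constant splat of i1 true is
  // rewritten as an ordinary store that keeps the intrinsic's alignment.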
  if (ConstMask->isAllOnesValue()) {
    Value *StorePtr = II.getArgOperand(1);
    MaybeAlign Alignment(
        cast<ConstantInt>(II.getArgOperand(2))->getZExtValue());
    return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
  }

  // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
  APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
  APInt UndefElts(DemandedElts.getBitWidth(), 0);
  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
                                            DemandedElts, UndefElts)) {
    II.setOperand(0, V);
    return &II;
  }

  return nullptr;
}

// TODO, Obvious Missing Transforms:
// * Single constant active lane load -> load
// * Dereferenceable address & few lanes -> scalarize speculative load/selects
// * Adjacent vector addresses -> masked.load
// * Narrow width by halves excluding zero/undef lanes
// * Vector splat address w/known mask -> scalar load
// * Vector incrementing address -> vector masked load
Instruction *InstCombiner::simplifyMaskedGather(IntrinsicInst &II) {
  return nullptr;
}

// TODO, Obvious Missing Transforms:
// * Single constant active lane -> store
// * Adjacent vector addresses -> masked.store
// * Narrow store width by halves excluding zero/undef lanes
// * Vector splat address w/known mask -> scalar store
// * Vector incrementing address -> vector masked store
Instruction *InstCombiner::simplifyMaskedScatter(IntrinsicInst &II) {
  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
  if (!ConstMask)
    return nullptr;

  // If the mask is all zeros, a scatter does nothing.
  if (ConstMask->isNullValue())
    return eraseInstFromFunction(II);

  // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
  APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
  APInt UndefElts(DemandedElts.getBitWidth(), 0);
  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
                                            DemandedElts, UndefElts)) {
    II.setOperand(0, V);
    return &II;
  }
  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(1),
                                            DemandedElts, UndefElts)) {
    II.setOperand(1, V);
    return &II;
  }

  return nullptr;
}

/// This function transforms launder.invariant.group and strip.invariant.group
/// like:
///   launder(launder(%x)) -> launder(%x) (the result is not the argument)
///   launder(strip(%x)) -> launder(%x)
///   strip(strip(%x)) -> strip(%x) (the result is not the argument)
///   strip(launder(%x)) -> strip(%x)
/// This is legal because it preserves the most recent information about
/// the presence or absence of invariant.group.
static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
                                                    InstCombiner &IC) {
  auto *Arg = II.getArgOperand(0);
  auto *StrippedArg = Arg->stripPointerCasts();
  auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
  if (StrippedArg == StrippedInvariantGroupsArg)
    return nullptr; // No launders/strips to remove.

  Value *Result = nullptr;

  if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
    Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
  else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
    Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
  else
    llvm_unreachable(
        "simplifyInvariantGroupIntrinsic only handles launder and strip");
  if (Result->getType()->getPointerAddressSpace() !=
      II.getType()->getPointerAddressSpace())
    Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
  if (Result->getType() != II.getType())
    Result = IC.Builder.CreateBitCast(Result, II.getType());

  return cast<Instruction>(Result);
}

static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
  assert((II.getIntrinsicID() == Intrinsic::cttz ||
          II.getIntrinsicID() == Intrinsic::ctlz) &&
         "Expected cttz or ctlz intrinsic");
  bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
  Value *Op0 = II.getArgOperand(0);
  Value *X;
  // ctlz(bitreverse(x)) -> cttz(x)
  // cttz(bitreverse(x)) -> ctlz(x)
  if (match(Op0, m_BitReverse(m_Value(X)))) {
    Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz;
    Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType());
    return CallInst::Create(F, {X, II.getArgOperand(1)});
  }

  if (IsTZ) {
    // cttz(-x) -> cttz(x)
    if (match(Op0, m_Neg(m_Value(X)))) {
      II.setOperand(0, X);
      return &II;
    }

    // cttz(abs(x)) -> cttz(x)
    // cttz(nabs(x)) -> cttz(x)
    Value *Y;
    SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
    if (SPF == SPF_ABS || SPF == SPF_NABS) {
      II.setOperand(0, X);
      return &II;
    }
  }

  KnownBits Known = IC.computeKnownBits(Op0, 0, &II);

  // Create a mask for bits above (ctlz) or below (cttz) the first known one.
  unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
                                : Known.countMaxLeadingZeros();
  unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
                                : Known.countMinLeadingZeros();

  // If all bits above (ctlz) or below (cttz) the first known one are known
  // zero, this value is constant.
  // FIXME: This should be in InstSimplify because we're replacing an
  // instruction with a constant.
  if (PossibleZeros == DefiniteZeros) {
    auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
    return IC.replaceInstUsesWith(II, C);
  }

  // If the input to cttz/ctlz is known to be non-zero,
  // then change the 'ZeroIsUndef' parameter to 'true'
  // because we know the zero behavior can't affect the result.
  if (!Known.One.isNullValue() ||
      isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
                     &IC.getDominatorTree())) {
    if (!match(II.getArgOperand(1), m_One())) {
      II.setOperand(1, IC.Builder.getTrue());
      return &II;
    }
  }

  // Add range metadata since known bits can't completely reflect what we know.
  // TODO: Handle splat vectors.
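  // For example, for an i32 cttz whose operand has bit 3 known one and the
  // lower three bits unknown, DefiniteZeros is 0 and PossibleZeros is 3, so
  // the attached !range is [0, 4).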
  auto *IT = dyn_cast<IntegerType>(Op0->getType());
  if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
    Metadata *LowAndHigh[] = {
        ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
        ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
    II.setMetadata(LLVMContext::MD_range,
                   MDNode::get(II.getContext(), LowAndHigh));
    return &II;
  }

  return nullptr;
}

static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
  assert(II.getIntrinsicID() == Intrinsic::ctpop &&
         "Expected ctpop intrinsic");
  Value *Op0 = II.getArgOperand(0);
  Value *X;
  // ctpop(bitreverse(x)) -> ctpop(x)
  // ctpop(bswap(x)) -> ctpop(x)
  if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X)))) {
    II.setOperand(0, X);
    return &II;
  }

  // FIXME: Try to simplify vectors of integers.
  auto *IT = dyn_cast<IntegerType>(Op0->getType());
  if (!IT)
    return nullptr;

  unsigned BitWidth = IT->getBitWidth();
  KnownBits Known(BitWidth);
  IC.computeKnownBits(Op0, Known, 0, &II);

  unsigned MinCount = Known.countMinPopulation();
  unsigned MaxCount = Known.countMaxPopulation();

  // Add range metadata since known bits can't completely reflect what we know.
  if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
    Metadata *LowAndHigh[] = {
        ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
        ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
    II.setMetadata(LLVMContext::MD_range,
                   MDNode::get(II.getContext(), LowAndHigh));
    return &II;
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Special case a zero mask since that's not a ConstantDataVector.
  // This masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
  if (!ConstMask)
    return nullptr;

  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
  // to allow target-independent optimizations.

  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  // the LLVM intrinsic definition for the pointer argument.
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

  // Second, convert the x86 XMM integer vector mask to a vector of bools based
  // on each element's most significant bit (the sign bit).
  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);

  // The pass-through vector for an x86 masked load is a zero vector.
  CallInst *NewMaskedLoad =
      IC.Builder.CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec);
  return IC.replaceInstUsesWith(II, NewMaskedLoad);
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Special case a zero mask since that's not a ConstantDataVector:
  // this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
  if (!ConstMask)
    return false;

  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
  // to allow target-independent optimizations.

  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  // the LLVM intrinsic definition for the pointer argument.
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

  // Second, convert the x86 XMM integer vector mask to a vector of bools based
  // on each element's most significant bit (the sign bit).
  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);

  IC.Builder.CreateMaskedStore(Vec, PtrCast, 1, BoolMask);

  // 'Replace uses' doesn't work for stores. Erase the original masked store.
  IC.eraseInstFromFunction(II);
  return true;
}

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

/// Convert a table lookup to shufflevector if the mask is constant.
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
/// which case we could lower the shufflevector with rev64 instructions
/// as it's actually a byte reverse.
static Value *simplifyNeonTbl1(const IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder) {
  // Bail out if the mask is not a constant.
  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
  if (!C)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();

  // Only perform this transformation for <8 x i8> vector types.
  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
    return nullptr;

  uint32_t Indexes[8];

  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = C->getAggregateElement(I);

    if (!COp || !isa<ConstantInt>(COp))
      return nullptr;

    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();

    // Make sure the mask indices are in range.
    if (Indexes[I] >= NumElts)
      return nullptr;
  }

  auto *ShuffleMask = ConstantDataVector::get(II.getContext(),
                                              makeArrayRef(Indexes));
  auto *V1 = II.getArgOperand(0);
  auto *V2 = Constant::getNullValue(V1->getType());
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
static Value *simplifyNeonVld1(const IntrinsicInst &II,
                               unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ?
                       MemAlign : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Alignment);
}

// Returns true iff the 2 intrinsics have the same operands, limiting the
// comparison to the first NumOperands.
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
                             unsigned NumOperands) {
  assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
  assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
  for (unsigned i = 0; i < NumOperands; i++)
    if (I.getArgOperand(i) != E.getArgOperand(i))
      return false;
  return true;
}

// Remove trivially empty start/end intrinsic ranges, i.e. a start
// immediately followed by an end (ignoring debuginfo or other
// start/end intrinsics in between). As this handles only the most trivial
// cases, tracking the nesting level is not needed:
//
//   call @llvm.foo.start(i1 0) ; &I
//   call @llvm.foo.start(i1 0)
//   call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed
//   call @llvm.foo.end(i1 0)
static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID,
                                      unsigned EndID, InstCombiner &IC) {
  assert(I.getIntrinsicID() == StartID &&
         "Start intrinsic does not have expected ID");
  BasicBlock::iterator BI(I), BE(I.getParent()->end());
  for (++BI; BI != BE; ++BI) {
    if (auto *E = dyn_cast<IntrinsicInst>(BI)) {
      if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID)
        continue;
      if (E->getIntrinsicID() == EndID &&
          haveSameOperands(I, *E, E->getNumArgOperands())) {
        IC.eraseInstFromFunction(*E);
        IC.eraseInstFromFunction(I);
        return true;
      }
    }
    break;
  }

  return false;
}

// Convert NVVM intrinsics to target-generic LLVM code where possible.
1508static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) { 1509 // Each NVVM intrinsic we can simplify can be replaced with one of: 1510 // 1511 // * an LLVM intrinsic, 1512 // * an LLVM cast operation, 1513 // * an LLVM binary operation, or 1514 // * ad-hoc LLVM IR for the particular operation. 1515 1516 // Some transformations are only valid when the module's 1517 // flush-denormals-to-zero (ftz) setting is true/false, whereas other 1518 // transformations are valid regardless of the module's ftz setting. 1519 enum FtzRequirementTy { 1520 FTZ_Any, // Any ftz setting is ok. 1521 FTZ_MustBeOn, // Transformation is valid only if ftz is on. 1522 FTZ_MustBeOff, // Transformation is valid only if ftz is off. 1523 }; 1524 // Classes of NVVM intrinsics that can't be replaced one-to-one with a 1525 // target-generic intrinsic, cast op, or binary op but that we can nonetheless 1526 // simplify. 1527 enum SpecialCase { 1528 SPC_Reciprocal, 1529 }; 1530 1531 // SimplifyAction is a poor-man's variant (plus an additional flag) that 1532 // represents how to replace an NVVM intrinsic with target-generic LLVM IR. 1533 struct SimplifyAction { 1534 // Invariant: At most one of these Optionals has a value. 1535 Optional<Intrinsic::ID> IID; 1536 Optional<Instruction::CastOps> CastOp; 1537 Optional<Instruction::BinaryOps> BinaryOp; 1538 Optional<SpecialCase> Special; 1539 1540 FtzRequirementTy FtzRequirement = FTZ_Any; 1541 1542 SimplifyAction() = default; 1543 1544 SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) 1545 : IID(IID), FtzRequirement(FtzReq) {} 1546 1547 // Cast operations don't have anything to do with FTZ, so we skip that 1548 // argument. 1549 SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} 1550 1551 SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) 1552 : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} 1553 1554 SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) 1555 : Special(Special), FtzRequirement(FtzReq) {} 1556 }; 1557 1558 // Try to generate a SimplifyAction describing how to replace our 1559 // IntrinsicInstr with target-generic LLVM IR. 1560 const SimplifyAction Action = [II]() -> SimplifyAction { 1561 switch (II->getIntrinsicID()) { 1562 // NVVM intrinsics that map directly to LLVM intrinsics. 
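    // (As a rough naming guide, assuming the usual NVVM conventions: the
    // "_d" variants operate on double, "_f" on float, and "_ftz_f" on float
    // with flush-denormals-to-zero semantics; e.g. llvm.nvvm.fabs.ftz.f is
    // only mapped to llvm.fabs.f32 when ftz is known to be enabled.)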
1563 case Intrinsic::nvvm_ceil_d: 1564 return {Intrinsic::ceil, FTZ_Any}; 1565 case Intrinsic::nvvm_ceil_f: 1566 return {Intrinsic::ceil, FTZ_MustBeOff}; 1567 case Intrinsic::nvvm_ceil_ftz_f: 1568 return {Intrinsic::ceil, FTZ_MustBeOn}; 1569 case Intrinsic::nvvm_fabs_d: 1570 return {Intrinsic::fabs, FTZ_Any}; 1571 case Intrinsic::nvvm_fabs_f: 1572 return {Intrinsic::fabs, FTZ_MustBeOff}; 1573 case Intrinsic::nvvm_fabs_ftz_f: 1574 return {Intrinsic::fabs, FTZ_MustBeOn}; 1575 case Intrinsic::nvvm_floor_d: 1576 return {Intrinsic::floor, FTZ_Any}; 1577 case Intrinsic::nvvm_floor_f: 1578 return {Intrinsic::floor, FTZ_MustBeOff}; 1579 case Intrinsic::nvvm_floor_ftz_f: 1580 return {Intrinsic::floor, FTZ_MustBeOn}; 1581 case Intrinsic::nvvm_fma_rn_d: 1582 return {Intrinsic::fma, FTZ_Any}; 1583 case Intrinsic::nvvm_fma_rn_f: 1584 return {Intrinsic::fma, FTZ_MustBeOff}; 1585 case Intrinsic::nvvm_fma_rn_ftz_f: 1586 return {Intrinsic::fma, FTZ_MustBeOn}; 1587 case Intrinsic::nvvm_fmax_d: 1588 return {Intrinsic::maxnum, FTZ_Any}; 1589 case Intrinsic::nvvm_fmax_f: 1590 return {Intrinsic::maxnum, FTZ_MustBeOff}; 1591 case Intrinsic::nvvm_fmax_ftz_f: 1592 return {Intrinsic::maxnum, FTZ_MustBeOn}; 1593 case Intrinsic::nvvm_fmin_d: 1594 return {Intrinsic::minnum, FTZ_Any}; 1595 case Intrinsic::nvvm_fmin_f: 1596 return {Intrinsic::minnum, FTZ_MustBeOff}; 1597 case Intrinsic::nvvm_fmin_ftz_f: 1598 return {Intrinsic::minnum, FTZ_MustBeOn}; 1599 case Intrinsic::nvvm_round_d: 1600 return {Intrinsic::round, FTZ_Any}; 1601 case Intrinsic::nvvm_round_f: 1602 return {Intrinsic::round, FTZ_MustBeOff}; 1603 case Intrinsic::nvvm_round_ftz_f: 1604 return {Intrinsic::round, FTZ_MustBeOn}; 1605 case Intrinsic::nvvm_sqrt_rn_d: 1606 return {Intrinsic::sqrt, FTZ_Any}; 1607 case Intrinsic::nvvm_sqrt_f: 1608 // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the 1609 // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts 1610 // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are 1611 // the versions with explicit ftz-ness. 1612 return {Intrinsic::sqrt, FTZ_Any}; 1613 case Intrinsic::nvvm_sqrt_rn_f: 1614 return {Intrinsic::sqrt, FTZ_MustBeOff}; 1615 case Intrinsic::nvvm_sqrt_rn_ftz_f: 1616 return {Intrinsic::sqrt, FTZ_MustBeOn}; 1617 case Intrinsic::nvvm_trunc_d: 1618 return {Intrinsic::trunc, FTZ_Any}; 1619 case Intrinsic::nvvm_trunc_f: 1620 return {Intrinsic::trunc, FTZ_MustBeOff}; 1621 case Intrinsic::nvvm_trunc_ftz_f: 1622 return {Intrinsic::trunc, FTZ_MustBeOn}; 1623 1624 // NVVM intrinsics that map to LLVM cast operations. 1625 // 1626 // Note that llvm's target-generic conversion operators correspond to the rz 1627 // (round to zero) versions of the nvvm conversion intrinsics, even though 1628 // most everything else here uses the rn (round to nearest even) nvvm ops. 
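    // For instance (sketch):
    //   %i = call i32 @llvm.nvvm.f2i.rz(float %x)
    // is rewritten here as the plain cast
    //   %i = fptosi float %x to i32
    // since both round toward zero.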
1629 case Intrinsic::nvvm_d2i_rz: 1630 case Intrinsic::nvvm_f2i_rz: 1631 case Intrinsic::nvvm_d2ll_rz: 1632 case Intrinsic::nvvm_f2ll_rz: 1633 return {Instruction::FPToSI}; 1634 case Intrinsic::nvvm_d2ui_rz: 1635 case Intrinsic::nvvm_f2ui_rz: 1636 case Intrinsic::nvvm_d2ull_rz: 1637 case Intrinsic::nvvm_f2ull_rz: 1638 return {Instruction::FPToUI}; 1639 case Intrinsic::nvvm_i2d_rz: 1640 case Intrinsic::nvvm_i2f_rz: 1641 case Intrinsic::nvvm_ll2d_rz: 1642 case Intrinsic::nvvm_ll2f_rz: 1643 return {Instruction::SIToFP}; 1644 case Intrinsic::nvvm_ui2d_rz: 1645 case Intrinsic::nvvm_ui2f_rz: 1646 case Intrinsic::nvvm_ull2d_rz: 1647 case Intrinsic::nvvm_ull2f_rz: 1648 return {Instruction::UIToFP}; 1649 1650 // NVVM intrinsics that map to LLVM binary ops. 1651 case Intrinsic::nvvm_add_rn_d: 1652 return {Instruction::FAdd, FTZ_Any}; 1653 case Intrinsic::nvvm_add_rn_f: 1654 return {Instruction::FAdd, FTZ_MustBeOff}; 1655 case Intrinsic::nvvm_add_rn_ftz_f: 1656 return {Instruction::FAdd, FTZ_MustBeOn}; 1657 case Intrinsic::nvvm_mul_rn_d: 1658 return {Instruction::FMul, FTZ_Any}; 1659 case Intrinsic::nvvm_mul_rn_f: 1660 return {Instruction::FMul, FTZ_MustBeOff}; 1661 case Intrinsic::nvvm_mul_rn_ftz_f: 1662 return {Instruction::FMul, FTZ_MustBeOn}; 1663 case Intrinsic::nvvm_div_rn_d: 1664 return {Instruction::FDiv, FTZ_Any}; 1665 case Intrinsic::nvvm_div_rn_f: 1666 return {Instruction::FDiv, FTZ_MustBeOff}; 1667 case Intrinsic::nvvm_div_rn_ftz_f: 1668 return {Instruction::FDiv, FTZ_MustBeOn}; 1669 1670 // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but 1671 // need special handling. 1672 // 1673 // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just 1674 // as well. 1675 case Intrinsic::nvvm_rcp_rn_d: 1676 return {SPC_Reciprocal, FTZ_Any}; 1677 case Intrinsic::nvvm_rcp_rn_f: 1678 return {SPC_Reciprocal, FTZ_MustBeOff}; 1679 case Intrinsic::nvvm_rcp_rn_ftz_f: 1680 return {SPC_Reciprocal, FTZ_MustBeOn}; 1681 1682 // We do not currently simplify intrinsics that give an approximate answer. 1683 // These include: 1684 // 1685 // - nvvm_cos_approx_{f,ftz_f} 1686 // - nvvm_ex2_approx_{d,f,ftz_f} 1687 // - nvvm_lg2_approx_{d,f,ftz_f} 1688 // - nvvm_sin_approx_{f,ftz_f} 1689 // - nvvm_sqrt_approx_{f,ftz_f} 1690 // - nvvm_rsqrt_approx_{d,f,ftz_f} 1691 // - nvvm_div_approx_{ftz_d,ftz_f,f} 1692 // - nvvm_rcp_approx_ftz_d 1693 // 1694 // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" 1695 // means that fastmath is enabled in the intrinsic. Unfortunately only 1696 // binary operators (currently) have a fastmath bit in SelectionDAG, so this 1697 // information gets lost and we can't select on it. 1698 // 1699 // TODO: div and rcp are lowered to a binary op, so these we could in theory 1700 // lower them to "fast fdiv". 1701 1702 default: 1703 return {}; 1704 } 1705 }(); 1706 1707 // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we 1708 // can bail out now. (Notice that in the case that IID is not an NVVM 1709 // intrinsic, we don't have to look up any module metadata, as 1710 // FtzRequirementTy will be FTZ_Any.) 1711 if (Action.FtzRequirement != FTZ_Any) { 1712 bool FtzEnabled = 1713 II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() == 1714 "true"; 1715 1716 if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) 1717 return nullptr; 1718 } 1719 1720 // Simplify to target-generic intrinsic. 
1721 if (Action.IID) { 1722 SmallVector<Value *, 4> Args(II->arg_operands()); 1723 // All the target-generic intrinsics currently of interest to us have one 1724 // type argument, equal to that of the nvvm intrinsic's argument. 1725 Type *Tys[] = {II->getArgOperand(0)->getType()}; 1726 return CallInst::Create( 1727 Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); 1728 } 1729 1730 // Simplify to target-generic binary op. 1731 if (Action.BinaryOp) 1732 return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), 1733 II->getArgOperand(1), II->getName()); 1734 1735 // Simplify to target-generic cast op. 1736 if (Action.CastOp) 1737 return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), 1738 II->getName()); 1739 1740 // All that's left are the special cases. 1741 if (!Action.Special) 1742 return nullptr; 1743 1744 switch (*Action.Special) { 1745 case SPC_Reciprocal: 1746 // Simplify reciprocal. 1747 return BinaryOperator::Create( 1748 Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), 1749 II->getArgOperand(0), II->getName()); 1750 } 1751 llvm_unreachable("All SpecialCase enumerators should be handled in switch."); 1752} 1753 1754Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) { 1755 removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this); 1756 return nullptr; 1757} 1758 1759Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) { 1760 removeTriviallyEmptyRange(I, Intrinsic::vacopy, Intrinsic::vaend, *this); 1761 return nullptr; 1762} 1763 1764static Instruction *canonicalizeConstantArg0ToArg1(CallInst &Call) { 1765 assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap"); 1766 Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1); 1767 if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) { 1768 Call.setArgOperand(0, Arg1); 1769 Call.setArgOperand(1, Arg0); 1770 return &Call; 1771 } 1772 return nullptr; 1773} 1774 1775Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { 1776 WithOverflowInst *WO = cast<WithOverflowInst>(II); 1777 Value *OperationResult = nullptr; 1778 Constant *OverflowResult = nullptr; 1779 if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(), 1780 WO->getRHS(), *WO, OperationResult, OverflowResult)) 1781 return CreateOverflowTuple(WO, OperationResult, OverflowResult); 1782 return nullptr; 1783} 1784 1785/// CallInst simplification. This mostly only handles folding of intrinsic 1786/// instructions. For normal calls, it allows visitCallBase to do the heavy 1787/// lifting. 1788Instruction *InstCombiner::visitCallInst(CallInst &CI) { 1789 if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI))) 1790 return replaceInstUsesWith(CI, V); 1791 1792 if (isFreeCall(&CI, &TLI)) 1793 return visitFree(CI); 1794 1795 // If the caller function is nounwind, mark the call as nounwind, even if the 1796 // callee isn't. 1797 if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) { 1798 CI.setDoesNotThrow(); 1799 return &CI; 1800 } 1801 1802 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI); 1803 if (!II) return visitCallBase(CI); 1804 1805 // Intrinsics cannot occur in an invoke or a callbr, so handle them here 1806 // instead of in visitCallBase. 1807 if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) { 1808 bool Changed = false; 1809 1810 // memmove/cpy/set of zero bytes is a noop. 
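    // e.g. (sketch)
    //   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 0, i1 false)
    // can simply be erased.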
1811 if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) { 1812 if (NumBytes->isNullValue()) 1813 return eraseInstFromFunction(CI); 1814 1815 if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) 1816 if (CI->getZExtValue() == 1) { 1817 // Replace the instruction with just byte operations. We would 1818 // transform other cases to loads/stores, but we don't know if 1819 // alignment is sufficient. 1820 } 1821 } 1822 1823 // No other transformations apply to volatile transfers. 1824 if (auto *M = dyn_cast<MemIntrinsic>(MI)) 1825 if (M->isVolatile()) 1826 return nullptr; 1827 1828 // If we have a memmove and the source operation is a constant global, 1829 // then the source and dest pointers can't alias, so we can change this 1830 // into a call to memcpy. 1831 if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) { 1832 if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) 1833 if (GVSrc->isConstant()) { 1834 Module *M = CI.getModule(); 1835 Intrinsic::ID MemCpyID = 1836 isa<AtomicMemMoveInst>(MMI) 1837 ? Intrinsic::memcpy_element_unordered_atomic 1838 : Intrinsic::memcpy; 1839 Type *Tys[3] = { CI.getArgOperand(0)->getType(), 1840 CI.getArgOperand(1)->getType(), 1841 CI.getArgOperand(2)->getType() }; 1842 CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys)); 1843 Changed = true; 1844 } 1845 } 1846 1847 if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) { 1848 // memmove(x,x,size) -> noop. 1849 if (MTI->getSource() == MTI->getDest()) 1850 return eraseInstFromFunction(CI); 1851 } 1852 1853 // If we can determine a pointer alignment that is bigger than currently 1854 // set, update the alignment. 1855 if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) { 1856 if (Instruction *I = SimplifyAnyMemTransfer(MTI)) 1857 return I; 1858 } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) { 1859 if (Instruction *I = SimplifyAnyMemSet(MSI)) 1860 return I; 1861 } 1862 1863 if (Changed) return II; 1864 } 1865 1866 // For vector result intrinsics, use the generic demanded vector support. 
1867 if (II->getType()->isVectorTy()) { 1868 auto VWidth = II->getType()->getVectorNumElements(); 1869 APInt UndefElts(VWidth, 0); 1870 APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); 1871 if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { 1872 if (V != II) 1873 return replaceInstUsesWith(*II, V); 1874 return II; 1875 } 1876 } 1877 1878 if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) 1879 return I; 1880 1881 auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, 1882 unsigned DemandedWidth) { 1883 APInt UndefElts(Width, 0); 1884 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 1885 return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 1886 }; 1887 1888 Intrinsic::ID IID = II->getIntrinsicID(); 1889 switch (IID) { 1890 default: break; 1891 case Intrinsic::objectsize: 1892 if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) 1893 return replaceInstUsesWith(CI, V); 1894 return nullptr; 1895 case Intrinsic::bswap: { 1896 Value *IIOperand = II->getArgOperand(0); 1897 Value *X = nullptr; 1898 1899 // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) 1900 if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { 1901 unsigned C = X->getType()->getPrimitiveSizeInBits() - 1902 IIOperand->getType()->getPrimitiveSizeInBits(); 1903 Value *CV = ConstantInt::get(X->getType(), C); 1904 Value *V = Builder.CreateLShr(X, CV); 1905 return new TruncInst(V, IIOperand->getType()); 1906 } 1907 break; 1908 } 1909 case Intrinsic::masked_load: 1910 if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II)) 1911 return replaceInstUsesWith(CI, SimplifiedMaskedOp); 1912 break; 1913 case Intrinsic::masked_store: 1914 return simplifyMaskedStore(*II); 1915 case Intrinsic::masked_gather: 1916 return simplifyMaskedGather(*II); 1917 case Intrinsic::masked_scatter: 1918 return simplifyMaskedScatter(*II); 1919 case Intrinsic::launder_invariant_group: 1920 case Intrinsic::strip_invariant_group: 1921 if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this)) 1922 return replaceInstUsesWith(*II, SkippedBarrier); 1923 break; 1924 case Intrinsic::powi: 1925 if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 1926 // 0 and 1 are handled in instsimplify 1927 1928 // powi(x, -1) -> 1/x 1929 if (Power->isMinusOne()) 1930 return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0), 1931 II->getArgOperand(0)); 1932 // powi(x, 2) -> x*x 1933 if (Power->equalsInt(2)) 1934 return BinaryOperator::CreateFMul(II->getArgOperand(0), 1935 II->getArgOperand(0)); 1936 } 1937 break; 1938 1939 case Intrinsic::cttz: 1940 case Intrinsic::ctlz: 1941 if (auto *I = foldCttzCtlz(*II, *this)) 1942 return I; 1943 break; 1944 1945 case Intrinsic::ctpop: 1946 if (auto *I = foldCtpop(*II, *this)) 1947 return I; 1948 break; 1949 1950 case Intrinsic::fshl: 1951 case Intrinsic::fshr: { 1952 Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1); 1953 Type *Ty = II->getType(); 1954 unsigned BitWidth = Ty->getScalarSizeInBits(); 1955 Constant *ShAmtC; 1956 if (match(II->getArgOperand(2), m_Constant(ShAmtC)) && 1957 !isa<ConstantExpr>(ShAmtC) && !ShAmtC->containsConstantExpression()) { 1958 // Canonicalize a shift amount constant operand to modulo the bit-width. 
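      // e.g. (sketch)
      //   %r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 37)
      // is canonicalized to a shift amount of 5, because the funnel-shift
      // amount is defined modulo the bit width (37 urem 32 == 5).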
1959 Constant *WidthC = ConstantInt::get(Ty, BitWidth); 1960 Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC); 1961 if (ModuloC != ShAmtC) { 1962 II->setArgOperand(2, ModuloC); 1963 return II; 1964 } 1965 assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) == 1966 ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) && 1967 "Shift amount expected to be modulo bitwidth"); 1968 1969 // Canonicalize funnel shift right by constant to funnel shift left. This 1970 // is not entirely arbitrary. For historical reasons, the backend may 1971 // recognize rotate left patterns but miss rotate right patterns. 1972 if (IID == Intrinsic::fshr) { 1973 // fshr X, Y, C --> fshl X, Y, (BitWidth - C) 1974 Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC); 1975 Module *Mod = II->getModule(); 1976 Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty); 1977 return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC }); 1978 } 1979 assert(IID == Intrinsic::fshl && 1980 "All funnel shifts by simple constants should go left"); 1981 1982 // fshl(X, 0, C) --> shl X, C 1983 // fshl(X, undef, C) --> shl X, C 1984 if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef())) 1985 return BinaryOperator::CreateShl(Op0, ShAmtC); 1986 1987 // fshl(0, X, C) --> lshr X, (BW-C) 1988 // fshl(undef, X, C) --> lshr X, (BW-C) 1989 if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef())) 1990 return BinaryOperator::CreateLShr(Op1, 1991 ConstantExpr::getSub(WidthC, ShAmtC)); 1992 1993 // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form) 1994 if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) { 1995 Module *Mod = II->getModule(); 1996 Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty); 1997 return CallInst::Create(Bswap, { Op0 }); 1998 } 1999 } 2000 2001 // Left or right might be masked. 2002 if (SimplifyDemandedInstructionBits(*II)) 2003 return &CI; 2004 2005 // The shift amount (operand 2) of a funnel shift is modulo the bitwidth, 2006 // so only the low bits of the shift amount are demanded if the bitwidth is 2007 // a power-of-2. 2008 if (!isPowerOf2_32(BitWidth)) 2009 break; 2010 APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth)); 2011 KnownBits Op2Known(BitWidth); 2012 if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known)) 2013 return &CI; 2014 break; 2015 } 2016 case Intrinsic::uadd_with_overflow: 2017 case Intrinsic::sadd_with_overflow: { 2018 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2019 return I; 2020 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2021 return I; 2022 2023 // Given 2 constant operands whose sum does not overflow: 2024 // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1 2025 // saddo (X +nsw C0), C1 -> saddo X, C0 + C1 2026 Value *X; 2027 const APInt *C0, *C1; 2028 Value *Arg0 = II->getArgOperand(0); 2029 Value *Arg1 = II->getArgOperand(1); 2030 bool IsSigned = IID == Intrinsic::sadd_with_overflow; 2031 bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0))) 2032 : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0))); 2033 if (HasNWAdd && match(Arg1, m_APInt(C1))) { 2034 bool Overflow; 2035 APInt NewC = 2036 IsSigned ? 
C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow); 2037 if (!Overflow) 2038 return replaceInstUsesWith( 2039 *II, Builder.CreateBinaryIntrinsic( 2040 IID, X, ConstantInt::get(Arg1->getType(), NewC))); 2041 } 2042 break; 2043 } 2044 2045 case Intrinsic::umul_with_overflow: 2046 case Intrinsic::smul_with_overflow: 2047 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2048 return I; 2049 LLVM_FALLTHROUGH; 2050 2051 case Intrinsic::usub_with_overflow: 2052 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2053 return I; 2054 break; 2055 2056 case Intrinsic::ssub_with_overflow: { 2057 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2058 return I; 2059 2060 Constant *C; 2061 Value *Arg0 = II->getArgOperand(0); 2062 Value *Arg1 = II->getArgOperand(1); 2063 // Given a constant C that is not the minimum signed value 2064 // for an integer of a given bit width: 2065 // 2066 // ssubo X, C -> saddo X, -C 2067 if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) { 2068 Value *NegVal = ConstantExpr::getNeg(C); 2069 // Build a saddo call that is equivalent to the discovered 2070 // ssubo call. 2071 return replaceInstUsesWith( 2072 *II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, 2073 Arg0, NegVal)); 2074 } 2075 2076 break; 2077 } 2078 2079 case Intrinsic::uadd_sat: 2080 case Intrinsic::sadd_sat: 2081 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2082 return I; 2083 LLVM_FALLTHROUGH; 2084 case Intrinsic::usub_sat: 2085 case Intrinsic::ssub_sat: { 2086 SaturatingInst *SI = cast<SaturatingInst>(II); 2087 Type *Ty = SI->getType(); 2088 Value *Arg0 = SI->getLHS(); 2089 Value *Arg1 = SI->getRHS(); 2090 2091 // Make use of known overflow information. 2092 OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(), 2093 Arg0, Arg1, SI); 2094 switch (OR) { 2095 case OverflowResult::MayOverflow: 2096 break; 2097 case OverflowResult::NeverOverflows: 2098 if (SI->isSigned()) 2099 return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1); 2100 else 2101 return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1); 2102 case OverflowResult::AlwaysOverflowsLow: { 2103 unsigned BitWidth = Ty->getScalarSizeInBits(); 2104 APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned()); 2105 return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min)); 2106 } 2107 case OverflowResult::AlwaysOverflowsHigh: { 2108 unsigned BitWidth = Ty->getScalarSizeInBits(); 2109 APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned()); 2110 return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max)); 2111 } 2112 } 2113 2114 // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN 2115 Constant *C; 2116 if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) && 2117 C->isNotMinSignedValue()) { 2118 Value *NegVal = ConstantExpr::getNeg(C); 2119 return replaceInstUsesWith( 2120 *II, Builder.CreateBinaryIntrinsic( 2121 Intrinsic::sadd_sat, Arg0, NegVal)); 2122 } 2123 2124 // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2)) 2125 // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2)) 2126 // if Val and Val2 have the same sign 2127 if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) { 2128 Value *X; 2129 const APInt *Val, *Val2; 2130 APInt NewVal; 2131 bool IsUnsigned = 2132 IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat; 2133 if (Other->getIntrinsicID() == IID && 2134 match(Arg1, m_APInt(Val)) && 2135 match(Other->getArgOperand(0), m_Value(X)) && 2136 match(Other->getArgOperand(1), m_APInt(Val2))) { 2137 if (IsUnsigned) 2138 NewVal = Val->uadd_sat(*Val2); 2139 else 
if (Val->isNonNegative() == Val2->isNonNegative()) { 2140 bool Overflow; 2141 NewVal = Val->sadd_ov(*Val2, Overflow); 2142 if (Overflow) { 2143 // Both adds together may add more than SignedMaxValue 2144 // without saturating the final result. 2145 break; 2146 } 2147 } else { 2148 // Cannot fold saturated addition with different signs. 2149 break; 2150 } 2151 2152 return replaceInstUsesWith( 2153 *II, Builder.CreateBinaryIntrinsic( 2154 IID, X, ConstantInt::get(II->getType(), NewVal))); 2155 } 2156 } 2157 break; 2158 } 2159 2160 case Intrinsic::minnum: 2161 case Intrinsic::maxnum: 2162 case Intrinsic::minimum: 2163 case Intrinsic::maximum: { 2164 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2165 return I; 2166 Value *Arg0 = II->getArgOperand(0); 2167 Value *Arg1 = II->getArgOperand(1); 2168 Value *X, *Y; 2169 if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) && 2170 (Arg0->hasOneUse() || Arg1->hasOneUse())) { 2171 // If both operands are negated, invert the call and negate the result: 2172 // min(-X, -Y) --> -(max(X, Y)) 2173 // max(-X, -Y) --> -(min(X, Y)) 2174 Intrinsic::ID NewIID; 2175 switch (IID) { 2176 case Intrinsic::maxnum: 2177 NewIID = Intrinsic::minnum; 2178 break; 2179 case Intrinsic::minnum: 2180 NewIID = Intrinsic::maxnum; 2181 break; 2182 case Intrinsic::maximum: 2183 NewIID = Intrinsic::minimum; 2184 break; 2185 case Intrinsic::minimum: 2186 NewIID = Intrinsic::maximum; 2187 break; 2188 default: 2189 llvm_unreachable("unexpected intrinsic ID"); 2190 } 2191 Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II); 2192 Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall); 2193 FNeg->copyIRFlags(II); 2194 return FNeg; 2195 } 2196 2197 // m(m(X, C2), C1) -> m(X, C) 2198 const APFloat *C1, *C2; 2199 if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) { 2200 if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) && 2201 ((match(M->getArgOperand(0), m_Value(X)) && 2202 match(M->getArgOperand(1), m_APFloat(C2))) || 2203 (match(M->getArgOperand(1), m_Value(X)) && 2204 match(M->getArgOperand(0), m_APFloat(C2))))) { 2205 APFloat Res(0.0); 2206 switch (IID) { 2207 case Intrinsic::maxnum: 2208 Res = maxnum(*C1, *C2); 2209 break; 2210 case Intrinsic::minnum: 2211 Res = minnum(*C1, *C2); 2212 break; 2213 case Intrinsic::maximum: 2214 Res = maximum(*C1, *C2); 2215 break; 2216 case Intrinsic::minimum: 2217 Res = minimum(*C1, *C2); 2218 break; 2219 default: 2220 llvm_unreachable("unexpected intrinsic ID"); 2221 } 2222 Instruction *NewCall = Builder.CreateBinaryIntrinsic( 2223 IID, X, ConstantFP::get(Arg0->getType(), Res)); 2224 NewCall->copyIRFlags(II); 2225 return replaceInstUsesWith(*II, NewCall); 2226 } 2227 } 2228 2229 break; 2230 } 2231 case Intrinsic::fmuladd: { 2232 // Canonicalize fast fmuladd to the separate fmul + fadd. 2233 if (II->isFast()) { 2234 BuilderTy::FastMathFlagGuard Guard(Builder); 2235 Builder.setFastMathFlags(II->getFastMathFlags()); 2236 Value *Mul = Builder.CreateFMul(II->getArgOperand(0), 2237 II->getArgOperand(1)); 2238 Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2)); 2239 Add->takeName(II); 2240 return replaceInstUsesWith(*II, Add); 2241 } 2242 2243 // Try to simplify the underlying FMul. 
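    // e.g. (sketch) for llvm.fmuladd(%x, 1.0, %z) the multiply simplifies to
    // %x, so the whole call can become a plain fadd of %x and %z.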
2244 if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1), 2245 II->getFastMathFlags(), 2246 SQ.getWithInstruction(II))) { 2247 auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); 2248 FAdd->copyFastMathFlags(II); 2249 return FAdd; 2250 } 2251 2252 LLVM_FALLTHROUGH; 2253 } 2254 case Intrinsic::fma: { 2255 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2256 return I; 2257 2258 // fma fneg(x), fneg(y), z -> fma x, y, z 2259 Value *Src0 = II->getArgOperand(0); 2260 Value *Src1 = II->getArgOperand(1); 2261 Value *X, *Y; 2262 if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) { 2263 II->setArgOperand(0, X); 2264 II->setArgOperand(1, Y); 2265 return II; 2266 } 2267 2268 // fma fabs(x), fabs(x), z -> fma x, x, z 2269 if (match(Src0, m_FAbs(m_Value(X))) && 2270 match(Src1, m_FAbs(m_Specific(X)))) { 2271 II->setArgOperand(0, X); 2272 II->setArgOperand(1, X); 2273 return II; 2274 } 2275 2276 // Try to simplify the underlying FMul. We can only apply simplifications 2277 // that do not require rounding. 2278 if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1), 2279 II->getFastMathFlags(), 2280 SQ.getWithInstruction(II))) { 2281 auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); 2282 FAdd->copyFastMathFlags(II); 2283 return FAdd; 2284 } 2285 2286 break; 2287 } 2288 case Intrinsic::copysign: { 2289 if (SignBitMustBeZero(II->getArgOperand(1), &TLI)) { 2290 // If we know that the sign argument is positive, reduce to FABS: 2291 // copysign X, Pos --> fabs X 2292 Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, 2293 II->getArgOperand(0), II); 2294 return replaceInstUsesWith(*II, Fabs); 2295 } 2296 // TODO: There should be a ValueTracking sibling like SignBitMustBeOne. 
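    // e.g. (sketch)
    //   %r = call float @llvm.copysign.f32(float %x, float -2.0)
    // becomes, conceptually, %a = fabs(%x); %r = fneg(%a).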
2297 const APFloat *C; 2298 if (match(II->getArgOperand(1), m_APFloat(C)) && C->isNegative()) { 2299 // If we know that the sign argument is negative, reduce to FNABS: 2300 // copysign X, Neg --> fneg (fabs X) 2301 Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, 2302 II->getArgOperand(0), II); 2303 return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II)); 2304 } 2305 2306 // Propagate sign argument through nested calls: 2307 // copysign X, (copysign ?, SignArg) --> copysign X, SignArg 2308 Value *SignArg; 2309 if (match(II->getArgOperand(1), 2310 m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(SignArg)))) { 2311 II->setArgOperand(1, SignArg); 2312 return II; 2313 } 2314 2315 break; 2316 } 2317 case Intrinsic::fabs: { 2318 Value *Cond; 2319 Constant *LHS, *RHS; 2320 if (match(II->getArgOperand(0), 2321 m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) { 2322 CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS}); 2323 CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS}); 2324 return SelectInst::Create(Cond, Call0, Call1); 2325 } 2326 2327 LLVM_FALLTHROUGH; 2328 } 2329 case Intrinsic::ceil: 2330 case Intrinsic::floor: 2331 case Intrinsic::round: 2332 case Intrinsic::nearbyint: 2333 case Intrinsic::rint: 2334 case Intrinsic::trunc: { 2335 Value *ExtSrc; 2336 if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) { 2337 // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x) 2338 Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II); 2339 return new FPExtInst(NarrowII, II->getType()); 2340 } 2341 break; 2342 } 2343 case Intrinsic::cos: 2344 case Intrinsic::amdgcn_cos: { 2345 Value *X; 2346 Value *Src = II->getArgOperand(0); 2347 if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) { 2348 // cos(-x) -> cos(x) 2349 // cos(fabs(x)) -> cos(x) 2350 II->setArgOperand(0, X); 2351 return II; 2352 } 2353 break; 2354 } 2355 case Intrinsic::sin: { 2356 Value *X; 2357 if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) { 2358 // sin(-x) --> -sin(x) 2359 Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II); 2360 Instruction *FNeg = BinaryOperator::CreateFNeg(NewSin); 2361 FNeg->copyFastMathFlags(II); 2362 return FNeg; 2363 } 2364 break; 2365 } 2366 case Intrinsic::ppc_altivec_lvx: 2367 case Intrinsic::ppc_altivec_lvxl: 2368 // Turn PPC lvx -> load if the pointer is known aligned. 2369 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2370 &DT) >= 16) { 2371 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2372 PointerType::getUnqual(II->getType())); 2373 return new LoadInst(II->getType(), Ptr); 2374 } 2375 break; 2376 case Intrinsic::ppc_vsx_lxvw4x: 2377 case Intrinsic::ppc_vsx_lxvd2x: { 2378 // Turn PPC VSX loads into normal loads. 2379 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2380 PointerType::getUnqual(II->getType())); 2381 return new LoadInst(II->getType(), Ptr, Twine(""), false, Align::None()); 2382 } 2383 case Intrinsic::ppc_altivec_stvx: 2384 case Intrinsic::ppc_altivec_stvxl: 2385 // Turn stvx -> store if the pointer is known aligned. 
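    // (Sketch of the rationale: lvx/stvx ignore the low four bits of the
    // effective address in hardware, so replacing them with a plain
    // load/store is only equivalent when the pointer is known to be at
    // least 16-byte aligned.)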
2386 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2387 &DT) >= 16) { 2388 Type *OpPtrTy = 2389 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2390 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2391 return new StoreInst(II->getArgOperand(0), Ptr); 2392 } 2393 break; 2394 case Intrinsic::ppc_vsx_stxvw4x: 2395 case Intrinsic::ppc_vsx_stxvd2x: { 2396 // Turn PPC VSX stores into normal stores. 2397 Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); 2398 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2399 return new StoreInst(II->getArgOperand(0), Ptr, false, Align::None()); 2400 } 2401 case Intrinsic::ppc_qpx_qvlfs: 2402 // Turn PPC QPX qvlfs -> load if the pointer is known aligned. 2403 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2404 &DT) >= 16) { 2405 Type *VTy = VectorType::get(Builder.getFloatTy(), 2406 II->getType()->getVectorNumElements()); 2407 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2408 PointerType::getUnqual(VTy)); 2409 Value *Load = Builder.CreateLoad(VTy, Ptr); 2410 return new FPExtInst(Load, II->getType()); 2411 } 2412 break; 2413 case Intrinsic::ppc_qpx_qvlfd: 2414 // Turn PPC QPX qvlfd -> load if the pointer is known aligned. 2415 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC, 2416 &DT) >= 32) { 2417 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2418 PointerType::getUnqual(II->getType())); 2419 return new LoadInst(II->getType(), Ptr); 2420 } 2421 break; 2422 case Intrinsic::ppc_qpx_qvstfs: 2423 // Turn PPC QPX qvstfs -> store if the pointer is known aligned. 2424 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2425 &DT) >= 16) { 2426 Type *VTy = VectorType::get(Builder.getFloatTy(), 2427 II->getArgOperand(0)->getType()->getVectorNumElements()); 2428 Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy); 2429 Type *OpPtrTy = PointerType::getUnqual(VTy); 2430 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2431 return new StoreInst(TOp, Ptr); 2432 } 2433 break; 2434 case Intrinsic::ppc_qpx_qvstfd: 2435 // Turn PPC QPX qvstfd -> store if the pointer is known aligned. 2436 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC, 2437 &DT) >= 32) { 2438 Type *OpPtrTy = 2439 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2440 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2441 return new StoreInst(II->getArgOperand(0), Ptr); 2442 } 2443 break; 2444 2445 case Intrinsic::x86_bmi_bextr_32: 2446 case Intrinsic::x86_bmi_bextr_64: 2447 case Intrinsic::x86_tbm_bextri_u32: 2448 case Intrinsic::x86_tbm_bextri_u64: 2449 // If the RHS is a constant we can try some simplifications. 2450 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2451 uint64_t Shift = C->getZExtValue(); 2452 uint64_t Length = (Shift >> 8) & 0xff; 2453 Shift &= 0xff; 2454 unsigned BitWidth = II->getType()->getIntegerBitWidth(); 2455 // If the length is 0 or the shift is out of range, replace with zero. 2456 if (Length == 0 || Shift >= BitWidth) 2457 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2458 // If the LHS is also a constant, we can completely constant fold this. 
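      // e.g. (sketch) a control value of (8 << 8) | 4 means "extract 8 bits
      // starting at bit 4", so for a source of 0x12345678 the result is
      // (0x12345678 >> 4) & 0xff == 0x67.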
2459 if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2460 uint64_t Result = InC->getZExtValue() >> Shift; 2461 if (Length > BitWidth) 2462 Length = BitWidth; 2463 Result &= maskTrailingOnes<uint64_t>(Length); 2464 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2465 } 2466 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we 2467 // are only masking bits that a shift already cleared? 2468 } 2469 break; 2470 2471 case Intrinsic::x86_bmi_bzhi_32: 2472 case Intrinsic::x86_bmi_bzhi_64: 2473 // If the RHS is a constant we can try some simplifications. 2474 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2475 uint64_t Index = C->getZExtValue() & 0xff; 2476 unsigned BitWidth = II->getType()->getIntegerBitWidth(); 2477 if (Index >= BitWidth) 2478 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2479 if (Index == 0) 2480 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2481 // If the LHS is also a constant, we can completely constant fold this. 2482 if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2483 uint64_t Result = InC->getZExtValue(); 2484 Result &= maskTrailingOnes<uint64_t>(Index); 2485 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2486 } 2487 // TODO should we convert this to an AND if the RHS is constant? 2488 } 2489 break; 2490 case Intrinsic::x86_bmi_pext_32: 2491 case Intrinsic::x86_bmi_pext_64: 2492 if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2493 if (MaskC->isNullValue()) 2494 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2495 if (MaskC->isAllOnesValue()) 2496 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2497 2498 if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2499 uint64_t Src = SrcC->getZExtValue(); 2500 uint64_t Mask = MaskC->getZExtValue(); 2501 uint64_t Result = 0; 2502 uint64_t BitToSet = 1; 2503 2504 while (Mask) { 2505 // Isolate lowest set bit. 2506 uint64_t BitToTest = Mask & -Mask; 2507 if (BitToTest & Src) 2508 Result |= BitToSet; 2509 2510 BitToSet <<= 1; 2511 // Clear lowest set bit. 2512 Mask &= Mask - 1; 2513 } 2514 2515 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2516 } 2517 } 2518 break; 2519 case Intrinsic::x86_bmi_pdep_32: 2520 case Intrinsic::x86_bmi_pdep_64: 2521 if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2522 if (MaskC->isNullValue()) 2523 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2524 if (MaskC->isAllOnesValue()) 2525 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2526 2527 if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2528 uint64_t Src = SrcC->getZExtValue(); 2529 uint64_t Mask = MaskC->getZExtValue(); 2530 uint64_t Result = 0; 2531 uint64_t BitToTest = 1; 2532 2533 while (Mask) { 2534 // Isolate lowest set bit. 
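          // (Mask & -Mask keeps only the lowest set bit, e.g.
          //  0b1011000 & -0b1011000 == 0b0001000.)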
2535 uint64_t BitToSet = Mask & -Mask; 2536 if (BitToTest & Src) 2537 Result |= BitToSet; 2538 2539 BitToTest <<= 1; 2540 // Clear lowest set bit; 2541 Mask &= Mask - 1; 2542 } 2543 2544 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2545 } 2546 } 2547 break; 2548 2549 case Intrinsic::x86_vcvtph2ps_128: 2550 case Intrinsic::x86_vcvtph2ps_256: { 2551 auto Arg = II->getArgOperand(0); 2552 auto ArgType = cast<VectorType>(Arg->getType()); 2553 auto RetType = cast<VectorType>(II->getType()); 2554 unsigned ArgWidth = ArgType->getNumElements(); 2555 unsigned RetWidth = RetType->getNumElements(); 2556 assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); 2557 assert(ArgType->isIntOrIntVectorTy() && 2558 ArgType->getScalarSizeInBits() == 16 && 2559 "CVTPH2PS input type should be 16-bit integer vector"); 2560 assert(RetType->getScalarType()->isFloatTy() && 2561 "CVTPH2PS output type should be 32-bit float vector"); 2562 2563 // Constant folding: Convert to generic half to single conversion. 2564 if (isa<ConstantAggregateZero>(Arg)) 2565 return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); 2566 2567 if (isa<ConstantDataVector>(Arg)) { 2568 auto VectorHalfAsShorts = Arg; 2569 if (RetWidth < ArgWidth) { 2570 SmallVector<uint32_t, 8> SubVecMask; 2571 for (unsigned i = 0; i != RetWidth; ++i) 2572 SubVecMask.push_back((int)i); 2573 VectorHalfAsShorts = Builder.CreateShuffleVector( 2574 Arg, UndefValue::get(ArgType), SubVecMask); 2575 } 2576 2577 auto VectorHalfType = 2578 VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); 2579 auto VectorHalfs = 2580 Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType); 2581 auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType); 2582 return replaceInstUsesWith(*II, VectorFloats); 2583 } 2584 2585 // We only use the lowest lanes of the argument. 2586 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { 2587 II->setArgOperand(0, V); 2588 return II; 2589 } 2590 break; 2591 } 2592 2593 case Intrinsic::x86_sse_cvtss2si: 2594 case Intrinsic::x86_sse_cvtss2si64: 2595 case Intrinsic::x86_sse_cvttss2si: 2596 case Intrinsic::x86_sse_cvttss2si64: 2597 case Intrinsic::x86_sse2_cvtsd2si: 2598 case Intrinsic::x86_sse2_cvtsd2si64: 2599 case Intrinsic::x86_sse2_cvttsd2si: 2600 case Intrinsic::x86_sse2_cvttsd2si64: 2601 case Intrinsic::x86_avx512_vcvtss2si32: 2602 case Intrinsic::x86_avx512_vcvtss2si64: 2603 case Intrinsic::x86_avx512_vcvtss2usi32: 2604 case Intrinsic::x86_avx512_vcvtss2usi64: 2605 case Intrinsic::x86_avx512_vcvtsd2si32: 2606 case Intrinsic::x86_avx512_vcvtsd2si64: 2607 case Intrinsic::x86_avx512_vcvtsd2usi32: 2608 case Intrinsic::x86_avx512_vcvtsd2usi64: 2609 case Intrinsic::x86_avx512_cvttss2si: 2610 case Intrinsic::x86_avx512_cvttss2si64: 2611 case Intrinsic::x86_avx512_cvttss2usi: 2612 case Intrinsic::x86_avx512_cvttss2usi64: 2613 case Intrinsic::x86_avx512_cvttsd2si: 2614 case Intrinsic::x86_avx512_cvttsd2si64: 2615 case Intrinsic::x86_avx512_cvttsd2usi: 2616 case Intrinsic::x86_avx512_cvttsd2usi64: { 2617 // These intrinsics only demand the 0th element of their input vectors. If 2618 // we can simplify the input based on that, do so now. 
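    // e.g. (sketch) for
    //   %i = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v)
    // only lane 0 of %v is read, so anything feeding only lanes 1-3 can be
    // dropped by the demanded-elements analysis.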
2619 Value *Arg = II->getArgOperand(0); 2620 unsigned VWidth = Arg->getType()->getVectorNumElements(); 2621 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 2622 II->setArgOperand(0, V); 2623 return II; 2624 } 2625 break; 2626 } 2627 2628 case Intrinsic::x86_mmx_pmovmskb: 2629 case Intrinsic::x86_sse_movmsk_ps: 2630 case Intrinsic::x86_sse2_movmsk_pd: 2631 case Intrinsic::x86_sse2_pmovmskb_128: 2632 case Intrinsic::x86_avx_movmsk_pd_256: 2633 case Intrinsic::x86_avx_movmsk_ps_256: 2634 case Intrinsic::x86_avx2_pmovmskb: 2635 if (Value *V = simplifyX86movmsk(*II, Builder)) 2636 return replaceInstUsesWith(*II, V); 2637 break; 2638 2639 case Intrinsic::x86_sse_comieq_ss: 2640 case Intrinsic::x86_sse_comige_ss: 2641 case Intrinsic::x86_sse_comigt_ss: 2642 case Intrinsic::x86_sse_comile_ss: 2643 case Intrinsic::x86_sse_comilt_ss: 2644 case Intrinsic::x86_sse_comineq_ss: 2645 case Intrinsic::x86_sse_ucomieq_ss: 2646 case Intrinsic::x86_sse_ucomige_ss: 2647 case Intrinsic::x86_sse_ucomigt_ss: 2648 case Intrinsic::x86_sse_ucomile_ss: 2649 case Intrinsic::x86_sse_ucomilt_ss: 2650 case Intrinsic::x86_sse_ucomineq_ss: 2651 case Intrinsic::x86_sse2_comieq_sd: 2652 case Intrinsic::x86_sse2_comige_sd: 2653 case Intrinsic::x86_sse2_comigt_sd: 2654 case Intrinsic::x86_sse2_comile_sd: 2655 case Intrinsic::x86_sse2_comilt_sd: 2656 case Intrinsic::x86_sse2_comineq_sd: 2657 case Intrinsic::x86_sse2_ucomieq_sd: 2658 case Intrinsic::x86_sse2_ucomige_sd: 2659 case Intrinsic::x86_sse2_ucomigt_sd: 2660 case Intrinsic::x86_sse2_ucomile_sd: 2661 case Intrinsic::x86_sse2_ucomilt_sd: 2662 case Intrinsic::x86_sse2_ucomineq_sd: 2663 case Intrinsic::x86_avx512_vcomi_ss: 2664 case Intrinsic::x86_avx512_vcomi_sd: 2665 case Intrinsic::x86_avx512_mask_cmp_ss: 2666 case Intrinsic::x86_avx512_mask_cmp_sd: { 2667 // These intrinsics only demand the 0th element of their input vectors. If 2668 // we can simplify the input based on that, do so now. 2669 bool MadeChange = false; 2670 Value *Arg0 = II->getArgOperand(0); 2671 Value *Arg1 = II->getArgOperand(1); 2672 unsigned VWidth = Arg0->getType()->getVectorNumElements(); 2673 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2674 II->setArgOperand(0, V); 2675 MadeChange = true; 2676 } 2677 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2678 II->setArgOperand(1, V); 2679 MadeChange = true; 2680 } 2681 if (MadeChange) 2682 return II; 2683 break; 2684 } 2685 case Intrinsic::x86_avx512_cmp_pd_128: 2686 case Intrinsic::x86_avx512_cmp_pd_256: 2687 case Intrinsic::x86_avx512_cmp_pd_512: 2688 case Intrinsic::x86_avx512_cmp_ps_128: 2689 case Intrinsic::x86_avx512_cmp_ps_256: 2690 case Intrinsic::x86_avx512_cmp_ps_512: { 2691 // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) 2692 Value *Arg0 = II->getArgOperand(0); 2693 Value *Arg1 = II->getArgOperand(1); 2694 bool Arg0IsZero = match(Arg0, m_PosZeroFP()); 2695 if (Arg0IsZero) 2696 std::swap(Arg0, Arg1); 2697 Value *A, *B; 2698 // This fold requires only the NINF(not +/- inf) since inf minus 2699 // inf is nan. 2700 // NSZ(No Signed Zeros) is not needed because zeros of any sign are 2701 // equal for both compares. 2702 // NNAN is not needed because nans compare the same for both compares. 2703 // The compare intrinsic uses the above assumptions and therefore 2704 // doesn't require additional flags. 
2705 if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) && 2706 match(Arg1, m_PosZeroFP()) && isa<Instruction>(Arg0) && 2707 cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) { 2708 if (Arg0IsZero) 2709 std::swap(A, B); 2710 II->setArgOperand(0, A); 2711 II->setArgOperand(1, B); 2712 return II; 2713 } 2714 break; 2715 } 2716 2717 case Intrinsic::x86_avx512_add_ps_512: 2718 case Intrinsic::x86_avx512_div_ps_512: 2719 case Intrinsic::x86_avx512_mul_ps_512: 2720 case Intrinsic::x86_avx512_sub_ps_512: 2721 case Intrinsic::x86_avx512_add_pd_512: 2722 case Intrinsic::x86_avx512_div_pd_512: 2723 case Intrinsic::x86_avx512_mul_pd_512: 2724 case Intrinsic::x86_avx512_sub_pd_512: 2725 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2726 // IR operations. 2727 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 2728 if (R->getValue() == 4) { 2729 Value *Arg0 = II->getArgOperand(0); 2730 Value *Arg1 = II->getArgOperand(1); 2731 2732 Value *V; 2733 switch (IID) { 2734 default: llvm_unreachable("Case stmts out of sync!"); 2735 case Intrinsic::x86_avx512_add_ps_512: 2736 case Intrinsic::x86_avx512_add_pd_512: 2737 V = Builder.CreateFAdd(Arg0, Arg1); 2738 break; 2739 case Intrinsic::x86_avx512_sub_ps_512: 2740 case Intrinsic::x86_avx512_sub_pd_512: 2741 V = Builder.CreateFSub(Arg0, Arg1); 2742 break; 2743 case Intrinsic::x86_avx512_mul_ps_512: 2744 case Intrinsic::x86_avx512_mul_pd_512: 2745 V = Builder.CreateFMul(Arg0, Arg1); 2746 break; 2747 case Intrinsic::x86_avx512_div_ps_512: 2748 case Intrinsic::x86_avx512_div_pd_512: 2749 V = Builder.CreateFDiv(Arg0, Arg1); 2750 break; 2751 } 2752 2753 return replaceInstUsesWith(*II, V); 2754 } 2755 } 2756 break; 2757 2758 case Intrinsic::x86_avx512_mask_add_ss_round: 2759 case Intrinsic::x86_avx512_mask_div_ss_round: 2760 case Intrinsic::x86_avx512_mask_mul_ss_round: 2761 case Intrinsic::x86_avx512_mask_sub_ss_round: 2762 case Intrinsic::x86_avx512_mask_add_sd_round: 2763 case Intrinsic::x86_avx512_mask_div_sd_round: 2764 case Intrinsic::x86_avx512_mask_mul_sd_round: 2765 case Intrinsic::x86_avx512_mask_sub_sd_round: 2766 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2767 // IR operations. 2768 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { 2769 if (R->getValue() == 4) { 2770 // Extract the element as scalars. 2771 Value *Arg0 = II->getArgOperand(0); 2772 Value *Arg1 = II->getArgOperand(1); 2773 Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0); 2774 Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0); 2775 2776 Value *V; 2777 switch (IID) { 2778 default: llvm_unreachable("Case stmts out of sync!"); 2779 case Intrinsic::x86_avx512_mask_add_ss_round: 2780 case Intrinsic::x86_avx512_mask_add_sd_round: 2781 V = Builder.CreateFAdd(LHS, RHS); 2782 break; 2783 case Intrinsic::x86_avx512_mask_sub_ss_round: 2784 case Intrinsic::x86_avx512_mask_sub_sd_round: 2785 V = Builder.CreateFSub(LHS, RHS); 2786 break; 2787 case Intrinsic::x86_avx512_mask_mul_ss_round: 2788 case Intrinsic::x86_avx512_mask_mul_sd_round: 2789 V = Builder.CreateFMul(LHS, RHS); 2790 break; 2791 case Intrinsic::x86_avx512_mask_div_ss_round: 2792 case Intrinsic::x86_avx512_mask_div_sd_round: 2793 V = Builder.CreateFDiv(LHS, RHS); 2794 break; 2795 } 2796 2797 // Handle the masking aspect of the intrinsic. 2798 Value *Mask = II->getArgOperand(3); 2799 auto *C = dyn_cast<ConstantInt>(Mask); 2800 // We don't need a select if we know the mask bit is a 1. 
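    // (Sketch of the masking semantics assumed here: the mask is an i8
    // bitmask and only bit 0 governs lane 0. If bit 0 is known set, the
    // computed scalar is used directly; otherwise lane 0 of the passthru
    // operand (arg 2) must be selected instead.)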
2801 if (!C || !C->getValue()[0]) { 2802 // Cast the mask to an i1 vector and then extract the lowest element. 2803 auto *MaskTy = VectorType::get(Builder.getInt1Ty(), 2804 cast<IntegerType>(Mask->getType())->getBitWidth()); 2805 Mask = Builder.CreateBitCast(Mask, MaskTy); 2806 Mask = Builder.CreateExtractElement(Mask, (uint64_t)0); 2807 // Extract the lowest element from the passthru operand. 2808 Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2), 2809 (uint64_t)0); 2810 V = Builder.CreateSelect(Mask, V, Passthru); 2811 } 2812 2813 // Insert the result back into the original argument 0. 2814 V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2815 2816 return replaceInstUsesWith(*II, V); 2817 } 2818 } 2819 break; 2820 2821 // Constant fold ashr( <A x Bi>, Ci ). 2822 // Constant fold lshr( <A x Bi>, Ci ). 2823 // Constant fold shl( <A x Bi>, Ci ). 2824 case Intrinsic::x86_sse2_psrai_d: 2825 case Intrinsic::x86_sse2_psrai_w: 2826 case Intrinsic::x86_avx2_psrai_d: 2827 case Intrinsic::x86_avx2_psrai_w: 2828 case Intrinsic::x86_avx512_psrai_q_128: 2829 case Intrinsic::x86_avx512_psrai_q_256: 2830 case Intrinsic::x86_avx512_psrai_d_512: 2831 case Intrinsic::x86_avx512_psrai_q_512: 2832 case Intrinsic::x86_avx512_psrai_w_512: 2833 case Intrinsic::x86_sse2_psrli_d: 2834 case Intrinsic::x86_sse2_psrli_q: 2835 case Intrinsic::x86_sse2_psrli_w: 2836 case Intrinsic::x86_avx2_psrli_d: 2837 case Intrinsic::x86_avx2_psrli_q: 2838 case Intrinsic::x86_avx2_psrli_w: 2839 case Intrinsic::x86_avx512_psrli_d_512: 2840 case Intrinsic::x86_avx512_psrli_q_512: 2841 case Intrinsic::x86_avx512_psrli_w_512: 2842 case Intrinsic::x86_sse2_pslli_d: 2843 case Intrinsic::x86_sse2_pslli_q: 2844 case Intrinsic::x86_sse2_pslli_w: 2845 case Intrinsic::x86_avx2_pslli_d: 2846 case Intrinsic::x86_avx2_pslli_q: 2847 case Intrinsic::x86_avx2_pslli_w: 2848 case Intrinsic::x86_avx512_pslli_d_512: 2849 case Intrinsic::x86_avx512_pslli_q_512: 2850 case Intrinsic::x86_avx512_pslli_w_512: 2851 if (Value *V = simplifyX86immShift(*II, Builder)) 2852 return replaceInstUsesWith(*II, V); 2853 break; 2854 2855 case Intrinsic::x86_sse2_psra_d: 2856 case Intrinsic::x86_sse2_psra_w: 2857 case Intrinsic::x86_avx2_psra_d: 2858 case Intrinsic::x86_avx2_psra_w: 2859 case Intrinsic::x86_avx512_psra_q_128: 2860 case Intrinsic::x86_avx512_psra_q_256: 2861 case Intrinsic::x86_avx512_psra_d_512: 2862 case Intrinsic::x86_avx512_psra_q_512: 2863 case Intrinsic::x86_avx512_psra_w_512: 2864 case Intrinsic::x86_sse2_psrl_d: 2865 case Intrinsic::x86_sse2_psrl_q: 2866 case Intrinsic::x86_sse2_psrl_w: 2867 case Intrinsic::x86_avx2_psrl_d: 2868 case Intrinsic::x86_avx2_psrl_q: 2869 case Intrinsic::x86_avx2_psrl_w: 2870 case Intrinsic::x86_avx512_psrl_d_512: 2871 case Intrinsic::x86_avx512_psrl_q_512: 2872 case Intrinsic::x86_avx512_psrl_w_512: 2873 case Intrinsic::x86_sse2_psll_d: 2874 case Intrinsic::x86_sse2_psll_q: 2875 case Intrinsic::x86_sse2_psll_w: 2876 case Intrinsic::x86_avx2_psll_d: 2877 case Intrinsic::x86_avx2_psll_q: 2878 case Intrinsic::x86_avx2_psll_w: 2879 case Intrinsic::x86_avx512_psll_d_512: 2880 case Intrinsic::x86_avx512_psll_q_512: 2881 case Intrinsic::x86_avx512_psll_w_512: { 2882 if (Value *V = simplifyX86immShift(*II, Builder)) 2883 return replaceInstUsesWith(*II, V); 2884 2885 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2886 // operand to compute the shift amount. 
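    // e.g. (sketch) for a 128-bit <4 x i32> count operand, only the low two
    // elements (the low 64 bits) are demanded, so the upper half can be
    // replaced with undef.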
2887 Value *Arg1 = II->getArgOperand(1); 2888 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2889 "Unexpected packed shift size"); 2890 unsigned VWidth = Arg1->getType()->getVectorNumElements(); 2891 2892 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 2893 II->setArgOperand(1, V); 2894 return II; 2895 } 2896 break; 2897 } 2898 2899 case Intrinsic::x86_avx2_psllv_d: 2900 case Intrinsic::x86_avx2_psllv_d_256: 2901 case Intrinsic::x86_avx2_psllv_q: 2902 case Intrinsic::x86_avx2_psllv_q_256: 2903 case Intrinsic::x86_avx512_psllv_d_512: 2904 case Intrinsic::x86_avx512_psllv_q_512: 2905 case Intrinsic::x86_avx512_psllv_w_128: 2906 case Intrinsic::x86_avx512_psllv_w_256: 2907 case Intrinsic::x86_avx512_psllv_w_512: 2908 case Intrinsic::x86_avx2_psrav_d: 2909 case Intrinsic::x86_avx2_psrav_d_256: 2910 case Intrinsic::x86_avx512_psrav_q_128: 2911 case Intrinsic::x86_avx512_psrav_q_256: 2912 case Intrinsic::x86_avx512_psrav_d_512: 2913 case Intrinsic::x86_avx512_psrav_q_512: 2914 case Intrinsic::x86_avx512_psrav_w_128: 2915 case Intrinsic::x86_avx512_psrav_w_256: 2916 case Intrinsic::x86_avx512_psrav_w_512: 2917 case Intrinsic::x86_avx2_psrlv_d: 2918 case Intrinsic::x86_avx2_psrlv_d_256: 2919 case Intrinsic::x86_avx2_psrlv_q: 2920 case Intrinsic::x86_avx2_psrlv_q_256: 2921 case Intrinsic::x86_avx512_psrlv_d_512: 2922 case Intrinsic::x86_avx512_psrlv_q_512: 2923 case Intrinsic::x86_avx512_psrlv_w_128: 2924 case Intrinsic::x86_avx512_psrlv_w_256: 2925 case Intrinsic::x86_avx512_psrlv_w_512: 2926 if (Value *V = simplifyX86varShift(*II, Builder)) 2927 return replaceInstUsesWith(*II, V); 2928 break; 2929 2930 case Intrinsic::x86_sse2_packssdw_128: 2931 case Intrinsic::x86_sse2_packsswb_128: 2932 case Intrinsic::x86_avx2_packssdw: 2933 case Intrinsic::x86_avx2_packsswb: 2934 case Intrinsic::x86_avx512_packssdw_512: 2935 case Intrinsic::x86_avx512_packsswb_512: 2936 if (Value *V = simplifyX86pack(*II, Builder, true)) 2937 return replaceInstUsesWith(*II, V); 2938 break; 2939 2940 case Intrinsic::x86_sse2_packuswb_128: 2941 case Intrinsic::x86_sse41_packusdw: 2942 case Intrinsic::x86_avx2_packusdw: 2943 case Intrinsic::x86_avx2_packuswb: 2944 case Intrinsic::x86_avx512_packusdw_512: 2945 case Intrinsic::x86_avx512_packuswb_512: 2946 if (Value *V = simplifyX86pack(*II, Builder, false)) 2947 return replaceInstUsesWith(*II, V); 2948 break; 2949 2950 case Intrinsic::x86_pclmulqdq: 2951 case Intrinsic::x86_pclmulqdq_256: 2952 case Intrinsic::x86_pclmulqdq_512: { 2953 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 2954 unsigned Imm = C->getZExtValue(); 2955 2956 bool MadeChange = false; 2957 Value *Arg0 = II->getArgOperand(0); 2958 Value *Arg1 = II->getArgOperand(1); 2959 unsigned VWidth = Arg0->getType()->getVectorNumElements(); 2960 2961 APInt UndefElts1(VWidth, 0); 2962 APInt DemandedElts1 = APInt::getSplat(VWidth, 2963 APInt(2, (Imm & 0x01) ? 2 : 1)); 2964 if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1, 2965 UndefElts1)) { 2966 II->setArgOperand(0, V); 2967 MadeChange = true; 2968 } 2969 2970 APInt UndefElts2(VWidth, 0); 2971 APInt DemandedElts2 = APInt::getSplat(VWidth, 2972 APInt(2, (Imm & 0x10) ? 2 : 1)); 2973 if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2, 2974 UndefElts2)) { 2975 II->setArgOperand(1, V); 2976 MadeChange = true; 2977 } 2978 2979 // If either input elements are undef, the result is zero. 
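      // (If a demanded half is undef it may be treated as zero, and a
      // carry-less multiply with a zero operand produces zero.)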
2980 if (DemandedElts1.isSubsetOf(UndefElts1) || 2981 DemandedElts2.isSubsetOf(UndefElts2)) 2982 return replaceInstUsesWith(*II, 2983 ConstantAggregateZero::get(II->getType())); 2984 2985 if (MadeChange) 2986 return II; 2987 } 2988 break; 2989 } 2990 2991 case Intrinsic::x86_sse41_insertps: 2992 if (Value *V = simplifyX86insertps(*II, Builder)) 2993 return replaceInstUsesWith(*II, V); 2994 break; 2995 2996 case Intrinsic::x86_sse4a_extrq: { 2997 Value *Op0 = II->getArgOperand(0); 2998 Value *Op1 = II->getArgOperand(1); 2999 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 3000 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 3001 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3002 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 3003 VWidth1 == 16 && "Unexpected operand sizes"); 3004 3005 // See if we're dealing with constant values. 3006 Constant *C1 = dyn_cast<Constant>(Op1); 3007 ConstantInt *CILength = 3008 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 3009 : nullptr; 3010 ConstantInt *CIIndex = 3011 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 3012 : nullptr; 3013 3014 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 3015 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 3016 return replaceInstUsesWith(*II, V); 3017 3018 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 3019 // operands and the lowest 16-bits of the second. 3020 bool MadeChange = false; 3021 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 3022 II->setArgOperand(0, V); 3023 MadeChange = true; 3024 } 3025 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 3026 II->setArgOperand(1, V); 3027 MadeChange = true; 3028 } 3029 if (MadeChange) 3030 return II; 3031 break; 3032 } 3033 3034 case Intrinsic::x86_sse4a_extrqi: { 3035 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining 3036 // bits of the lower 64-bits. The upper 64-bits are undefined. 3037 Value *Op0 = II->getArgOperand(0); 3038 unsigned VWidth = Op0->getType()->getVectorNumElements(); 3039 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 3040 "Unexpected operand size"); 3041 3042 // See if we're dealing with constant values. 3043 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); 3044 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3045 3046 // Attempt to simplify to a constant or shuffle vector. 3047 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 3048 return replaceInstUsesWith(*II, V); 3049 3050 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 3051 // operand. 3052 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 3053 II->setArgOperand(0, V); 3054 return II; 3055 } 3056 break; 3057 } 3058 3059 case Intrinsic::x86_sse4a_insertq: { 3060 Value *Op0 = II->getArgOperand(0); 3061 Value *Op1 = II->getArgOperand(1); 3062 unsigned VWidth = Op0->getType()->getVectorNumElements(); 3063 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3064 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 3065 Op1->getType()->getVectorNumElements() == 2 && 3066 "Unexpected operand size"); 3067 3068 // See if we're dealing with constant values. 3069 Constant *C1 = dyn_cast<Constant>(Op1); 3070 ConstantInt *CI11 = 3071 C1 ? 
dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 3072 : nullptr; 3073 3074 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 3075 if (CI11) { 3076 const APInt &V11 = CI11->getValue(); 3077 APInt Len = V11.zextOrTrunc(6); 3078 APInt Idx = V11.lshr(8).zextOrTrunc(6); 3079 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 3080 return replaceInstUsesWith(*II, V); 3081 } 3082 3083 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 3084 // operand. 3085 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 3086 II->setArgOperand(0, V); 3087 return II; 3088 } 3089 break; 3090 } 3091 3092 case Intrinsic::x86_sse4a_insertqi: { 3093 // INSERTQI: Extract lowest Length bits from lower half of second source and 3094 // insert over first source starting at Index bit. The upper 64-bits are 3095 // undefined. 3096 Value *Op0 = II->getArgOperand(0); 3097 Value *Op1 = II->getArgOperand(1); 3098 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 3099 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 3100 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3101 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 3102 VWidth1 == 2 && "Unexpected operand sizes"); 3103 3104 // See if we're dealing with constant values. 3105 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3106 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); 3107 3108 // Attempt to simplify to a constant or shuffle vector. 3109 if (CILength && CIIndex) { 3110 APInt Len = CILength->getValue().zextOrTrunc(6); 3111 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 3112 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 3113 return replaceInstUsesWith(*II, V); 3114 } 3115 3116 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 3117 // operands. 3118 bool MadeChange = false; 3119 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 3120 II->setArgOperand(0, V); 3121 MadeChange = true; 3122 } 3123 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 3124 II->setArgOperand(1, V); 3125 MadeChange = true; 3126 } 3127 if (MadeChange) 3128 return II; 3129 break; 3130 } 3131 3132 case Intrinsic::x86_sse41_pblendvb: 3133 case Intrinsic::x86_sse41_blendvps: 3134 case Intrinsic::x86_sse41_blendvpd: 3135 case Intrinsic::x86_avx_blendv_ps_256: 3136 case Intrinsic::x86_avx_blendv_pd_256: 3137 case Intrinsic::x86_avx2_pblendvb: { 3138 // fold (blend A, A, Mask) -> A 3139 Value *Op0 = II->getArgOperand(0); 3140 Value *Op1 = II->getArgOperand(1); 3141 Value *Mask = II->getArgOperand(2); 3142 if (Op0 == Op1) 3143 return replaceInstUsesWith(CI, Op0); 3144 3145 // Zero Mask - select 1st argument. 3146 if (isa<ConstantAggregateZero>(Mask)) 3147 return replaceInstUsesWith(CI, Op0); 3148 3149 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 3150 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 3151 Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); 3152 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 3153 } 3154 3155 // Convert to a vector select if we can bypass casts and find a boolean 3156 // vector condition value. 
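    // For example (illustrative, placeholder names):
    //   %m  = sext <4 x i1> %cond to <4 x i32>
    //   %mf = bitcast <4 x i32> %m to <4 x float>
    //   %r  = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a,
    //                                                   <4 x float> %b,
    //                                                   <4 x float> %mf)
    // becomes
    //   %r  = select <4 x i1> %cond, <4 x float> %b, <4 x float> %a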
3157 Value *BoolVec; 3158 Mask = peekThroughBitcast(Mask); 3159 if (match(Mask, m_SExt(m_Value(BoolVec))) && 3160 BoolVec->getType()->isVectorTy() && 3161 BoolVec->getType()->getScalarSizeInBits() == 1) { 3162 assert(Mask->getType()->getPrimitiveSizeInBits() == 3163 II->getType()->getPrimitiveSizeInBits() && 3164 "Not expecting mask and operands with different sizes"); 3165 3166 unsigned NumMaskElts = Mask->getType()->getVectorNumElements(); 3167 unsigned NumOperandElts = II->getType()->getVectorNumElements(); 3168 if (NumMaskElts == NumOperandElts) 3169 return SelectInst::Create(BoolVec, Op1, Op0); 3170 3171 // If the mask has less elements than the operands, each mask bit maps to 3172 // multiple elements of the operands. Bitcast back and forth. 3173 if (NumMaskElts < NumOperandElts) { 3174 Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType()); 3175 Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType()); 3176 Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0); 3177 return new BitCastInst(Sel, II->getType()); 3178 } 3179 } 3180 3181 break; 3182 } 3183 3184 case Intrinsic::x86_ssse3_pshuf_b_128: 3185 case Intrinsic::x86_avx2_pshuf_b: 3186 case Intrinsic::x86_avx512_pshuf_b_512: 3187 if (Value *V = simplifyX86pshufb(*II, Builder)) 3188 return replaceInstUsesWith(*II, V); 3189 break; 3190 3191 case Intrinsic::x86_avx_vpermilvar_ps: 3192 case Intrinsic::x86_avx_vpermilvar_ps_256: 3193 case Intrinsic::x86_avx512_vpermilvar_ps_512: 3194 case Intrinsic::x86_avx_vpermilvar_pd: 3195 case Intrinsic::x86_avx_vpermilvar_pd_256: 3196 case Intrinsic::x86_avx512_vpermilvar_pd_512: 3197 if (Value *V = simplifyX86vpermilvar(*II, Builder)) 3198 return replaceInstUsesWith(*II, V); 3199 break; 3200 3201 case Intrinsic::x86_avx2_permd: 3202 case Intrinsic::x86_avx2_permps: 3203 case Intrinsic::x86_avx512_permvar_df_256: 3204 case Intrinsic::x86_avx512_permvar_df_512: 3205 case Intrinsic::x86_avx512_permvar_di_256: 3206 case Intrinsic::x86_avx512_permvar_di_512: 3207 case Intrinsic::x86_avx512_permvar_hi_128: 3208 case Intrinsic::x86_avx512_permvar_hi_256: 3209 case Intrinsic::x86_avx512_permvar_hi_512: 3210 case Intrinsic::x86_avx512_permvar_qi_128: 3211 case Intrinsic::x86_avx512_permvar_qi_256: 3212 case Intrinsic::x86_avx512_permvar_qi_512: 3213 case Intrinsic::x86_avx512_permvar_sf_512: 3214 case Intrinsic::x86_avx512_permvar_si_512: 3215 if (Value *V = simplifyX86vpermv(*II, Builder)) 3216 return replaceInstUsesWith(*II, V); 3217 break; 3218 3219 case Intrinsic::x86_avx_maskload_ps: 3220 case Intrinsic::x86_avx_maskload_pd: 3221 case Intrinsic::x86_avx_maskload_ps_256: 3222 case Intrinsic::x86_avx_maskload_pd_256: 3223 case Intrinsic::x86_avx2_maskload_d: 3224 case Intrinsic::x86_avx2_maskload_q: 3225 case Intrinsic::x86_avx2_maskload_d_256: 3226 case Intrinsic::x86_avx2_maskload_q_256: 3227 if (Instruction *I = simplifyX86MaskedLoad(*II, *this)) 3228 return I; 3229 break; 3230 3231 case Intrinsic::x86_sse2_maskmov_dqu: 3232 case Intrinsic::x86_avx_maskstore_ps: 3233 case Intrinsic::x86_avx_maskstore_pd: 3234 case Intrinsic::x86_avx_maskstore_ps_256: 3235 case Intrinsic::x86_avx_maskstore_pd_256: 3236 case Intrinsic::x86_avx2_maskstore_d: 3237 case Intrinsic::x86_avx2_maskstore_q: 3238 case Intrinsic::x86_avx2_maskstore_d_256: 3239 case Intrinsic::x86_avx2_maskstore_q_256: 3240 if (simplifyX86MaskedStore(*II, *this)) 3241 return nullptr; 3242 break; 3243 3244 case Intrinsic::x86_addcarry_32: 3245 case Intrinsic::x86_addcarry_64: 3246 if (Value *V = simplifyX86addcarry(*II, 
Builder)) 3247 return replaceInstUsesWith(*II, V); 3248 break; 3249 3250 case Intrinsic::ppc_altivec_vperm: 3251 // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. 3252 // Note that ppc_altivec_vperm has a big-endian bias, so when creating 3253 // a vectorshuffle for little endian, we must undo the transformation 3254 // performed on vec_perm in altivec.h. That is, we must complement 3255 // the permutation mask with respect to 31 and reverse the order of 3256 // V1 and V2. 3257 if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) { 3258 assert(Mask->getType()->getVectorNumElements() == 16 && 3259 "Bad type for intrinsic!"); 3260 3261 // Check that all of the elements are integer constants or undefs. 3262 bool AllEltsOk = true; 3263 for (unsigned i = 0; i != 16; ++i) { 3264 Constant *Elt = Mask->getAggregateElement(i); 3265 if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { 3266 AllEltsOk = false; 3267 break; 3268 } 3269 } 3270 3271 if (AllEltsOk) { 3272 // Cast the input vectors to byte vectors. 3273 Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0), 3274 Mask->getType()); 3275 Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1), 3276 Mask->getType()); 3277 Value *Result = UndefValue::get(Op0->getType()); 3278 3279 // Only extract each element once. 3280 Value *ExtractedElts[32]; 3281 memset(ExtractedElts, 0, sizeof(ExtractedElts)); 3282 3283 for (unsigned i = 0; i != 16; ++i) { 3284 if (isa<UndefValue>(Mask->getAggregateElement(i))) 3285 continue; 3286 unsigned Idx = 3287 cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); 3288 Idx &= 31; // Match the hardware behavior. 3289 if (DL.isLittleEndian()) 3290 Idx = 31 - Idx; 3291 3292 if (!ExtractedElts[Idx]) { 3293 Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; 3294 Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; 3295 ExtractedElts[Idx] = 3296 Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse, 3297 Builder.getInt32(Idx&15)); 3298 } 3299 3300 // Insert this value into the result vector. 
3301 Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx], 3302 Builder.getInt32(i)); 3303 } 3304 return CastInst::Create(Instruction::BitCast, Result, CI.getType()); 3305 } 3306 } 3307 break; 3308 3309 case Intrinsic::arm_neon_vld1: { 3310 unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), 3311 DL, II, &AC, &DT); 3312 if (Value *V = simplifyNeonVld1(*II, MemAlign, Builder)) 3313 return replaceInstUsesWith(*II, V); 3314 break; 3315 } 3316 3317 case Intrinsic::arm_neon_vld2: 3318 case Intrinsic::arm_neon_vld3: 3319 case Intrinsic::arm_neon_vld4: 3320 case Intrinsic::arm_neon_vld2lane: 3321 case Intrinsic::arm_neon_vld3lane: 3322 case Intrinsic::arm_neon_vld4lane: 3323 case Intrinsic::arm_neon_vst1: 3324 case Intrinsic::arm_neon_vst2: 3325 case Intrinsic::arm_neon_vst3: 3326 case Intrinsic::arm_neon_vst4: 3327 case Intrinsic::arm_neon_vst2lane: 3328 case Intrinsic::arm_neon_vst3lane: 3329 case Intrinsic::arm_neon_vst4lane: { 3330 unsigned MemAlign = 3331 getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); 3332 unsigned AlignArg = II->getNumArgOperands() - 1; 3333 ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); 3334 if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) { 3335 II->setArgOperand(AlignArg, 3336 ConstantInt::get(Type::getInt32Ty(II->getContext()), 3337 MemAlign, false)); 3338 return II; 3339 } 3340 break; 3341 } 3342 3343 case Intrinsic::arm_neon_vtbl1: 3344 case Intrinsic::aarch64_neon_tbl1: 3345 if (Value *V = simplifyNeonTbl1(*II, Builder)) 3346 return replaceInstUsesWith(*II, V); 3347 break; 3348 3349 case Intrinsic::arm_neon_vmulls: 3350 case Intrinsic::arm_neon_vmullu: 3351 case Intrinsic::aarch64_neon_smull: 3352 case Intrinsic::aarch64_neon_umull: { 3353 Value *Arg0 = II->getArgOperand(0); 3354 Value *Arg1 = II->getArgOperand(1); 3355 3356 // Handle mul by zero first: 3357 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { 3358 return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType())); 3359 } 3360 3361 // Check for constant LHS & RHS - in this case we just simplify. 3362 bool Zext = (IID == Intrinsic::arm_neon_vmullu || 3363 IID == Intrinsic::aarch64_neon_umull); 3364 VectorType *NewVT = cast<VectorType>(II->getType()); 3365 if (Constant *CV0 = dyn_cast<Constant>(Arg0)) { 3366 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) { 3367 CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext); 3368 CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext); 3369 3370 return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1)); 3371 } 3372 3373 // Couldn't simplify - canonicalize constant to the RHS. 
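    // For example (illustrative), a vmull whose first operand is a splat of 1
    // is rewritten here with the constant on the RHS so that the
    // splat-of-one check below can turn the widening multiply into a plain
    // sext/zext of the other operand.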
3374 std::swap(Arg0, Arg1); 3375 } 3376 3377 // Handle mul by one: 3378 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) 3379 if (ConstantInt *Splat = 3380 dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) 3381 if (Splat->isOne()) 3382 return CastInst::CreateIntegerCast(Arg0, II->getType(), 3383 /*isSigned=*/!Zext); 3384 3385 break; 3386 } 3387 case Intrinsic::arm_neon_aesd: 3388 case Intrinsic::arm_neon_aese: 3389 case Intrinsic::aarch64_crypto_aesd: 3390 case Intrinsic::aarch64_crypto_aese: { 3391 Value *DataArg = II->getArgOperand(0); 3392 Value *KeyArg = II->getArgOperand(1); 3393 3394 // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR 3395 Value *Data, *Key; 3396 if (match(KeyArg, m_ZeroInt()) && 3397 match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) { 3398 II->setArgOperand(0, Data); 3399 II->setArgOperand(1, Key); 3400 return II; 3401 } 3402 break; 3403 } 3404 case Intrinsic::arm_mve_pred_i2v: { 3405 Value *Arg = II->getArgOperand(0); 3406 Value *ArgArg; 3407 if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg))) && 3408 II->getType() == ArgArg->getType()) 3409 return replaceInstUsesWith(*II, ArgArg); 3410 Constant *XorMask; 3411 if (match(Arg, 3412 m_Xor(m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg)), 3413 m_Constant(XorMask))) && 3414 II->getType() == ArgArg->getType()) { 3415 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { 3416 if (CI->getValue().trunc(16).isAllOnesValue()) { 3417 auto TrueVector = Builder.CreateVectorSplat( 3418 II->getType()->getVectorNumElements(), Builder.getTrue()); 3419 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); 3420 } 3421 } 3422 } 3423 KnownBits ScalarKnown(32); 3424 if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16), 3425 ScalarKnown, 0)) 3426 return II; 3427 break; 3428 } 3429 case Intrinsic::arm_mve_pred_v2i: { 3430 Value *Arg = II->getArgOperand(0); 3431 Value *ArgArg; 3432 if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(m_Value(ArgArg)))) 3433 return replaceInstUsesWith(*II, ArgArg); 3434 if (!II->getMetadata(LLVMContext::MD_range)) { 3435 Type *IntTy32 = Type::getInt32Ty(II->getContext()); 3436 Metadata *M[] = { 3437 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), 3438 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF)) 3439 }; 3440 II->setMetadata(LLVMContext::MD_range, MDNode::get(II->getContext(), M)); 3441 return II; 3442 } 3443 break; 3444 } 3445 case Intrinsic::arm_mve_vadc: 3446 case Intrinsic::arm_mve_vadc_predicated: { 3447 unsigned CarryOp = 3448 (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; 3449 assert(II->getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && 3450 "Bad type for intrinsic!"); 3451 3452 KnownBits CarryKnown(32); 3453 if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29), 3454 CarryKnown)) 3455 return II; 3456 break; 3457 } 3458 case Intrinsic::amdgcn_rcp: { 3459 Value *Src = II->getArgOperand(0); 3460 3461 // TODO: Move to ConstantFolding/InstSimplify? 3462 if (isa<UndefValue>(Src)) 3463 return replaceInstUsesWith(CI, Src); 3464 3465 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3466 const APFloat &ArgVal = C->getValueAPF(); 3467 APFloat Val(ArgVal.getSemantics(), 1); 3468 APFloat::opStatus Status = Val.divide(ArgVal, 3469 APFloat::rmNearestTiesToEven); 3470 // Only do this if it was exact and therefore not dependent on the 3471 // rounding mode. 
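    // For example, llvm.amdgcn.rcp(float 2.0) folds to 0.5 because the
    // division is exact, while rcp of 3.0 would be inexact and is left to be
    // computed at run time (illustrative values).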
3472 if (Status == APFloat::opOK) 3473 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); 3474 } 3475 3476 break; 3477 } 3478 case Intrinsic::amdgcn_rsq: { 3479 Value *Src = II->getArgOperand(0); 3480 3481 // TODO: Move to ConstantFolding/InstSimplify? 3482 if (isa<UndefValue>(Src)) 3483 return replaceInstUsesWith(CI, Src); 3484 break; 3485 } 3486 case Intrinsic::amdgcn_frexp_mant: 3487 case Intrinsic::amdgcn_frexp_exp: { 3488 Value *Src = II->getArgOperand(0); 3489 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3490 int Exp; 3491 APFloat Significand = frexp(C->getValueAPF(), Exp, 3492 APFloat::rmNearestTiesToEven); 3493 3494 if (IID == Intrinsic::amdgcn_frexp_mant) { 3495 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), 3496 Significand)); 3497 } 3498 3499 // Match instruction special case behavior. 3500 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) 3501 Exp = 0; 3502 3503 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp)); 3504 } 3505 3506 if (isa<UndefValue>(Src)) 3507 return replaceInstUsesWith(CI, UndefValue::get(II->getType())); 3508 3509 break; 3510 } 3511 case Intrinsic::amdgcn_class: { 3512 enum { 3513 S_NAN = 1 << 0, // Signaling NaN 3514 Q_NAN = 1 << 1, // Quiet NaN 3515 N_INFINITY = 1 << 2, // Negative infinity 3516 N_NORMAL = 1 << 3, // Negative normal 3517 N_SUBNORMAL = 1 << 4, // Negative subnormal 3518 N_ZERO = 1 << 5, // Negative zero 3519 P_ZERO = 1 << 6, // Positive zero 3520 P_SUBNORMAL = 1 << 7, // Positive subnormal 3521 P_NORMAL = 1 << 8, // Positive normal 3522 P_INFINITY = 1 << 9 // Positive infinity 3523 }; 3524 3525 const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | 3526 N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY; 3527 3528 Value *Src0 = II->getArgOperand(0); 3529 Value *Src1 = II->getArgOperand(1); 3530 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1); 3531 if (!CMask) { 3532 if (isa<UndefValue>(Src0)) 3533 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3534 3535 if (isa<UndefValue>(Src1)) 3536 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3537 break; 3538 } 3539 3540 uint32_t Mask = CMask->getZExtValue(); 3541 3542 // If all tests are made, it doesn't matter what the value is. 3543 if ((Mask & FullMask) == FullMask) 3544 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true)); 3545 3546 if ((Mask & FullMask) == 0) 3547 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3548 3549 if (Mask == (S_NAN | Q_NAN)) { 3550 // Equivalent of isnan. Replace with standard fcmp. 3551 Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0); 3552 FCmp->takeName(II); 3553 return replaceInstUsesWith(*II, FCmp); 3554 } 3555 3556 if (Mask == (N_ZERO | P_ZERO)) { 3557 // Equivalent of == 0. 
3558 Value *FCmp = Builder.CreateFCmpOEQ( 3559 Src0, ConstantFP::get(Src0->getType(), 0.0)); 3560 3561 FCmp->takeName(II); 3562 return replaceInstUsesWith(*II, FCmp); 3563 } 3564 3565 // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other 3566 if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) { 3567 II->setArgOperand(1, ConstantInt::get(Src1->getType(), 3568 Mask & ~(S_NAN | Q_NAN))); 3569 return II; 3570 } 3571 3572 const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0); 3573 if (!CVal) { 3574 if (isa<UndefValue>(Src0)) 3575 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3576 3577 // Clamp mask to used bits 3578 if ((Mask & FullMask) != Mask) { 3579 CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(), 3580 { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) } 3581 ); 3582 3583 NewCall->takeName(II); 3584 return replaceInstUsesWith(*II, NewCall); 3585 } 3586 3587 break; 3588 } 3589 3590 const APFloat &Val = CVal->getValueAPF(); 3591 3592 bool Result = 3593 ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || 3594 ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || 3595 ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || 3596 ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || 3597 ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || 3598 ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || 3599 ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || 3600 ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || 3601 ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || 3602 ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); 3603 3604 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); 3605 } 3606 case Intrinsic::amdgcn_cvt_pkrtz: { 3607 Value *Src0 = II->getArgOperand(0); 3608 Value *Src1 = II->getArgOperand(1); 3609 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3610 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3611 const fltSemantics &HalfSem 3612 = II->getType()->getScalarType()->getFltSemantics(); 3613 bool LosesInfo; 3614 APFloat Val0 = C0->getValueAPF(); 3615 APFloat Val1 = C1->getValueAPF(); 3616 Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3617 Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3618 3619 Constant *Folded = ConstantVector::get({ 3620 ConstantFP::get(II->getContext(), Val0), 3621 ConstantFP::get(II->getContext(), Val1) }); 3622 return replaceInstUsesWith(*II, Folded); 3623 } 3624 } 3625 3626 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3627 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3628 3629 break; 3630 } 3631 case Intrinsic::amdgcn_cvt_pknorm_i16: 3632 case Intrinsic::amdgcn_cvt_pknorm_u16: 3633 case Intrinsic::amdgcn_cvt_pk_i16: 3634 case Intrinsic::amdgcn_cvt_pk_u16: { 3635 Value *Src0 = II->getArgOperand(0); 3636 Value *Src1 = II->getArgOperand(1); 3637 3638 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3639 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3640 3641 break; 3642 } 3643 case Intrinsic::amdgcn_ubfe: 3644 case Intrinsic::amdgcn_sbfe: { 3645 // Decompose simple cases into standard shifts. 
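    // For example (illustrative), with constant offset/width operands
    //   %r = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 16)
    // extracts bits [8, 24) and becomes
    //   %s = shl i32 %x, 8
    //   %r = lshr i32 %s, 16
    // (sbfe uses ashr for the final shift).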
3646 Value *Src = II->getArgOperand(0); 3647 if (isa<UndefValue>(Src)) 3648 return replaceInstUsesWith(*II, Src); 3649 3650 unsigned Width; 3651 Type *Ty = II->getType(); 3652 unsigned IntSize = Ty->getIntegerBitWidth(); 3653 3654 ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3655 if (CWidth) { 3656 Width = CWidth->getZExtValue(); 3657 if ((Width & (IntSize - 1)) == 0) 3658 return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty)); 3659 3660 if (Width >= IntSize) { 3661 // Hardware ignores high bits, so remove those. 3662 II->setArgOperand(2, ConstantInt::get(CWidth->getType(), 3663 Width & (IntSize - 1))); 3664 return II; 3665 } 3666 } 3667 3668 unsigned Offset; 3669 ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1)); 3670 if (COffset) { 3671 Offset = COffset->getZExtValue(); 3672 if (Offset >= IntSize) { 3673 II->setArgOperand(1, ConstantInt::get(COffset->getType(), 3674 Offset & (IntSize - 1))); 3675 return II; 3676 } 3677 } 3678 3679 bool Signed = IID == Intrinsic::amdgcn_sbfe; 3680 3681 if (!CWidth || !COffset) 3682 break; 3683 3684 // The case of Width == 0 is handled above, which makes this transformation 3685 // safe. If Width == 0, then the ashr and lshr instructions become poison 3686 // values since the shift amount would be equal to the bit size. 3687 assert(Width != 0); 3688 3689 // TODO: This allows folding to undef when the hardware has specific 3690 // behavior? 3691 if (Offset + Width < IntSize) { 3692 Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width); 3693 Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width) 3694 : Builder.CreateLShr(Shl, IntSize - Width); 3695 RightShift->takeName(II); 3696 return replaceInstUsesWith(*II, RightShift); 3697 } 3698 3699 Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset) 3700 : Builder.CreateLShr(Src, Offset); 3701 3702 RightShift->takeName(II); 3703 return replaceInstUsesWith(*II, RightShift); 3704 } 3705 case Intrinsic::amdgcn_exp: 3706 case Intrinsic::amdgcn_exp_compr: { 3707 ConstantInt *En = cast<ConstantInt>(II->getArgOperand(1)); 3708 unsigned EnBits = En->getZExtValue(); 3709 if (EnBits == 0xf) 3710 break; // All inputs enabled. 3711 3712 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr; 3713 bool Changed = false; 3714 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) { 3715 if ((!IsCompr && (EnBits & (1 << I)) == 0) || 3716 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) { 3717 Value *Src = II->getArgOperand(I + 2); 3718 if (!isa<UndefValue>(Src)) { 3719 II->setArgOperand(I + 2, UndefValue::get(Src->getType())); 3720 Changed = true; 3721 } 3722 } 3723 } 3724 3725 if (Changed) 3726 return II; 3727 3728 break; 3729 } 3730 case Intrinsic::amdgcn_fmed3: { 3731 // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled 3732 // for the shader. 3733 3734 Value *Src0 = II->getArgOperand(0); 3735 Value *Src1 = II->getArgOperand(1); 3736 Value *Src2 = II->getArgOperand(2); 3737 3738 // Checking for NaN before canonicalization provides better fidelity when 3739 // mapping other operations onto fmed3 since the order of operands is 3740 // unchanged.
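    // For example (illustrative), fmed3(NaN, %a, %b) becomes minnum(%a, %b)
    // and fmed3(%a, %b, NaN) becomes maxnum(%a, %b); doing this before the
    // constant canonicalization keeps %a and %b in their original positions.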
3741 CallInst *NewCall = nullptr; 3742 if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) { 3743 NewCall = Builder.CreateMinNum(Src1, Src2); 3744 } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) { 3745 NewCall = Builder.CreateMinNum(Src0, Src2); 3746 } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) { 3747 NewCall = Builder.CreateMaxNum(Src0, Src1); 3748 } 3749 3750 if (NewCall) { 3751 NewCall->copyFastMathFlags(II); 3752 NewCall->takeName(II); 3753 return replaceInstUsesWith(*II, NewCall); 3754 } 3755 3756 bool Swap = false; 3757 // Canonicalize constants to RHS operands. 3758 // 3759 // fmed3(c0, x, c1) -> fmed3(x, c0, c1) 3760 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3761 std::swap(Src0, Src1); 3762 Swap = true; 3763 } 3764 3765 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { 3766 std::swap(Src1, Src2); 3767 Swap = true; 3768 } 3769 3770 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3771 std::swap(Src0, Src1); 3772 Swap = true; 3773 } 3774 3775 if (Swap) { 3776 II->setArgOperand(0, Src0); 3777 II->setArgOperand(1, Src1); 3778 II->setArgOperand(2, Src2); 3779 return II; 3780 } 3781 3782 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3783 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3784 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) { 3785 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), 3786 C2->getValueAPF()); 3787 return replaceInstUsesWith(*II, 3788 ConstantFP::get(Builder.getContext(), Result)); 3789 } 3790 } 3791 } 3792 3793 break; 3794 } 3795 case Intrinsic::amdgcn_icmp: 3796 case Intrinsic::amdgcn_fcmp: { 3797 const ConstantInt *CC = cast<ConstantInt>(II->getArgOperand(2)); 3798 // Guard against invalid arguments. 3799 int64_t CCVal = CC->getZExtValue(); 3800 bool IsInteger = IID == Intrinsic::amdgcn_icmp; 3801 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || 3802 CCVal > CmpInst::LAST_ICMP_PREDICATE)) || 3803 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || 3804 CCVal > CmpInst::LAST_FCMP_PREDICATE))) 3805 break; 3806 3807 Value *Src0 = II->getArgOperand(0); 3808 Value *Src1 = II->getArgOperand(1); 3809 3810 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) { 3811 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) { 3812 Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); 3813 if (CCmp->isNullValue()) { 3814 return replaceInstUsesWith( 3815 *II, ConstantExpr::getSExt(CCmp, II->getType())); 3816 } 3817 3818 // The result of V_ICMP/V_FCMP assembly instructions (which this 3819 // intrinsic exposes) is one bit per thread, masked with the EXEC 3820 // register (which contains the bitmask of live threads). So a 3821 // comparison that always returns true is the same as a read of the 3822 // EXEC register. 3823 Function *NewF = Intrinsic::getDeclaration( 3824 II->getModule(), Intrinsic::read_register, II->getType()); 3825 Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")}; 3826 MDNode *MD = MDNode::get(II->getContext(), MDArgs); 3827 Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; 3828 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3829 NewCall->addAttribute(AttributeList::FunctionIndex, 3830 Attribute::Convergent); 3831 NewCall->takeName(II); 3832 return replaceInstUsesWith(*II, NewCall); 3833 } 3834 3835 // Canonicalize constants to RHS. 
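    // For example (illustrative), an icmp intrinsic encoding "7 slt %x" is
    // rewritten below as "%x sgt 7": the operands are swapped and the
    // predicate is replaced by its swapped form.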
3836 CmpInst::Predicate SwapPred 3837 = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal)); 3838 II->setArgOperand(0, Src1); 3839 II->setArgOperand(1, Src0); 3840 II->setArgOperand(2, ConstantInt::get(CC->getType(), 3841 static_cast<int>(SwapPred))); 3842 return II; 3843 } 3844 3845 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) 3846 break; 3847 3848 // Canonicalize compare eq with true value to compare != 0 3849 // llvm.amdgcn.icmp(zext (i1 x), 1, eq) 3850 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) 3851 // llvm.amdgcn.icmp(sext (i1 x), -1, eq) 3852 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) 3853 Value *ExtSrc; 3854 if (CCVal == CmpInst::ICMP_EQ && 3855 ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) || 3856 (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) && 3857 ExtSrc->getType()->isIntegerTy(1)) { 3858 II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType())); 3859 II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); 3860 return II; 3861 } 3862 3863 CmpInst::Predicate SrcPred; 3864 Value *SrcLHS; 3865 Value *SrcRHS; 3866 3867 // Fold compare eq/ne with 0 from a compare result as the predicate to the 3868 // intrinsic. The typical use is a wave vote function in the library, which 3869 // will be fed from a user code condition compared with 0. Fold in the 3870 // redundant compare. 3871 3872 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) 3873 // -> llvm.amdgcn.[if]cmp(a, b, pred) 3874 // 3875 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) 3876 // -> llvm.amdgcn.[if]cmp(a, b, inv pred) 3877 if (match(Src1, m_Zero()) && 3878 match(Src0, 3879 m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) { 3880 if (CCVal == CmpInst::ICMP_EQ) 3881 SrcPred = CmpInst::getInversePredicate(SrcPred); 3882 3883 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? 3884 Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; 3885 3886 Type *Ty = SrcLHS->getType(); 3887 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) { 3888 // Promote to next legal integer type. 3889 unsigned Width = CmpType->getBitWidth(); 3890 unsigned NewWidth = Width; 3891 3892 // Don't do anything for i1 comparisons. 3893 if (Width == 1) 3894 break; 3895 3896 if (Width <= 16) 3897 NewWidth = 16; 3898 else if (Width <= 32) 3899 NewWidth = 32; 3900 else if (Width <= 64) 3901 NewWidth = 64; 3902 else if (Width > 64) 3903 break; // Can't handle this. 3904 3905 if (Width != NewWidth) { 3906 IntegerType *CmpTy = Builder.getIntNTy(NewWidth); 3907 if (CmpInst::isSigned(SrcPred)) { 3908 SrcLHS = Builder.CreateSExt(SrcLHS, CmpTy); 3909 SrcRHS = Builder.CreateSExt(SrcRHS, CmpTy); 3910 } else { 3911 SrcLHS = Builder.CreateZExt(SrcLHS, CmpTy); 3912 SrcRHS = Builder.CreateZExt(SrcRHS, CmpTy); 3913 } 3914 } 3915 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) 3916 break; 3917 3918 Function *NewF = 3919 Intrinsic::getDeclaration(II->getModule(), NewIID, 3920 { II->getType(), 3921 SrcLHS->getType() }); 3922 Value *Args[] = { SrcLHS, SrcRHS, 3923 ConstantInt::get(CC->getType(), SrcPred) }; 3924 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3925 NewCall->takeName(II); 3926 return replaceInstUsesWith(*II, NewCall); 3927 } 3928 3929 break; 3930 } 3931 case Intrinsic::amdgcn_wqm_vote: { 3932 // wqm_vote is identity when the argument is constant. 
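    // For example, llvm.amdgcn.wqm.vote(i1 true) folds to true and
    // llvm.amdgcn.wqm.vote(i1 false) folds to false (illustrative).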
3933 if (!isa<Constant>(II->getArgOperand(0))) 3934 break; 3935 3936 return replaceInstUsesWith(*II, II->getArgOperand(0)); 3937 } 3938 case Intrinsic::amdgcn_kill: { 3939 const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0)); 3940 if (!C || !C->getZExtValue()) 3941 break; 3942 3943 // amdgcn.kill(i1 1) is a no-op 3944 return eraseInstFromFunction(CI); 3945 } 3946 case Intrinsic::amdgcn_update_dpp: { 3947 Value *Old = II->getArgOperand(0); 3948 3949 auto BC = cast<ConstantInt>(II->getArgOperand(5)); 3950 auto RM = cast<ConstantInt>(II->getArgOperand(3)); 3951 auto BM = cast<ConstantInt>(II->getArgOperand(4)); 3952 if (BC->isZeroValue() || 3953 RM->getZExtValue() != 0xF || 3954 BM->getZExtValue() != 0xF || 3955 isa<UndefValue>(Old)) 3956 break; 3957 3958 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value. 3959 II->setOperand(0, UndefValue::get(Old->getType())); 3960 return II; 3961 } 3962 case Intrinsic::amdgcn_readfirstlane: 3963 case Intrinsic::amdgcn_readlane: { 3964 // A constant value is trivially uniform. 3965 if (Constant *C = dyn_cast<Constant>(II->getArgOperand(0))) 3966 return replaceInstUsesWith(*II, C); 3967 3968 // The rest of these may not be safe if the exec may not be the same between 3969 // the def and use. 3970 Value *Src = II->getArgOperand(0); 3971 Instruction *SrcInst = dyn_cast<Instruction>(Src); 3972 if (SrcInst && SrcInst->getParent() != II->getParent()) 3973 break; 3974 3975 // readfirstlane (readfirstlane x) -> readfirstlane x 3976 // readlane (readfirstlane x), y -> readfirstlane x 3977 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) 3978 return replaceInstUsesWith(*II, Src); 3979 3980 if (IID == Intrinsic::amdgcn_readfirstlane) { 3981 // readfirstlane (readlane x, y) -> readlane x, y 3982 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>())) 3983 return replaceInstUsesWith(*II, Src); 3984 } else { 3985 // readlane (readlane x, y), y -> readlane x, y 3986 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>( 3987 m_Value(), m_Specific(II->getArgOperand(1))))) 3988 return replaceInstUsesWith(*II, Src); 3989 } 3990 3991 break; 3992 } 3993 case Intrinsic::stackrestore: { 3994 // If the save is right next to the restore, remove the restore. This can 3995 // happen when variable allocas are DCE'd. 3996 if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { 3997 if (SS->getIntrinsicID() == Intrinsic::stacksave) { 3998 // Skip over debug info. 3999 if (SS->getNextNonDebugInstruction() == II) { 4000 return eraseInstFromFunction(CI); 4001 } 4002 } 4003 } 4004 4005 // Scan down this block to see if there is another stack restore in the 4006 // same block without an intervening call/alloca. 4007 BasicBlock::iterator BI(II); 4008 Instruction *TI = II->getParent()->getTerminator(); 4009 bool CannotRemove = false; 4010 for (++BI; &*BI != TI; ++BI) { 4011 if (isa<AllocaInst>(BI)) { 4012 CannotRemove = true; 4013 break; 4014 } 4015 if (CallInst *BCI = dyn_cast<CallInst>(BI)) { 4016 if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) { 4017 // If there is a stackrestore below this one, remove this one. 4018 if (II2->getIntrinsicID() == Intrinsic::stackrestore) 4019 return eraseInstFromFunction(CI); 4020 4021 // Bail if we cross over an intrinsic with side effects, such as 4022 // llvm.stacksave, or llvm.read_register. 4023 if (II2->mayHaveSideEffects()) { 4024 CannotRemove = true; 4025 break; 4026 } 4027 } else { 4028 // If we found a non-intrinsic call, we can't remove the stack 4029 // restore. 
4030 CannotRemove = true; 4031 break; 4032 } 4033 } 4034 } 4035 4036 // If the stack restore is in a return, resume, or unwind block and if there 4037 // are no allocas or calls between the restore and the return, nuke the 4038 // restore. 4039 if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI))) 4040 return eraseInstFromFunction(CI); 4041 break; 4042 } 4043 case Intrinsic::lifetime_start: 4044 // Asan needs to poison memory to detect invalid access which is possible 4045 // even for empty lifetime range. 4046 if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || 4047 II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) || 4048 II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) 4049 break; 4050 4051 if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start, 4052 Intrinsic::lifetime_end, *this)) 4053 return nullptr; 4054 break; 4055 case Intrinsic::assume: { 4056 Value *IIOperand = II->getArgOperand(0); 4057 // Remove an assume if it is followed by an identical assume. 4058 // TODO: Do we need this? Unless there are conflicting assumptions, the 4059 // computeKnownBits(IIOperand) below here eliminates redundant assumes. 4060 Instruction *Next = II->getNextNonDebugInstruction(); 4061 if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand)))) 4062 return eraseInstFromFunction(CI); 4063 4064 // Canonicalize assume(a && b) -> assume(a); assume(b); 4065 // Note: New assumption intrinsics created here are registered by 4066 // the InstCombineIRInserter object. 4067 FunctionType *AssumeIntrinsicTy = II->getFunctionType(); 4068 Value *AssumeIntrinsic = II->getCalledValue(); 4069 Value *A, *B; 4070 if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) { 4071 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, II->getName()); 4072 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName()); 4073 return eraseInstFromFunction(*II); 4074 } 4075 // assume(!(a || b)) -> assume(!a); assume(!b); 4076 if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) { 4077 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, 4078 Builder.CreateNot(A), II->getName()); 4079 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, 4080 Builder.CreateNot(B), II->getName()); 4081 return eraseInstFromFunction(*II); 4082 } 4083 4084 // assume( (load addr) != null ) -> add 'nonnull' metadata to load 4085 // (if assume is valid at the load) 4086 CmpInst::Predicate Pred; 4087 Instruction *LHS; 4088 if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) && 4089 Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load && 4090 LHS->getType()->isPointerTy() && 4091 isValidAssumeForContext(II, LHS, &DT)) { 4092 MDNode *MD = MDNode::get(II->getContext(), None); 4093 LHS->setMetadata(LLVMContext::MD_nonnull, MD); 4094 return eraseInstFromFunction(*II); 4095 4096 // TODO: apply nonnull return attributes to calls and invokes 4097 // TODO: apply range metadata for range check patterns? 4098 } 4099 4100 // If there is a dominating assume with the same condition as this one, 4101 // then this one is redundant, and should be removed. 4102 KnownBits Known(1); 4103 computeKnownBits(IIOperand, Known, 0, II); 4104 if (Known.isAllOnes()) 4105 return eraseInstFromFunction(*II); 4106 4107 // Update the cache of affected values for this assumption (we might be 4108 // here because we just simplified the condition). 
4109 AC.updateAffectedValues(II); 4110 break; 4111 } 4112 case Intrinsic::experimental_gc_relocate: { 4113 auto &GCR = *cast<GCRelocateInst>(II); 4114 4115 // If we have two copies of the same pointer in the statepoint argument 4116 // list, canonicalize to one. This may let us common gc.relocates. 4117 if (GCR.getBasePtr() == GCR.getDerivedPtr() && 4118 GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) { 4119 auto *OpIntTy = GCR.getOperand(2)->getType(); 4120 II->setOperand(2, ConstantInt::get(OpIntTy, GCR.getBasePtrIndex())); 4121 return II; 4122 } 4123 4124 // Translate facts known about a pointer before relocating into 4125 // facts about the relocate value, while being careful to 4126 // preserve relocation semantics. 4127 Value *DerivedPtr = GCR.getDerivedPtr(); 4128 4129 // Remove the relocation if unused; note that this check is required 4130 // to prevent the cases below from looping forever. 4131 if (II->use_empty()) 4132 return eraseInstFromFunction(*II); 4133 4134 // Undef is undef, even after relocation. 4135 // TODO: provide a hook for this in GCStrategy. This is clearly legal for 4136 // most practical collectors, but there was discussion in the review thread 4137 // about whether it was legal for all possible collectors. 4138 if (isa<UndefValue>(DerivedPtr)) 4139 // Use undef of gc_relocate's type to replace it. 4140 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 4141 4142 if (auto *PT = dyn_cast<PointerType>(II->getType())) { 4143 // The relocation of null will be null for most any collector. 4144 // TODO: provide a hook for this in GCStrategy. There might be some 4145 // weird collector this property does not hold for. 4146 if (isa<ConstantPointerNull>(DerivedPtr)) 4147 // Use null-pointer of gc_relocate's type to replace it. 4148 return replaceInstUsesWith(*II, ConstantPointerNull::get(PT)); 4149 4150 // isKnownNonNull -> nonnull attribute 4151 if (!II->hasRetAttr(Attribute::NonNull) && 4152 isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) { 4153 II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); 4154 return II; 4155 } 4156 } 4157 4158 // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) 4159 // Canonicalize on the type from the uses to the defs 4160 4161 // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...) 4162 break; 4163 } 4164 4165 case Intrinsic::experimental_guard: { 4166 // Is this guard followed by another guard? We scan forward over a small 4167 // fixed window of instructions to handle common cases with conditions 4168 // computed between guards. 4169 Instruction *NextInst = II->getNextNonDebugInstruction(); 4170 for (unsigned i = 0; i < GuardWideningWindow; i++) { 4171 // Note: Using context-free form to avoid compile time blow up 4172 if (!isSafeToSpeculativelyExecute(NextInst)) 4173 break; 4174 NextInst = NextInst->getNextNonDebugInstruction(); 4175 } 4176 Value *NextCond = nullptr; 4177 if (match(NextInst, 4178 m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) { 4179 Value *CurrCond = II->getArgOperand(0); 4180 4181 // Remove a guard if it is immediately preceded by an identical guard. 4182 // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
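    // For example (illustrative, placeholder values):
    //   call void (i1, ...) @llvm.experimental.guard(i1 %a) [ "deopt"() ]
    //   call void (i1, ...) @llvm.experimental.guard(i1 %b) [ "deopt"() ]
    // becomes
    //   %wide.cond = and i1 %a, %b
    //   call void (i1, ...) @llvm.experimental.guard(i1 %wide.cond) [ "deopt"() ]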
4183 if (CurrCond != NextCond) { 4184 Instruction *MoveI = II->getNextNonDebugInstruction(); 4185 while (MoveI != NextInst) { 4186 auto *Temp = MoveI; 4187 MoveI = MoveI->getNextNonDebugInstruction(); 4188 Temp->moveBefore(II); 4189 } 4190 II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond)); 4191 } 4192 eraseInstFromFunction(*NextInst); 4193 return II; 4194 } 4195 break; 4196 } 4197 } 4198 return visitCallBase(*II); 4199} 4200 4201// Fence instruction simplification 4202Instruction *InstCombiner::visitFenceInst(FenceInst &FI) { 4203 // Remove identical consecutive fences. 4204 Instruction *Next = FI.getNextNonDebugInstruction(); 4205 if (auto *NFI = dyn_cast<FenceInst>(Next)) 4206 if (FI.isIdenticalTo(NFI)) 4207 return eraseInstFromFunction(FI); 4208 return nullptr; 4209} 4210 4211// InvokeInst simplification 4212Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { 4213 return visitCallBase(II); 4214} 4215 4216// CallBrInst simplification 4217Instruction *InstCombiner::visitCallBrInst(CallBrInst &CBI) { 4218 return visitCallBase(CBI); 4219} 4220 4221/// If this cast does not affect the value passed through the varargs area, we 4222/// can eliminate the use of the cast. 4223static bool isSafeToEliminateVarargsCast(const CallBase &Call, 4224 const DataLayout &DL, 4225 const CastInst *const CI, 4226 const int ix) { 4227 if (!CI->isLosslessCast()) 4228 return false; 4229 4230 // If this is a GC intrinsic, avoid munging types. We need types for 4231 // statepoint reconstruction in SelectionDAG. 4232 // TODO: This is probably something which should be expanded to all 4233 // intrinsics since the entire point of intrinsics is that 4234 // they are understandable by the optimizer. 4235 if (isStatepoint(&Call) || isGCRelocate(&Call) || isGCResult(&Call)) 4236 return false; 4237 4238 // The size of ByVal or InAlloca arguments is derived from the type, so we 4239 // can't change to a type with a different size. If the size were 4240 // passed explicitly we could avoid this check. 4241 if (!Call.isByValOrInAllocaArgument(ix)) 4242 return true; 4243 4244 Type* SrcTy = 4245 cast<PointerType>(CI->getOperand(0)->getType())->getElementType(); 4246 Type *DstTy = Call.isByValArgument(ix) 4247 ? Call.getParamByValType(ix) 4248 : cast<PointerType>(CI->getType())->getElementType(); 4249 if (!SrcTy->isSized() || !DstTy->isSized()) 4250 return false; 4251 if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy)) 4252 return false; 4253 return true; 4254} 4255 4256Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { 4257 if (!CI->getCalledFunction()) return nullptr; 4258 4259 auto InstCombineRAUW = [this](Instruction *From, Value *With) { 4260 replaceInstUsesWith(*From, With); 4261 }; 4262 auto InstCombineErase = [this](Instruction *I) { 4263 eraseInstFromFunction(*I); 4264 }; 4265 LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW, 4266 InstCombineErase); 4267 if (Value *With = Simplifier.optimizeCall(CI)) { 4268 ++NumSimplified; 4269 return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); 4270 } 4271 4272 return nullptr; 4273} 4274 4275static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) { 4276 // Strip off at most one level of pointer casts, looking for an alloca. This 4277 // is good enough in practice and simpler than handling any number of casts. 
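  // The pattern being matched looks roughly like this (illustrative; names
  // and the buffer size are placeholders):
  //   %tramp = alloca [72 x i8]
  //   %p = bitcast [72 x i8]* %tramp to i8*
  //   call void @llvm.init.trampoline(i8* %p, i8* bitcast (void ()* @f to i8*), i8* %nest)
  //   %fp = call i8* @llvm.adjust.trampoline(i8* %p)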
4278 Value *Underlying = TrampMem->stripPointerCasts(); 4279 if (Underlying != TrampMem && 4280 (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem)) 4281 return nullptr; 4282 if (!isa<AllocaInst>(Underlying)) 4283 return nullptr; 4284 4285 IntrinsicInst *InitTrampoline = nullptr; 4286 for (User *U : TrampMem->users()) { 4287 IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); 4288 if (!II) 4289 return nullptr; 4290 if (II->getIntrinsicID() == Intrinsic::init_trampoline) { 4291 if (InitTrampoline) 4292 // More than one init_trampoline writes to this value. Give up. 4293 return nullptr; 4294 InitTrampoline = II; 4295 continue; 4296 } 4297 if (II->getIntrinsicID() == Intrinsic::adjust_trampoline) 4298 // Allow any number of calls to adjust.trampoline. 4299 continue; 4300 return nullptr; 4301 } 4302 4303 // No call to init.trampoline found. 4304 if (!InitTrampoline) 4305 return nullptr; 4306 4307 // Check that the alloca is being used in the expected way. 4308 if (InitTrampoline->getOperand(0) != TrampMem) 4309 return nullptr; 4310 4311 return InitTrampoline; 4312} 4313 4314static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp, 4315 Value *TrampMem) { 4316 // Visit all the previous instructions in the basic block, and try to find a 4317 // init.trampoline which has a direct path to the adjust.trampoline. 4318 for (BasicBlock::iterator I = AdjustTramp->getIterator(), 4319 E = AdjustTramp->getParent()->begin(); 4320 I != E;) { 4321 Instruction *Inst = &*--I; 4322 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) 4323 if (II->getIntrinsicID() == Intrinsic::init_trampoline && 4324 II->getOperand(0) == TrampMem) 4325 return II; 4326 if (Inst->mayWriteToMemory()) 4327 return nullptr; 4328 } 4329 return nullptr; 4330} 4331 4332// Given a call to llvm.adjust.trampoline, find and return the corresponding 4333// call to llvm.init.trampoline if the call to the trampoline can be optimized 4334// to a direct call to a function. Otherwise return NULL. 4335static IntrinsicInst *findInitTrampoline(Value *Callee) { 4336 Callee = Callee->stripPointerCasts(); 4337 IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee); 4338 if (!AdjustTramp || 4339 AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline) 4340 return nullptr; 4341 4342 Value *TrampMem = AdjustTramp->getOperand(0); 4343 4344 if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem)) 4345 return IT; 4346 if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem)) 4347 return IT; 4348 return nullptr; 4349} 4350 4351static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) { 4352 unsigned NumArgs = Call.getNumArgOperands(); 4353 ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0)); 4354 ConstantInt *Op1C = 4355 (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1)); 4356 // Bail out if the allocation size is zero. 
4357 if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue())) 4358 return; 4359 4360 if (isMallocLikeFn(&Call, TLI) && Op0C) { 4361 if (isOpNewLikeFn(&Call, TLI)) 4362 Call.addAttribute(AttributeList::ReturnIndex, 4363 Attribute::getWithDereferenceableBytes( 4364 Call.getContext(), Op0C->getZExtValue())); 4365 else 4366 Call.addAttribute(AttributeList::ReturnIndex, 4367 Attribute::getWithDereferenceableOrNullBytes( 4368 Call.getContext(), Op0C->getZExtValue())); 4369 } else if (isReallocLikeFn(&Call, TLI) && Op1C) { 4370 Call.addAttribute(AttributeList::ReturnIndex, 4371 Attribute::getWithDereferenceableOrNullBytes( 4372 Call.getContext(), Op1C->getZExtValue())); 4373 } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) { 4374 bool Overflow; 4375 const APInt &N = Op0C->getValue(); 4376 APInt Size = N.umul_ov(Op1C->getValue(), Overflow); 4377 if (!Overflow) 4378 Call.addAttribute(AttributeList::ReturnIndex, 4379 Attribute::getWithDereferenceableOrNullBytes( 4380 Call.getContext(), Size.getZExtValue())); 4381 } else if (isStrdupLikeFn(&Call, TLI)) { 4382 uint64_t Len = GetStringLength(Call.getOperand(0)); 4383 if (Len) { 4384 // strdup 4385 if (NumArgs == 1) 4386 Call.addAttribute(AttributeList::ReturnIndex, 4387 Attribute::getWithDereferenceableOrNullBytes( 4388 Call.getContext(), Len)); 4389 // strndup 4390 else if (NumArgs == 2 && Op1C) 4391 Call.addAttribute( 4392 AttributeList::ReturnIndex, 4393 Attribute::getWithDereferenceableOrNullBytes( 4394 Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1))); 4395 } 4396 } 4397} 4398 4399/// Improvements for call, callbr and invoke instructions. 4400Instruction *InstCombiner::visitCallBase(CallBase &Call) { 4401 if (isAllocationFn(&Call, &TLI)) 4402 annotateAnyAllocSite(Call, &TLI); 4403 4404 bool Changed = false; 4405 4406 // Mark any parameters that are known to be non-null with the nonnull 4407 // attribute. This is helpful for inlining calls to functions with null 4408 // checks on their arguments. 4409 SmallVector<unsigned, 4> ArgNos; 4410 unsigned ArgNo = 0; 4411 4412 for (Value *V : Call.args()) { 4413 if (V->getType()->isPointerTy() && 4414 !Call.paramHasAttr(ArgNo, Attribute::NonNull) && 4415 isKnownNonZero(V, DL, 0, &AC, &Call, &DT)) 4416 ArgNos.push_back(ArgNo); 4417 ArgNo++; 4418 } 4419 4420 assert(ArgNo == Call.arg_size() && "sanity check"); 4421 4422 if (!ArgNos.empty()) { 4423 AttributeList AS = Call.getAttributes(); 4424 LLVMContext &Ctx = Call.getContext(); 4425 AS = AS.addParamAttribute(Ctx, ArgNos, 4426 Attribute::get(Ctx, Attribute::NonNull)); 4427 Call.setAttributes(AS); 4428 Changed = true; 4429 } 4430 4431 // If the callee is a pointer to a function, attempt to move any casts to the 4432 // arguments of the call/callbr/invoke. 4433 Value *Callee = Call.getCalledValue(); 4434 if (!isa<Function>(Callee) && transformConstExprCastCall(Call)) 4435 return nullptr; 4436 4437 if (Function *CalleeF = dyn_cast<Function>(Callee)) { 4438 // Remove the convergent attr on calls when the callee is not convergent. 4439 if (Call.isConvergent() && !CalleeF->isConvergent() && 4440 !CalleeF->isIntrinsic()) { 4441 LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call 4442 << "\n"); 4443 Call.setNotConvergent(); 4444 return &Call; 4445 } 4446 4447 // If the call and callee calling conventions don't match, this call must 4448 // be unreachable, as the call is undefined. 4449 if (CalleeF->getCallingConv() != Call.getCallingConv() && 4450 // Only do this for calls to a function with a body. 
A prototype may 4451 // not actually end up matching the implementation's calling conv for a 4452 // variety of reasons (e.g. it may be written in assembly). 4453 !CalleeF->isDeclaration()) { 4454 Instruction *OldCall = &Call; 4455 CreateNonTerminatorUnreachable(OldCall); 4456 // If OldCall does not return void then replaceAllUsesWith undef. 4457 // This allows ValueHandlers and custom metadata to adjust themselves. 4458 if (!OldCall->getType()->isVoidTy()) 4459 replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType())); 4460 if (isa<CallInst>(OldCall)) 4461 return eraseInstFromFunction(*OldCall); 4462 4463 // We cannot remove an invoke or a callbr, because it would change the 4464 // CFG, just change the callee to a null pointer. 4465 cast<CallBase>(OldCall)->setCalledFunction( 4466 CalleeF->getFunctionType(), 4467 Constant::getNullValue(CalleeF->getType())); 4468 return nullptr; 4469 } 4470 } 4471 4472 if ((isa<ConstantPointerNull>(Callee) && 4473 !NullPointerIsDefined(Call.getFunction())) || 4474 isa<UndefValue>(Callee)) { 4475 // If Call does not return void then replaceAllUsesWith undef. 4476 // This allows ValueHandlers and custom metadata to adjust themselves. 4477 if (!Call.getType()->isVoidTy()) 4478 replaceInstUsesWith(Call, UndefValue::get(Call.getType())); 4479 4480 if (Call.isTerminator()) { 4481 // Can't remove an invoke or callbr because we cannot change the CFG. 4482 return nullptr; 4483 } 4484 4485 // This instruction is not reachable, just remove it. 4486 CreateNonTerminatorUnreachable(&Call); 4487 return eraseInstFromFunction(Call); 4488 } 4489 4490 if (IntrinsicInst *II = findInitTrampoline(Callee)) 4491 return transformCallThroughTrampoline(Call, *II); 4492 4493 PointerType *PTy = cast<PointerType>(Callee->getType()); 4494 FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); 4495 if (FTy->isVarArg()) { 4496 int ix = FTy->getNumParams(); 4497 // See if we can optimize any arguments passed through the varargs area of 4498 // the call. 4499 for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end(); 4500 I != E; ++I, ++ix) { 4501 CastInst *CI = dyn_cast<CastInst>(*I); 4502 if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) { 4503 *I = CI->getOperand(0); 4504 4505 // Update the byval type to match the argument type. 4506 if (Call.isByValArgument(ix)) { 4507 Call.removeParamAttr(ix, Attribute::ByVal); 4508 Call.addParamAttr( 4509 ix, Attribute::getWithByValType( 4510 Call.getContext(), 4511 CI->getOperand(0)->getType()->getPointerElementType())); 4512 } 4513 Changed = true; 4514 } 4515 } 4516 } 4517 4518 if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) { 4519 // Inline asm calls cannot throw - mark them 'nounwind'. 4520 Call.setDoesNotThrow(); 4521 Changed = true; 4522 } 4523 4524 // Try to optimize the call if possible; we require DataLayout for most of 4525 // this. None of these calls are seen as possibly dead so go ahead and 4526 // delete the instruction now. 4527 if (CallInst *CI = dyn_cast<CallInst>(&Call)) { 4528 Instruction *I = tryOptimizeCall(CI); 4529 // If we changed something, return the result, etc. Otherwise let 4530 // the fallthrough check. 4531 if (I) return eraseInstFromFunction(*I); 4532 } 4533 4534 if (isAllocLikeFn(&Call, &TLI)) 4535 return visitAllocSite(Call); 4536 4537 return Changed ? &Call : nullptr; 4538} 4539 4540/// If the callee is a constexpr cast of a function, attempt to move the cast to 4541/// the arguments of the call/callbr/invoke.
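/// For example (illustrative):
///   call void bitcast (void (i8*)* @f to void (i32*)*)(i32* %p)
/// becomes
///   %cast = bitcast i32* %p to i8*
///   call void @f(i8* %cast)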
4542bool InstCombiner::transformConstExprCastCall(CallBase &Call) { 4543 auto *Callee = dyn_cast<Function>(Call.getCalledValue()->stripPointerCasts()); 4544 if (!Callee) 4545 return false; 4546 4547 // If this is a call to a thunk function, don't remove the cast. Thunks are 4548 // used to transparently forward all incoming parameters and outgoing return 4549 // values, so it's important to leave the cast in place. 4550 if (Callee->hasFnAttribute("thunk")) 4551 return false; 4552 4553 // If this is a musttail call, the callee's prototype must match the caller's 4554 // prototype with the exception of pointee types. The code below doesn't 4555 // implement that, so we can't do this transform. 4556 // TODO: Do the transform if it only requires adding pointer casts. 4557 if (Call.isMustTailCall()) 4558 return false; 4559 4560 Instruction *Caller = &Call; 4561 const AttributeList &CallerPAL = Call.getAttributes(); 4562 4563 // Okay, this is a cast from a function to a different type. Unless doing so 4564 // would cause a type conversion of one of our arguments, change this call to 4565 // be a direct call with arguments casted to the appropriate types. 4566 FunctionType *FT = Callee->getFunctionType(); 4567 Type *OldRetTy = Caller->getType(); 4568 Type *NewRetTy = FT->getReturnType(); 4569 4570 // Check to see if we are changing the return type... 4571 if (OldRetTy != NewRetTy) { 4572 4573 if (NewRetTy->isStructTy()) 4574 return false; // TODO: Handle multiple return values. 4575 4576 if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) { 4577 if (Callee->isDeclaration()) 4578 return false; // Cannot transform this return value. 4579 4580 if (!Caller->use_empty() && 4581 // void -> non-void is handled specially 4582 !NewRetTy->isVoidTy()) 4583 return false; // Cannot transform this return value. 4584 } 4585 4586 if (!CallerPAL.isEmpty() && !Caller->use_empty()) { 4587 AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); 4588 if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) 4589 return false; // Attribute not compatible with transformed value. 4590 } 4591 4592 // If the callbase is an invoke/callbr instruction, and the return value is 4593 // used by a PHI node in a successor, we cannot change the return type of 4594 // the call because there is no place to put the cast instruction (without 4595 // breaking the critical edge). Bail out in this case. 4596 if (!Caller->use_empty()) { 4597 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) 4598 for (User *U : II->users()) 4599 if (PHINode *PN = dyn_cast<PHINode>(U)) 4600 if (PN->getParent() == II->getNormalDest() || 4601 PN->getParent() == II->getUnwindDest()) 4602 return false; 4603 // FIXME: Be conservative for callbr to avoid a quadratic search. 4604 if (isa<CallBrInst>(Caller)) 4605 return false; 4606 } 4607 } 4608 4609 unsigned NumActualArgs = Call.arg_size(); 4610 unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs); 4611 4612 // Prevent us turning: 4613 // declare void @takes_i32_inalloca(i32* inalloca) 4614 // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0) 4615 // 4616 // into: 4617 // call void @takes_i32_inalloca(i32* null) 4618 // 4619 // Similarly, avoid folding away bitcasts of byval calls. 
  if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
      Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
    return false;

  auto AI = Call.arg_begin();
  for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
    Type *ParamTy = FT->getParamType(i);
    Type *ActTy = (*AI)->getType();

    if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
      return false;   // Cannot transform this parameter value.

    if (AttrBuilder(CallerPAL.getParamAttributes(i))
            .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
      return false;   // Attribute not compatible with transformed value.

    if (Call.isInAllocaArgument(i))
      return false;   // Cannot transform to and from inalloca.

    // If the parameter is passed as a byval argument, then we have to have a
    // sized type and the sized type has to have the same size as the old type.
    if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
      PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
      if (!ParamPTy || !ParamPTy->getElementType()->isSized())
        return false;

      Type *CurElTy = Call.getParamByValType(i);
      if (DL.getTypeAllocSize(CurElTy) !=
          DL.getTypeAllocSize(ParamPTy->getElementType()))
        return false;
    }
  }

  if (Callee->isDeclaration()) {
    // Do not delete arguments unless we have a function body.
    if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
      return false;

    // If the callee is just a declaration, don't change the varargsness of the
    // call.  We don't want to introduce a varargs call where one doesn't
    // already exist.
    PointerType *APTy = cast<PointerType>(Call.getCalledValue()->getType());
    if (FT->isVarArg() != cast<FunctionType>(APTy->getElementType())->isVarArg())
      return false;

    // If both the callee and the cast type are varargs, we still have to make
    // sure the number of fixed parameters is the same, or we have the same
    // ABI issues as if we introduced a varargs call.
    if (FT->isVarArg() &&
        cast<FunctionType>(APTy->getElementType())->isVarArg() &&
        FT->getNumParams() !=
            cast<FunctionType>(APTy->getElementType())->getNumParams())
      return false;
  }

  if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
      !CallerPAL.isEmpty()) {
    // In this case we have more arguments than the new function type, but we
    // won't be dropping them.  Check that these extra arguments have
    // attributes that are compatible with being a vararg call argument.
    unsigned SRetIdx;
    if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
        SRetIdx > FT->getNumParams())
      return false;
  }

  // Okay, we decided that this is a safe thing to do: go ahead and start
  // inserting cast instructions as necessary.
  SmallVector<Value *, 8> Args;
  SmallVector<AttributeSet, 8> ArgAttrs;
  Args.reserve(NumActualArgs);
  ArgAttrs.reserve(NumActualArgs);

  // Get any return attributes.
  AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);

  // If the return value is not being used, the type may not be compatible
  // with the existing attributes.  Wipe out any problematic attributes.
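  // (For example, if the old call returned a pointer carrying 'nonnull' but
  // NewRetTy is an integer type, the 'nonnull' return attribute must be
  // dropped; AttributeFuncs::typeIncompatible reports such mismatches.)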
  RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));

  LLVMContext &Ctx = Call.getContext();
  AI = Call.arg_begin();
  for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
    Type *ParamTy = FT->getParamType(i);

    Value *NewArg = *AI;
    if ((*AI)->getType() != ParamTy)
      NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
    Args.push_back(NewArg);

    // Add any parameter attributes.
    if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
      AttrBuilder AB(CallerPAL.getParamAttributes(i));
      AB.addByValAttr(NewArg->getType()->getPointerElementType());
      ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
    } else
      ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
  }

  // If the function takes more arguments than the call was taking, add them
  // now.
  for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
    Args.push_back(Constant::getNullValue(FT->getParamType(i)));
    ArgAttrs.push_back(AttributeSet());
  }

  // If the call site passes more arguments than the new function type has
  // parameters, the extra arguments can only be kept when the new type is
  // varargs; add them in their promoted form.
  if (FT->getNumParams() < NumActualArgs) {
    // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
    if (FT->isVarArg()) {
      // Add all of the arguments in their promoted form to the arg list.
      for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
        Type *PTy = getPromotedType((*AI)->getType());
        Value *NewArg = *AI;
        if (PTy != (*AI)->getType()) {
          // Must promote to pass through va_arg area!
          Instruction::CastOps opcode =
              CastInst::getCastOpcode(*AI, false, PTy, false);
          NewArg = Builder.CreateCast(opcode, *AI, PTy);
        }
        Args.push_back(NewArg);

        // Add any parameter attributes.
        ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
      }
    }
  }

  AttributeSet FnAttrs = CallerPAL.getFnAttributes();

  if (NewRetTy->isVoidTy())
    Caller->setName("");   // Void type should not have a name.

  assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
         "missing argument attributes");
  AttributeList NewCallerPAL = AttributeList::get(
      Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);

  SmallVector<OperandBundleDef, 1> OpBundles;
  Call.getOperandBundlesAsDefs(OpBundles);

  CallBase *NewCall;
  if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
    NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(),
                                   II->getUnwindDest(), Args, OpBundles);
  } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
    NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(),
                                   CBI->getIndirectDests(), Args, OpBundles);
  } else {
    NewCall = Builder.CreateCall(Callee, Args, OpBundles);
    cast<CallInst>(NewCall)->setTailCallKind(
        cast<CallInst>(Caller)->getTailCallKind());
  }
  NewCall->takeName(Caller);
  NewCall->setCallingConv(Call.getCallingConv());
  NewCall->setAttributes(NewCallerPAL);

  // Preserve the weight metadata for the new call instruction. The metadata
  // is used by SamplePGO to check the call site's hotness.
  uint64_t W;
  if (Caller->extractProfTotalWeight(W))
    NewCall->setProfWeight(W);

  // Insert a cast of the return type as necessary.
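  // (Sketch: if the old call produced an i8* that is still used, but the new
  // callee returns i32*, a bitcast of the new call's result back to i8* is
  // created below so that existing users keep seeing the original type.)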
  Instruction *NC = NewCall;
  Value *NV = NC;
  if (OldRetTy != NV->getType() && !Caller->use_empty()) {
    if (!NV->getType()->isVoidTy()) {
      NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
      NC->setDebugLoc(Caller->getDebugLoc());

      // If this is an invoke/callbr instruction, we should insert it after the
      // first non-phi instruction in the normal successor block.
      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
        BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
        InsertNewInstBefore(NC, *I);
      } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
        BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt();
        InsertNewInstBefore(NC, *I);
      } else {
        // Otherwise, it's a call, just insert cast right after the call.
        InsertNewInstBefore(NC, *Caller);
      }
      Worklist.AddUsersToWorkList(*Caller);
    } else {
      NV = UndefValue::get(Caller->getType());
    }
  }

  if (!Caller->use_empty())
    replaceInstUsesWith(*Caller, NV);
  else if (Caller->hasValueHandle()) {
    if (OldRetTy == NV->getType())
      ValueHandleBase::ValueIsRAUWd(Caller, NV);
    else
      // We cannot call ValueIsRAUWd with a different type, and the
      // actual tracked value will disappear.
      ValueHandleBase::ValueIsDeleted(Caller);
  }

  eraseInstFromFunction(*Caller);
  return true;
}

/// Turn a call to a function created by init_trampoline / adjust_trampoline
/// intrinsic pair into a direct call to the underlying function.
Instruction *
InstCombiner::transformCallThroughTrampoline(CallBase &Call,
                                             IntrinsicInst &Tramp) {
  Value *Callee = Call.getCalledValue();
  Type *CalleeTy = Callee->getType();
  FunctionType *FTy = Call.getFunctionType();
  AttributeList Attrs = Call.getAttributes();

  // If the call already has the 'nest' attribute somewhere then give up -
  // otherwise 'nest' would occur twice after splicing in the chain.
  if (Attrs.hasAttrSomewhere(Attribute::Nest))
    return nullptr;

  Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts());
  FunctionType *NestFTy = NestF->getFunctionType();

  AttributeList NestAttrs = NestF->getAttributes();
  if (!NestAttrs.isEmpty()) {
    unsigned NestArgNo = 0;
    Type *NestTy = nullptr;
    AttributeSet NestAttr;

    // Look for a parameter marked with the 'nest' attribute.
    for (FunctionType::param_iterator I = NestFTy->param_begin(),
                                      E = NestFTy->param_end();
         I != E; ++NestArgNo, ++I) {
      AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
      if (AS.hasAttribute(Attribute::Nest)) {
        // Record the parameter type and any other attributes.
        NestTy = *I;
        NestAttr = AS;
        break;
      }
    }

    if (NestTy) {
      std::vector<Value*> NewArgs;
      std::vector<AttributeSet> NewArgAttrs;
      NewArgs.reserve(Call.arg_size() + 1);
      NewArgAttrs.reserve(Call.arg_size());

      // Insert the nest argument into the call argument list, which may
      // mean appending it.  Likewise for attributes.

      {
        unsigned ArgNo = 0;
        auto I = Call.arg_begin(), E = Call.arg_end();
        do {
          if (ArgNo == NestArgNo) {
            // Add the chain argument and attributes.
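            // (The chain value is the third operand of the matched
            // llvm.init.trampoline call; if its type does not match the
            // callee's 'nest' parameter type it is bitcast below.)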
            Value *NestVal = Tramp.getArgOperand(2);
            if (NestVal->getType() != NestTy)
              NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
            NewArgs.push_back(NestVal);
            NewArgAttrs.push_back(NestAttr);
          }

          if (I == E)
            break;

          // Add the original argument and attributes.
          NewArgs.push_back(*I);
          NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));

          ++ArgNo;
          ++I;
        } while (true);
      }

      // The trampoline may have been bitcast to a bogus type (FTy).
      // Handle this by synthesizing a new function type, equal to FTy
      // with the chain parameter inserted.

      std::vector<Type*> NewTypes;
      NewTypes.reserve(FTy->getNumParams()+1);

      // Insert the chain's type into the list of parameter types, which may
      // mean appending it.
      {
        unsigned ArgNo = 0;
        FunctionType::param_iterator I = FTy->param_begin(),
                                     E = FTy->param_end();

        do {
          if (ArgNo == NestArgNo)
            // Add the chain's type.
            NewTypes.push_back(NestTy);

          if (I == E)
            break;

          // Add the original type.
          NewTypes.push_back(*I);

          ++ArgNo;
          ++I;
        } while (true);
      }

      // Replace the trampoline call with a direct call.  Let the generic
      // code sort out any function type mismatches.
      FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
                                               FTy->isVarArg());
      Constant *NewCallee =
          NestF->getType() == PointerType::getUnqual(NewFTy)
              ? NestF
              : ConstantExpr::getBitCast(NestF, PointerType::getUnqual(NewFTy));
      AttributeList NewPAL =
          AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
                             Attrs.getRetAttributes(), NewArgAttrs);

      SmallVector<OperandBundleDef, 1> OpBundles;
      Call.getOperandBundlesAsDefs(OpBundles);

      Instruction *NewCaller;
      if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
        NewCaller = InvokeInst::Create(NewFTy, NewCallee,
                                       II->getNormalDest(), II->getUnwindDest(),
                                       NewArgs, OpBundles);
        cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
        cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
      } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) {
        NewCaller =
            CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(),
                               CBI->getIndirectDests(), NewArgs, OpBundles);
        cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv());
        cast<CallBrInst>(NewCaller)->setAttributes(NewPAL);
      } else {
        NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles);
        cast<CallInst>(NewCaller)->setTailCallKind(
            cast<CallInst>(Call).getTailCallKind());
        cast<CallInst>(NewCaller)->setCallingConv(
            cast<CallInst>(Call).getCallingConv());
        cast<CallInst>(NewCaller)->setAttributes(NewPAL);
      }
      NewCaller->setDebugLoc(Call.getDebugLoc());

      return NewCaller;
    }
  }

  // Replace the trampoline call with a direct call.  Since there is no 'nest'
  // parameter, there is no need to adjust the argument list.  Let the generic
  // code sort out any function type mismatches.
  Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy);
  Call.setCalledFunction(FTy, NewCallee);
  return &Call;
}
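
// A rough sketch of the rewrite performed above, using hypothetical IR (the
// names and types are illustrative only):
//
//   call void @llvm.init.trampoline(i8* %mem,
//                                   i8* bitcast (i32 (i8*, i32)* @f to i8*),
//                                   i8* %chain)
//   %tramp = call i8* @llvm.adjust.trampoline(i8* %mem)
//   %fp = bitcast i8* %tramp to i32 (i32)*
//   %r = call i32 %fp(i32 %x)
//
// becomes a direct call that passes the static chain to @f's 'nest' parameter
// (assuming @f declares one):
//
//   %r = call i32 @f(i8* nest %chain, i32 %x)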