1//===------- VectorCombine.cpp - Optimize partial vector operations -------===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// This pass optimizes scalar/vector interactions using target cost models. The 10// transforms implemented here may not fit in traditional loop-based or SLP 11// vectorization passes. 12// 13//===----------------------------------------------------------------------===// 14 15#include "llvm/Transforms/Vectorize/VectorCombine.h" 16#include "llvm/ADT/DenseMap.h" 17#include "llvm/ADT/ScopeExit.h" 18#include "llvm/ADT/Statistic.h" 19#include "llvm/Analysis/AssumptionCache.h" 20#include "llvm/Analysis/BasicAliasAnalysis.h" 21#include "llvm/Analysis/GlobalsModRef.h" 22#include "llvm/Analysis/Loads.h" 23#include "llvm/Analysis/TargetTransformInfo.h" 24#include "llvm/Analysis/ValueTracking.h" 25#include "llvm/Analysis/VectorUtils.h" 26#include "llvm/IR/Dominators.h" 27#include "llvm/IR/Function.h" 28#include "llvm/IR/IRBuilder.h" 29#include "llvm/IR/PatternMatch.h" 30#include "llvm/Support/CommandLine.h" 31#include "llvm/Transforms/Utils/Local.h" 32#include <numeric> 33#include <queue> 34 35#define DEBUG_TYPE "vector-combine" 36#include "llvm/Transforms/Utils/InstructionWorklist.h" 37 38using namespace llvm; 39using namespace llvm::PatternMatch; 40 41STATISTIC(NumVecLoad, "Number of vector loads formed"); 42STATISTIC(NumVecCmp, "Number of vector compares formed"); 43STATISTIC(NumVecBO, "Number of vector binops formed"); 44STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed"); 45STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast"); 46STATISTIC(NumScalarBO, "Number of scalar binops formed"); 47STATISTIC(NumScalarCmp, "Number of scalar compares formed"); 48 49static cl::opt<bool> DisableVectorCombine( 50 "disable-vector-combine", cl::init(false), cl::Hidden, 51 cl::desc("Disable all vector combine transforms")); 52 53static cl::opt<bool> DisableBinopExtractShuffle( 54 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden, 55 cl::desc("Disable binop extract to shuffle transforms")); 56 57static cl::opt<unsigned> MaxInstrsToScan( 58 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, 59 cl::desc("Max number of instructions to scan for vector combining.")); 60 61static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max(); 62 63namespace { 64class VectorCombine { 65public: 66 VectorCombine(Function &F, const TargetTransformInfo &TTI, 67 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC, 68 bool TryEarlyFoldsOnly) 69 : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), 70 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {} 71 72 bool run(); 73 74private: 75 Function &F; 76 IRBuilder<> Builder; 77 const TargetTransformInfo &TTI; 78 const DominatorTree &DT; 79 AAResults &AA; 80 AssumptionCache &AC; 81 82 /// If true, only perform beneficial early IR transforms. Do not introduce new 83 /// vector operations. 84 bool TryEarlyFoldsOnly; 85 86 InstructionWorklist Worklist; 87 88 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction" 89 // parameter. That should be updated to specific sub-classes because the 90 // run loop was changed to dispatch on opcode. 91 bool vectorizeLoadInsert(Instruction &I); 92 bool widenSubvectorLoad(Instruction &I); 93 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, 94 ExtractElementInst *Ext1, 95 unsigned PreferredExtractIndex) const; 96 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1, 97 const Instruction &I, 98 ExtractElementInst *&ConvertToShuffle, 99 unsigned PreferredExtractIndex); 100 void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1, 101 Instruction &I); 102 void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1, 103 Instruction &I); 104 bool foldExtractExtract(Instruction &I); 105 bool foldInsExtFNeg(Instruction &I); 106 bool foldBitcastShuffle(Instruction &I); 107 bool scalarizeBinopOrCmp(Instruction &I); 108 bool scalarizeVPIntrinsic(Instruction &I); 109 bool foldExtractedCmps(Instruction &I); 110 bool foldSingleElementStore(Instruction &I); 111 bool scalarizeLoadExtract(Instruction &I); 112 bool foldShuffleOfBinops(Instruction &I); 113 bool foldShuffleFromReductions(Instruction &I); 114 bool foldSelectShuffle(Instruction &I, bool FromReduction = false); 115 116 void replaceValue(Value &Old, Value &New) { 117 Old.replaceAllUsesWith(&New); 118 if (auto *NewI = dyn_cast<Instruction>(&New)) { 119 New.takeName(&Old); 120 Worklist.pushUsersToWorkList(*NewI); 121 Worklist.pushValue(NewI); 122 } 123 Worklist.pushValue(&Old); 124 } 125 126 void eraseInstruction(Instruction &I) { 127 for (Value *Op : I.operands()) 128 Worklist.pushValue(Op); 129 Worklist.remove(&I); 130 I.eraseFromParent(); 131 } 132}; 133} // namespace 134 135static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) { 136 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan. 137 // The widened load may load data from dirty regions or create data races 138 // non-existent in the source. 139 if (!Load || !Load->isSimple() || !Load->hasOneUse() || 140 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) || 141 mustSuppressSpeculation(*Load)) 142 return false; 143 144 // We are potentially transforming byte-sized (8-bit) memory accesses, so make 145 // sure we have all of our type-based constraints in place for this target. 146 Type *ScalarTy = Load->getType()->getScalarType(); 147 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); 148 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); 149 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 || 150 ScalarSize % 8 != 0) 151 return false; 152 153 return true; 154} 155 156bool VectorCombine::vectorizeLoadInsert(Instruction &I) { 157 // Match insert into fixed vector of scalar value. 158 // TODO: Handle non-zero insert index. 159 Value *Scalar; 160 if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) || 161 !Scalar->hasOneUse()) 162 return false; 163 164 // Optionally match an extract from another vector. 165 Value *X; 166 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt())); 167 if (!HasExtract) 168 X = Scalar; 169 170 auto *Load = dyn_cast<LoadInst>(X); 171 if (!canWidenLoad(Load, TTI)) 172 return false; 173 174 Type *ScalarTy = Scalar->getType(); 175 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); 176 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); 177 178 // Check safety of replacing the scalar load with a larger vector load. 179 // We use minimal alignment (maximum flexibility) because we only care about 180 // the dereferenceable region. When calculating cost and creating a new op, 181 // we may use a larger value based on alignment attributes. 182 const DataLayout &DL = I.getModule()->getDataLayout(); 183 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); 184 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); 185 186 unsigned MinVecNumElts = MinVectorSize / ScalarSize; 187 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false); 188 unsigned OffsetEltIndex = 0; 189 Align Alignment = Load->getAlign(); 190 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC, 191 &DT)) { 192 // It is not safe to load directly from the pointer, but we can still peek 193 // through gep offsets and check if it safe to load from a base address with 194 // updated alignment. If it is, we can shuffle the element(s) into place 195 // after loading. 196 unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType()); 197 APInt Offset(OffsetBitWidth, 0); 198 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); 199 200 // We want to shuffle the result down from a high element of a vector, so 201 // the offset must be positive. 202 if (Offset.isNegative()) 203 return false; 204 205 // The offset must be a multiple of the scalar element to shuffle cleanly 206 // in the element's size. 207 uint64_t ScalarSizeInBytes = ScalarSize / 8; 208 if (Offset.urem(ScalarSizeInBytes) != 0) 209 return false; 210 211 // If we load MinVecNumElts, will our target element still be loaded? 212 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue(); 213 if (OffsetEltIndex >= MinVecNumElts) 214 return false; 215 216 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC, 217 &DT)) 218 return false; 219 220 // Update alignment with offset value. Note that the offset could be negated 221 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but 222 // negation does not change the result of the alignment calculation. 223 Alignment = commonAlignment(Alignment, Offset.getZExtValue()); 224 } 225 226 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0 227 // Use the greater of the alignment on the load or its source pointer. 228 Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment); 229 Type *LoadTy = Load->getType(); 230 unsigned AS = Load->getPointerAddressSpace(); 231 InstructionCost OldCost = 232 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); 233 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0); 234 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 235 OldCost += 236 TTI.getScalarizationOverhead(MinVecTy, DemandedElts, 237 /* Insert */ true, HasExtract, CostKind); 238 239 // New pattern: load VecPtr 240 InstructionCost NewCost = 241 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS); 242 // Optionally, we are shuffling the loaded vector element(s) into place. 243 // For the mask set everything but element 0 to undef to prevent poison from 244 // propagating from the extra loaded memory. This will also optionally 245 // shrink/grow the vector from the loaded size to the output size. 246 // We assume this operation has no cost in codegen if there was no offset. 247 // Note that we could use freeze to avoid poison problems, but then we might 248 // still need a shuffle to change the vector size. 249 auto *Ty = cast<FixedVectorType>(I.getType()); 250 unsigned OutputNumElts = Ty->getNumElements(); 251 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem); 252 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big"); 253 Mask[0] = OffsetEltIndex; 254 if (OffsetEltIndex) 255 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask); 256 257 // We can aggressively convert to the vector form because the backend can 258 // invert this transform if it does not result in a performance win. 259 if (OldCost < NewCost || !NewCost.isValid()) 260 return false; 261 262 // It is safe and potentially profitable to load a vector directly: 263 // inselt undef, load Scalar, 0 --> load VecPtr 264 IRBuilder<> Builder(Load); 265 Value *CastedPtr = 266 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS)); 267 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); 268 VecLd = Builder.CreateShuffleVector(VecLd, Mask); 269 270 replaceValue(I, *VecLd); 271 ++NumVecLoad; 272 return true; 273} 274 275/// If we are loading a vector and then inserting it into a larger vector with 276/// undefined elements, try to load the larger vector and eliminate the insert. 277/// This removes a shuffle in IR and may allow combining of other loaded values. 278bool VectorCombine::widenSubvectorLoad(Instruction &I) { 279 // Match subvector insert of fixed vector. 280 auto *Shuf = cast<ShuffleVectorInst>(&I); 281 if (!Shuf->isIdentityWithPadding()) 282 return false; 283 284 // Allow a non-canonical shuffle mask that is choosing elements from op1. 285 unsigned NumOpElts = 286 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements(); 287 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) { 288 return M >= (int)(NumOpElts); 289 }); 290 291 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex)); 292 if (!canWidenLoad(Load, TTI)) 293 return false; 294 295 // We use minimal alignment (maximum flexibility) because we only care about 296 // the dereferenceable region. When calculating cost and creating a new op, 297 // we may use a larger value based on alignment attributes. 298 auto *Ty = cast<FixedVectorType>(I.getType()); 299 const DataLayout &DL = I.getModule()->getDataLayout(); 300 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); 301 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); 302 Align Alignment = Load->getAlign(); 303 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), DL, Load, &AC, &DT)) 304 return false; 305 306 Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment); 307 Type *LoadTy = Load->getType(); 308 unsigned AS = Load->getPointerAddressSpace(); 309 310 // Original pattern: insert_subvector (load PtrOp) 311 // This conservatively assumes that the cost of a subvector insert into an 312 // undef value is 0. We could add that cost if the cost model accurately 313 // reflects the real cost of that operation. 314 InstructionCost OldCost = 315 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); 316 317 // New pattern: load PtrOp 318 InstructionCost NewCost = 319 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS); 320 321 // We can aggressively convert to the vector form because the backend can 322 // invert this transform if it does not result in a performance win. 323 if (OldCost < NewCost || !NewCost.isValid()) 324 return false; 325 326 IRBuilder<> Builder(Load); 327 Value *CastedPtr = 328 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS)); 329 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment); 330 replaceValue(I, *VecLd); 331 ++NumVecLoad; 332 return true; 333} 334 335/// Determine which, if any, of the inputs should be replaced by a shuffle 336/// followed by extract from a different index. 337ExtractElementInst *VectorCombine::getShuffleExtract( 338 ExtractElementInst *Ext0, ExtractElementInst *Ext1, 339 unsigned PreferredExtractIndex = InvalidIndex) const { 340 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand()); 341 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand()); 342 assert(Index0C && Index1C && "Expected constant extract indexes"); 343 344 unsigned Index0 = Index0C->getZExtValue(); 345 unsigned Index1 = Index1C->getZExtValue(); 346 347 // If the extract indexes are identical, no shuffle is needed. 348 if (Index0 == Index1) 349 return nullptr; 350 351 Type *VecTy = Ext0->getVectorOperand()->getType(); 352 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 353 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types"); 354 InstructionCost Cost0 = 355 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); 356 InstructionCost Cost1 = 357 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); 358 359 // If both costs are invalid no shuffle is needed 360 if (!Cost0.isValid() && !Cost1.isValid()) 361 return nullptr; 362 363 // We are extracting from 2 different indexes, so one operand must be shuffled 364 // before performing a vector operation and/or extract. The more expensive 365 // extract will be replaced by a shuffle. 366 if (Cost0 > Cost1) 367 return Ext0; 368 if (Cost1 > Cost0) 369 return Ext1; 370 371 // If the costs are equal and there is a preferred extract index, shuffle the 372 // opposite operand. 373 if (PreferredExtractIndex == Index0) 374 return Ext1; 375 if (PreferredExtractIndex == Index1) 376 return Ext0; 377 378 // Otherwise, replace the extract with the higher index. 379 return Index0 > Index1 ? Ext0 : Ext1; 380} 381 382/// Compare the relative costs of 2 extracts followed by scalar operation vs. 383/// vector operation(s) followed by extract. Return true if the existing 384/// instructions are cheaper than a vector alternative. Otherwise, return false 385/// and if one of the extracts should be transformed to a shufflevector, set 386/// \p ConvertToShuffle to that extract instruction. 387bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, 388 ExtractElementInst *Ext1, 389 const Instruction &I, 390 ExtractElementInst *&ConvertToShuffle, 391 unsigned PreferredExtractIndex) { 392 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1)); 393 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1)); 394 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes"); 395 396 unsigned Opcode = I.getOpcode(); 397 Type *ScalarTy = Ext0->getType(); 398 auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType()); 399 InstructionCost ScalarOpCost, VectorOpCost; 400 401 // Get cost estimates for scalar and vector versions of the operation. 402 bool IsBinOp = Instruction::isBinaryOp(Opcode); 403 if (IsBinOp) { 404 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); 405 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); 406 } else { 407 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && 408 "Expected a compare"); 409 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); 410 ScalarOpCost = TTI.getCmpSelInstrCost( 411 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred); 412 VectorOpCost = TTI.getCmpSelInstrCost( 413 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred); 414 } 415 416 // Get cost estimates for the extract elements. These costs will factor into 417 // both sequences. 418 unsigned Ext0Index = Ext0IndexC->getZExtValue(); 419 unsigned Ext1Index = Ext1IndexC->getZExtValue(); 420 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 421 422 InstructionCost Extract0Cost = 423 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index); 424 InstructionCost Extract1Cost = 425 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index); 426 427 // A more expensive extract will always be replaced by a splat shuffle. 428 // For example, if Ext0 is more expensive: 429 // opcode (extelt V0, Ext0), (ext V1, Ext1) --> 430 // extelt (opcode (splat V0, Ext0), V1), Ext1 431 // TODO: Evaluate whether that always results in lowest cost. Alternatively, 432 // check the cost of creating a broadcast shuffle and shuffling both 433 // operands to element 0. 434 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost); 435 436 // Extra uses of the extracts mean that we include those costs in the 437 // vector total because those instructions will not be eliminated. 438 InstructionCost OldCost, NewCost; 439 if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) { 440 // Handle a special case. If the 2 extracts are identical, adjust the 441 // formulas to account for that. The extra use charge allows for either the 442 // CSE'd pattern or an unoptimized form with identical values: 443 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C 444 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2) 445 : !Ext0->hasOneUse() || !Ext1->hasOneUse(); 446 OldCost = CheapExtractCost + ScalarOpCost; 447 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost; 448 } else { 449 // Handle the general case. Each extract is actually a different value: 450 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C 451 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost; 452 NewCost = VectorOpCost + CheapExtractCost + 453 !Ext0->hasOneUse() * Extract0Cost + 454 !Ext1->hasOneUse() * Extract1Cost; 455 } 456 457 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex); 458 if (ConvertToShuffle) { 459 if (IsBinOp && DisableBinopExtractShuffle) 460 return true; 461 462 // If we are extracting from 2 different indexes, then one operand must be 463 // shuffled before performing the vector operation. The shuffle mask is 464 // poison except for 1 lane that is being translated to the remaining 465 // extraction lane. Therefore, it is a splat shuffle. Ex: 466 // ShufMask = { poison, poison, 0, poison } 467 // TODO: The cost model has an option for a "broadcast" shuffle 468 // (splat-from-element-0), but no option for a more general splat. 469 NewCost += 470 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); 471 } 472 473 // Aggressively form a vector op if the cost is equal because the transform 474 // may enable further optimization. 475 // Codegen can reverse this transform (scalarize) if it was not profitable. 476 return OldCost < NewCost; 477} 478 479/// Create a shuffle that translates (shifts) 1 element from the input vector 480/// to a new element location. 481static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, 482 unsigned NewIndex, IRBuilder<> &Builder) { 483 // The shuffle mask is poison except for 1 lane that is being translated 484 // to the new element index. Example for OldIndex == 2 and NewIndex == 0: 485 // ShufMask = { 2, poison, poison, poison } 486 auto *VecTy = cast<FixedVectorType>(Vec->getType()); 487 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); 488 ShufMask[NewIndex] = OldIndex; 489 return Builder.CreateShuffleVector(Vec, ShufMask, "shift"); 490} 491 492/// Given an extract element instruction with constant index operand, shuffle 493/// the source vector (shift the scalar element) to a NewIndex for extraction. 494/// Return null if the input can be constant folded, so that we are not creating 495/// unnecessary instructions. 496static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, 497 unsigned NewIndex, 498 IRBuilder<> &Builder) { 499 // Shufflevectors can only be created for fixed-width vectors. 500 if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType())) 501 return nullptr; 502 503 // If the extract can be constant-folded, this code is unsimplified. Defer 504 // to other passes to handle that. 505 Value *X = ExtElt->getVectorOperand(); 506 Value *C = ExtElt->getIndexOperand(); 507 assert(isa<ConstantInt>(C) && "Expected a constant index operand"); 508 if (isa<Constant>(X)) 509 return nullptr; 510 511 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(), 512 NewIndex, Builder); 513 return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex)); 514} 515 516/// Try to reduce extract element costs by converting scalar compares to vector 517/// compares followed by extract. 518/// cmp (ext0 V0, C), (ext1 V1, C) 519void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0, 520 ExtractElementInst *Ext1, Instruction &I) { 521 assert(isa<CmpInst>(&I) && "Expected a compare"); 522 assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == 523 cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && 524 "Expected matching constant extract indexes"); 525 526 // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C 527 ++NumVecCmp; 528 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate(); 529 Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); 530 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1); 531 Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand()); 532 replaceValue(I, *NewExt); 533} 534 535/// Try to reduce extract element costs by converting scalar binops to vector 536/// binops followed by extract. 537/// bo (ext0 V0, C), (ext1 V1, C) 538void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0, 539 ExtractElementInst *Ext1, Instruction &I) { 540 assert(isa<BinaryOperator>(&I) && "Expected a binary operator"); 541 assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == 542 cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && 543 "Expected matching constant extract indexes"); 544 545 // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C 546 ++NumVecBO; 547 Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); 548 Value *VecBO = 549 Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1); 550 551 // All IR flags are safe to back-propagate because any potential poison 552 // created in unused vector elements is discarded by the extract. 553 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO)) 554 VecBOInst->copyIRFlags(&I); 555 556 Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand()); 557 replaceValue(I, *NewExt); 558} 559 560/// Match an instruction with extracted vector operands. 561bool VectorCombine::foldExtractExtract(Instruction &I) { 562 // It is not safe to transform things like div, urem, etc. because we may 563 // create undefined behavior when executing those on unknown vector elements. 564 if (!isSafeToSpeculativelyExecute(&I)) 565 return false; 566 567 Instruction *I0, *I1; 568 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 569 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) && 570 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1)))) 571 return false; 572 573 Value *V0, *V1; 574 uint64_t C0, C1; 575 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) || 576 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) || 577 V0->getType() != V1->getType()) 578 return false; 579 580 // If the scalar value 'I' is going to be re-inserted into a vector, then try 581 // to create an extract to that same element. The extract/insert can be 582 // reduced to a "select shuffle". 583 // TODO: If we add a larger pattern match that starts from an insert, this 584 // probably becomes unnecessary. 585 auto *Ext0 = cast<ExtractElementInst>(I0); 586 auto *Ext1 = cast<ExtractElementInst>(I1); 587 uint64_t InsertIndex = InvalidIndex; 588 if (I.hasOneUse()) 589 match(I.user_back(), 590 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex))); 591 592 ExtractElementInst *ExtractToChange; 593 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex)) 594 return false; 595 596 if (ExtractToChange) { 597 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0; 598 ExtractElementInst *NewExtract = 599 translateExtract(ExtractToChange, CheapExtractIdx, Builder); 600 if (!NewExtract) 601 return false; 602 if (ExtractToChange == Ext0) 603 Ext0 = NewExtract; 604 else 605 Ext1 = NewExtract; 606 } 607 608 if (Pred != CmpInst::BAD_ICMP_PREDICATE) 609 foldExtExtCmp(Ext0, Ext1, I); 610 else 611 foldExtExtBinop(Ext0, Ext1, I); 612 613 Worklist.push(Ext0); 614 Worklist.push(Ext1); 615 return true; 616} 617 618/// Try to replace an extract + scalar fneg + insert with a vector fneg + 619/// shuffle. 620bool VectorCombine::foldInsExtFNeg(Instruction &I) { 621 // Match an insert (op (extract)) pattern. 622 Value *DestVec; 623 uint64_t Index; 624 Instruction *FNeg; 625 if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)), 626 m_ConstantInt(Index)))) 627 return false; 628 629 // Note: This handles the canonical fneg instruction and "fsub -0.0, X". 630 Value *SrcVec; 631 Instruction *Extract; 632 if (!match(FNeg, m_FNeg(m_CombineAnd( 633 m_Instruction(Extract), 634 m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))) 635 return false; 636 637 // TODO: We could handle this with a length-changing shuffle. 638 auto *VecTy = cast<FixedVectorType>(I.getType()); 639 if (SrcVec->getType() != VecTy) 640 return false; 641 642 // Ignore bogus insert/extract index. 643 unsigned NumElts = VecTy->getNumElements(); 644 if (Index >= NumElts) 645 return false; 646 647 // We are inserting the negated element into the same lane that we extracted 648 // from. This is equivalent to a select-shuffle that chooses all but the 649 // negated element from the destination vector. 650 SmallVector<int> Mask(NumElts); 651 std::iota(Mask.begin(), Mask.end(), 0); 652 Mask[Index] = Index + NumElts; 653 654 Type *ScalarTy = VecTy->getScalarType(); 655 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 656 InstructionCost OldCost = 657 TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) + 658 TTI.getVectorInstrCost(I, VecTy, CostKind, Index); 659 660 // If the extract has one use, it will be eliminated, so count it in the 661 // original cost. If it has more than one use, ignore the cost because it will 662 // be the same before/after. 663 if (Extract->hasOneUse()) 664 OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index); 665 666 InstructionCost NewCost = 667 TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) + 668 TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask); 669 670 if (NewCost > OldCost) 671 return false; 672 673 // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> 674 // shuffle DestVec, (fneg SrcVec), Mask 675 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); 676 Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); 677 replaceValue(I, *Shuf); 678 return true; 679} 680 681/// If this is a bitcast of a shuffle, try to bitcast the source vector to the 682/// destination type followed by shuffle. This can enable further transforms by 683/// moving bitcasts or shuffles together. 684bool VectorCombine::foldBitcastShuffle(Instruction &I) { 685 Value *V; 686 ArrayRef<int> Mask; 687 if (!match(&I, m_BitCast( 688 m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask)))))) 689 return false; 690 691 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for 692 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle 693 // mask for scalable type is a splat or not. 694 // 2) Disallow non-vector casts. 695 // TODO: We could allow any shuffle. 696 auto *DestTy = dyn_cast<FixedVectorType>(I.getType()); 697 auto *SrcTy = dyn_cast<FixedVectorType>(V->getType()); 698 if (!DestTy || !SrcTy) 699 return false; 700 701 unsigned DestEltSize = DestTy->getScalarSizeInBits(); 702 unsigned SrcEltSize = SrcTy->getScalarSizeInBits(); 703 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0) 704 return false; 705 706 SmallVector<int, 16> NewMask; 707 if (DestEltSize <= SrcEltSize) { 708 // The bitcast is from wide to narrow/equal elements. The shuffle mask can 709 // always be expanded to the equivalent form choosing narrower elements. 710 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask"); 711 unsigned ScaleFactor = SrcEltSize / DestEltSize; 712 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask); 713 } else { 714 // The bitcast is from narrow elements to wide elements. The shuffle mask 715 // must choose consecutive elements to allow casting first. 716 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask"); 717 unsigned ScaleFactor = DestEltSize / SrcEltSize; 718 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask)) 719 return false; 720 } 721 722 // Bitcast the shuffle src - keep its original width but using the destination 723 // scalar type. 724 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize; 725 auto *ShuffleTy = FixedVectorType::get(DestTy->getScalarType(), NumSrcElts); 726 727 // The new shuffle must not cost more than the old shuffle. The bitcast is 728 // moved ahead of the shuffle, so assume that it has the same cost as before. 729 InstructionCost DestCost = TTI.getShuffleCost( 730 TargetTransformInfo::SK_PermuteSingleSrc, ShuffleTy, NewMask); 731 InstructionCost SrcCost = 732 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask); 733 if (DestCost > SrcCost || !DestCost.isValid()) 734 return false; 735 736 // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' 737 ++NumShufOfBitcast; 738 Value *CastV = Builder.CreateBitCast(V, ShuffleTy); 739 Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask); 740 replaceValue(I, *Shuf); 741 return true; 742} 743 744/// VP Intrinsics whose vector operands are both splat values may be simplified 745/// into the scalar version of the operation and the result splatted. This 746/// can lead to scalarization down the line. 747bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { 748 if (!isa<VPIntrinsic>(I)) 749 return false; 750 VPIntrinsic &VPI = cast<VPIntrinsic>(I); 751 Value *Op0 = VPI.getArgOperand(0); 752 Value *Op1 = VPI.getArgOperand(1); 753 754 if (!isSplatValue(Op0) || !isSplatValue(Op1)) 755 return false; 756 757 // Check getSplatValue early in this function, to avoid doing unnecessary 758 // work. 759 Value *ScalarOp0 = getSplatValue(Op0); 760 Value *ScalarOp1 = getSplatValue(Op1); 761 if (!ScalarOp0 || !ScalarOp1) 762 return false; 763 764 // For the binary VP intrinsics supported here, the result on disabled lanes 765 // is a poison value. For now, only do this simplification if all lanes 766 // are active. 767 // TODO: Relax the condition that all lanes are active by using insertelement 768 // on inactive lanes. 769 auto IsAllTrueMask = [](Value *MaskVal) { 770 if (Value *SplattedVal = getSplatValue(MaskVal)) 771 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal)) 772 return ConstValue->isAllOnesValue(); 773 return false; 774 }; 775 if (!IsAllTrueMask(VPI.getArgOperand(2))) 776 return false; 777 778 // Check to make sure we support scalarization of the intrinsic 779 Intrinsic::ID IntrID = VPI.getIntrinsicID(); 780 if (!VPBinOpIntrinsic::isVPBinOp(IntrID)) 781 return false; 782 783 // Calculate cost of splatting both operands into vectors and the vector 784 // intrinsic 785 VectorType *VecTy = cast<VectorType>(VPI.getType()); 786 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 787 InstructionCost SplatCost = 788 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) + 789 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); 790 791 // Calculate the cost of the VP Intrinsic 792 SmallVector<Type *, 4> Args; 793 for (Value *V : VPI.args()) 794 Args.push_back(V->getType()); 795 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args); 796 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind); 797 InstructionCost OldCost = 2 * SplatCost + VectorOpCost; 798 799 // Determine scalar opcode 800 std::optional<unsigned> FunctionalOpcode = 801 VPI.getFunctionalOpcode(); 802 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt; 803 if (!FunctionalOpcode) { 804 ScalarIntrID = VPI.getFunctionalIntrinsicID(); 805 if (!ScalarIntrID) 806 return false; 807 } 808 809 // Calculate cost of scalarizing 810 InstructionCost ScalarOpCost = 0; 811 if (ScalarIntrID) { 812 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args); 813 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind); 814 } else { 815 ScalarOpCost = 816 TTI.getArithmeticInstrCost(*FunctionalOpcode, VecTy->getScalarType()); 817 } 818 819 // The existing splats may be kept around if other instructions use them. 820 InstructionCost CostToKeepSplats = 821 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse()); 822 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats; 823 824 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI 825 << "\n"); 826 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost 827 << ", Cost of scalarizing:" << NewCost << "\n"); 828 829 // We want to scalarize unless the vector variant actually has lower cost. 830 if (OldCost < NewCost || !NewCost.isValid()) 831 return false; 832 833 // Scalarize the intrinsic 834 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount(); 835 Value *EVL = VPI.getArgOperand(3); 836 const DataLayout &DL = VPI.getModule()->getDataLayout(); 837 838 // If the VP op might introduce UB or poison, we can scalarize it provided 839 // that we know the EVL > 0: If the EVL is zero, then the original VP op 840 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by 841 // scalarizing it. 842 bool SafeToSpeculate; 843 if (ScalarIntrID) 844 SafeToSpeculate = Intrinsic::getAttributes(I.getContext(), *ScalarIntrID) 845 .hasFnAttr(Attribute::AttrKind::Speculatable); 846 else 847 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode( 848 *FunctionalOpcode, &VPI, nullptr, &AC, &DT); 849 if (!SafeToSpeculate && !isKnownNonZero(EVL, DL, 0, &AC, &VPI, &DT)) 850 return false; 851 852 Value *ScalarVal = 853 ScalarIntrID 854 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID, 855 {ScalarOp0, ScalarOp1}) 856 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode), 857 ScalarOp0, ScalarOp1); 858 859 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal)); 860 return true; 861} 862 863/// Match a vector binop or compare instruction with at least one inserted 864/// scalar operand and convert to scalar binop/cmp followed by insertelement. 865bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { 866 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 867 Value *Ins0, *Ins1; 868 if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) && 869 !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) 870 return false; 871 872 // Do not convert the vector condition of a vector select into a scalar 873 // condition. That may cause problems for codegen because of differences in 874 // boolean formats and register-file transfers. 875 // TODO: Can we account for that in the cost model? 876 bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE; 877 if (IsCmp) 878 for (User *U : I.users()) 879 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value()))) 880 return false; 881 882 // Match against one or both scalar values being inserted into constant 883 // vectors: 884 // vec_op VecC0, (inselt VecC1, V1, Index) 885 // vec_op (inselt VecC0, V0, Index), VecC1 886 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) 887 // TODO: Deal with mismatched index constants and variable indexes? 888 Constant *VecC0 = nullptr, *VecC1 = nullptr; 889 Value *V0 = nullptr, *V1 = nullptr; 890 uint64_t Index0 = 0, Index1 = 0; 891 if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0), 892 m_ConstantInt(Index0))) && 893 !match(Ins0, m_Constant(VecC0))) 894 return false; 895 if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1), 896 m_ConstantInt(Index1))) && 897 !match(Ins1, m_Constant(VecC1))) 898 return false; 899 900 bool IsConst0 = !V0; 901 bool IsConst1 = !V1; 902 if (IsConst0 && IsConst1) 903 return false; 904 if (!IsConst0 && !IsConst1 && Index0 != Index1) 905 return false; 906 907 // Bail for single insertion if it is a load. 908 // TODO: Handle this once getVectorInstrCost can cost for load/stores. 909 auto *I0 = dyn_cast_or_null<Instruction>(V0); 910 auto *I1 = dyn_cast_or_null<Instruction>(V1); 911 if ((IsConst0 && I1 && I1->mayReadFromMemory()) || 912 (IsConst1 && I0 && I0->mayReadFromMemory())) 913 return false; 914 915 uint64_t Index = IsConst0 ? Index1 : Index0; 916 Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType(); 917 Type *VecTy = I.getType(); 918 assert(VecTy->isVectorTy() && 919 (IsConst0 || IsConst1 || V0->getType() == V1->getType()) && 920 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() || 921 ScalarTy->isPointerTy()) && 922 "Unexpected types for insert element into binop or cmp"); 923 924 unsigned Opcode = I.getOpcode(); 925 InstructionCost ScalarOpCost, VectorOpCost; 926 if (IsCmp) { 927 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); 928 ScalarOpCost = TTI.getCmpSelInstrCost( 929 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred); 930 VectorOpCost = TTI.getCmpSelInstrCost( 931 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred); 932 } else { 933 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); 934 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); 935 } 936 937 // Get cost estimate for the insert element. This cost will factor into 938 // both sequences. 939 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 940 InstructionCost InsertCost = TTI.getVectorInstrCost( 941 Instruction::InsertElement, VecTy, CostKind, Index); 942 InstructionCost OldCost = 943 (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; 944 InstructionCost NewCost = ScalarOpCost + InsertCost + 945 (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + 946 (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); 947 948 // We want to scalarize unless the vector variant actually has lower cost. 949 if (OldCost < NewCost || !NewCost.isValid()) 950 return false; 951 952 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> 953 // inselt NewVecC, (scalar_op V0, V1), Index 954 if (IsCmp) 955 ++NumScalarCmp; 956 else 957 ++NumScalarBO; 958 959 // For constant cases, extract the scalar element, this should constant fold. 960 if (IsConst0) 961 V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index)); 962 if (IsConst1) 963 V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index)); 964 965 Value *Scalar = 966 IsCmp ? Builder.CreateCmp(Pred, V0, V1) 967 : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1); 968 969 Scalar->setName(I.getName() + ".scalar"); 970 971 // All IR flags are safe to back-propagate. There is no potential for extra 972 // poison to be created by the scalar instruction. 973 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar)) 974 ScalarInst->copyIRFlags(&I); 975 976 // Fold the vector constants in the original vectors into a new base vector. 977 Value *NewVecC = 978 IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1) 979 : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1); 980 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index); 981 replaceValue(I, *Insert); 982 return true; 983} 984 985/// Try to combine a scalar binop + 2 scalar compares of extracted elements of 986/// a vector into vector operations followed by extract. Note: The SLP pass 987/// may miss this pattern because of implementation problems. 988bool VectorCombine::foldExtractedCmps(Instruction &I) { 989 // We are looking for a scalar binop of booleans. 990 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1) 991 if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1)) 992 return false; 993 994 // The compare predicates should match, and each compare should have a 995 // constant operand. 996 // TODO: Relax the one-use constraints. 997 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1); 998 Instruction *I0, *I1; 999 Constant *C0, *C1; 1000 CmpInst::Predicate P0, P1; 1001 if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) || 1002 !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) || 1003 P0 != P1) 1004 return false; 1005 1006 // The compare operands must be extracts of the same vector with constant 1007 // extract indexes. 1008 // TODO: Relax the one-use constraints. 1009 Value *X; 1010 uint64_t Index0, Index1; 1011 if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) || 1012 !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))) 1013 return false; 1014 1015 auto *Ext0 = cast<ExtractElementInst>(I0); 1016 auto *Ext1 = cast<ExtractElementInst>(I1); 1017 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1); 1018 if (!ConvertToShuf) 1019 return false; 1020 1021 // The original scalar pattern is: 1022 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1) 1023 CmpInst::Predicate Pred = P0; 1024 unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp 1025 : Instruction::ICmp; 1026 auto *VecTy = dyn_cast<FixedVectorType>(X->getType()); 1027 if (!VecTy) 1028 return false; 1029 1030 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1031 InstructionCost OldCost = 1032 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); 1033 OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); 1034 OldCost += 1035 TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(), 1036 CmpInst::makeCmpResultType(I0->getType()), Pred) * 1037 2; 1038 OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); 1039 1040 // The proposed vector pattern is: 1041 // vcmp = cmp Pred X, VecC 1042 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0 1043 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; 1044 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; 1045 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType())); 1046 InstructionCost NewCost = TTI.getCmpSelInstrCost( 1047 CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred); 1048 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); 1049 ShufMask[CheapIndex] = ExpensiveIndex; 1050 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, 1051 ShufMask); 1052 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); 1053 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex); 1054 1055 // Aggressively form vector ops if the cost is equal because the transform 1056 // may enable further optimization. 1057 // Codegen can reverse this transform (scalarize) if it was not profitable. 1058 if (OldCost < NewCost || !NewCost.isValid()) 1059 return false; 1060 1061 // Create a vector constant from the 2 scalar constants. 1062 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(), 1063 PoisonValue::get(VecTy->getElementType())); 1064 CmpC[Index0] = C0; 1065 CmpC[Index1] = C1; 1066 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC)); 1067 1068 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder); 1069 Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(), 1070 VCmp, Shuf); 1071 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex); 1072 replaceValue(I, *NewExt); 1073 ++NumVecCmpBO; 1074 return true; 1075} 1076 1077// Check if memory loc modified between two instrs in the same BB 1078static bool isMemModifiedBetween(BasicBlock::iterator Begin, 1079 BasicBlock::iterator End, 1080 const MemoryLocation &Loc, AAResults &AA) { 1081 unsigned NumScanned = 0; 1082 return std::any_of(Begin, End, [&](const Instruction &Instr) { 1083 return isModSet(AA.getModRefInfo(&Instr, Loc)) || 1084 ++NumScanned > MaxInstrsToScan; 1085 }); 1086} 1087 1088namespace { 1089/// Helper class to indicate whether a vector index can be safely scalarized and 1090/// if a freeze needs to be inserted. 1091class ScalarizationResult { 1092 enum class StatusTy { Unsafe, Safe, SafeWithFreeze }; 1093 1094 StatusTy Status; 1095 Value *ToFreeze; 1096 1097 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr) 1098 : Status(Status), ToFreeze(ToFreeze) {} 1099 1100public: 1101 ScalarizationResult(const ScalarizationResult &Other) = default; 1102 ~ScalarizationResult() { 1103 assert(!ToFreeze && "freeze() not called with ToFreeze being set"); 1104 } 1105 1106 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; } 1107 static ScalarizationResult safe() { return {StatusTy::Safe}; } 1108 static ScalarizationResult safeWithFreeze(Value *ToFreeze) { 1109 return {StatusTy::SafeWithFreeze, ToFreeze}; 1110 } 1111 1112 /// Returns true if the index can be scalarize without requiring a freeze. 1113 bool isSafe() const { return Status == StatusTy::Safe; } 1114 /// Returns true if the index cannot be scalarized. 1115 bool isUnsafe() const { return Status == StatusTy::Unsafe; } 1116 /// Returns true if the index can be scalarize, but requires inserting a 1117 /// freeze. 1118 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; } 1119 1120 /// Reset the state of Unsafe and clear ToFreze if set. 1121 void discard() { 1122 ToFreeze = nullptr; 1123 Status = StatusTy::Unsafe; 1124 } 1125 1126 /// Freeze the ToFreeze and update the use in \p User to use it. 1127 void freeze(IRBuilder<> &Builder, Instruction &UserI) { 1128 assert(isSafeWithFreeze() && 1129 "should only be used when freezing is required"); 1130 assert(is_contained(ToFreeze->users(), &UserI) && 1131 "UserI must be a user of ToFreeze"); 1132 IRBuilder<>::InsertPointGuard Guard(Builder); 1133 Builder.SetInsertPoint(cast<Instruction>(&UserI)); 1134 Value *Frozen = 1135 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen"); 1136 for (Use &U : make_early_inc_range((UserI.operands()))) 1137 if (U.get() == ToFreeze) 1138 U.set(Frozen); 1139 1140 ToFreeze = nullptr; 1141 } 1142}; 1143} // namespace 1144 1145/// Check if it is legal to scalarize a memory access to \p VecTy at index \p 1146/// Idx. \p Idx must access a valid vector element. 1147static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, 1148 Instruction *CtxI, 1149 AssumptionCache &AC, 1150 const DominatorTree &DT) { 1151 // We do checks for both fixed vector types and scalable vector types. 1152 // This is the number of elements of fixed vector types, 1153 // or the minimum number of elements of scalable vector types. 1154 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue(); 1155 1156 if (auto *C = dyn_cast<ConstantInt>(Idx)) { 1157 if (C->getValue().ult(NumElements)) 1158 return ScalarizationResult::safe(); 1159 return ScalarizationResult::unsafe(); 1160 } 1161 1162 unsigned IntWidth = Idx->getType()->getScalarSizeInBits(); 1163 APInt Zero(IntWidth, 0); 1164 APInt MaxElts(IntWidth, NumElements); 1165 ConstantRange ValidIndices(Zero, MaxElts); 1166 ConstantRange IdxRange(IntWidth, true); 1167 1168 if (isGuaranteedNotToBePoison(Idx, &AC)) { 1169 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false, 1170 true, &AC, CtxI, &DT))) 1171 return ScalarizationResult::safe(); 1172 return ScalarizationResult::unsafe(); 1173 } 1174 1175 // If the index may be poison, check if we can insert a freeze before the 1176 // range of the index is restricted. 1177 Value *IdxBase; 1178 ConstantInt *CI; 1179 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) { 1180 IdxRange = IdxRange.binaryAnd(CI->getValue()); 1181 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) { 1182 IdxRange = IdxRange.urem(CI->getValue()); 1183 } 1184 1185 if (ValidIndices.contains(IdxRange)) 1186 return ScalarizationResult::safeWithFreeze(IdxBase); 1187 return ScalarizationResult::unsafe(); 1188} 1189 1190/// The memory operation on a vector of \p ScalarType had alignment of 1191/// \p VectorAlignment. Compute the maximal, but conservatively correct, 1192/// alignment that will be valid for the memory operation on a single scalar 1193/// element of the same type with index \p Idx. 1194static Align computeAlignmentAfterScalarization(Align VectorAlignment, 1195 Type *ScalarType, Value *Idx, 1196 const DataLayout &DL) { 1197 if (auto *C = dyn_cast<ConstantInt>(Idx)) 1198 return commonAlignment(VectorAlignment, 1199 C->getZExtValue() * DL.getTypeStoreSize(ScalarType)); 1200 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType)); 1201} 1202 1203// Combine patterns like: 1204// %0 = load <4 x i32>, <4 x i32>* %a 1205// %1 = insertelement <4 x i32> %0, i32 %b, i32 1 1206// store <4 x i32> %1, <4 x i32>* %a 1207// to: 1208// %0 = bitcast <4 x i32>* %a to i32* 1209// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1 1210// store i32 %b, i32* %1 1211bool VectorCombine::foldSingleElementStore(Instruction &I) { 1212 auto *SI = cast<StoreInst>(&I); 1213 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType())) 1214 return false; 1215 1216 // TODO: Combine more complicated patterns (multiple insert) by referencing 1217 // TargetTransformInfo. 1218 Instruction *Source; 1219 Value *NewElement; 1220 Value *Idx; 1221 if (!match(SI->getValueOperand(), 1222 m_InsertElt(m_Instruction(Source), m_Value(NewElement), 1223 m_Value(Idx)))) 1224 return false; 1225 1226 if (auto *Load = dyn_cast<LoadInst>(Source)) { 1227 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType()); 1228 const DataLayout &DL = I.getModule()->getDataLayout(); 1229 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts(); 1230 // Don't optimize for atomic/volatile load or store. Ensure memory is not 1231 // modified between, vector type matches store size, and index is inbounds. 1232 if (!Load->isSimple() || Load->getParent() != SI->getParent() || 1233 !DL.typeSizeEqualsStoreSize(Load->getType()->getScalarType()) || 1234 SrcAddr != SI->getPointerOperand()->stripPointerCasts()) 1235 return false; 1236 1237 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT); 1238 if (ScalarizableIdx.isUnsafe() || 1239 isMemModifiedBetween(Load->getIterator(), SI->getIterator(), 1240 MemoryLocation::get(SI), AA)) 1241 return false; 1242 1243 if (ScalarizableIdx.isSafeWithFreeze()) 1244 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx)); 1245 Value *GEP = Builder.CreateInBoundsGEP( 1246 SI->getValueOperand()->getType(), SI->getPointerOperand(), 1247 {ConstantInt::get(Idx->getType(), 0), Idx}); 1248 StoreInst *NSI = Builder.CreateStore(NewElement, GEP); 1249 NSI->copyMetadata(*SI); 1250 Align ScalarOpAlignment = computeAlignmentAfterScalarization( 1251 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx, 1252 DL); 1253 NSI->setAlignment(ScalarOpAlignment); 1254 replaceValue(I, *NSI); 1255 eraseInstruction(I); 1256 return true; 1257 } 1258 1259 return false; 1260} 1261 1262/// Try to scalarize vector loads feeding extractelement instructions. 1263bool VectorCombine::scalarizeLoadExtract(Instruction &I) { 1264 Value *Ptr; 1265 if (!match(&I, m_Load(m_Value(Ptr)))) 1266 return false; 1267 1268 auto *VecTy = cast<VectorType>(I.getType()); 1269 auto *LI = cast<LoadInst>(&I); 1270 const DataLayout &DL = I.getModule()->getDataLayout(); 1271 if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(VecTy->getScalarType())) 1272 return false; 1273 1274 InstructionCost OriginalCost = 1275 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), 1276 LI->getPointerAddressSpace()); 1277 InstructionCost ScalarizedCost = 0; 1278 1279 Instruction *LastCheckedInst = LI; 1280 unsigned NumInstChecked = 0; 1281 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; 1282 auto FailureGuard = make_scope_exit([&]() { 1283 // If the transform is aborted, discard the ScalarizationResults. 1284 for (auto &Pair : NeedFreeze) 1285 Pair.second.discard(); 1286 }); 1287 1288 // Check if all users of the load are extracts with no memory modifications 1289 // between the load and the extract. Compute the cost of both the original 1290 // code and the scalarized version. 1291 for (User *U : LI->users()) { 1292 auto *UI = dyn_cast<ExtractElementInst>(U); 1293 if (!UI || UI->getParent() != LI->getParent()) 1294 return false; 1295 1296 // Check if any instruction between the load and the extract may modify 1297 // memory. 1298 if (LastCheckedInst->comesBefore(UI)) { 1299 for (Instruction &I : 1300 make_range(std::next(LI->getIterator()), UI->getIterator())) { 1301 // Bail out if we reached the check limit or the instruction may write 1302 // to memory. 1303 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory()) 1304 return false; 1305 NumInstChecked++; 1306 } 1307 LastCheckedInst = UI; 1308 } 1309 1310 auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT); 1311 if (ScalarIdx.isUnsafe()) 1312 return false; 1313 if (ScalarIdx.isSafeWithFreeze()) { 1314 NeedFreeze.try_emplace(UI, ScalarIdx); 1315 ScalarIdx.discard(); 1316 } 1317 1318 auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1)); 1319 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1320 OriginalCost += 1321 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, 1322 Index ? Index->getZExtValue() : -1); 1323 ScalarizedCost += 1324 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(), 1325 Align(1), LI->getPointerAddressSpace()); 1326 ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType()); 1327 } 1328 1329 if (ScalarizedCost >= OriginalCost) 1330 return false; 1331 1332 // Replace extracts with narrow scalar loads. 1333 for (User *U : LI->users()) { 1334 auto *EI = cast<ExtractElementInst>(U); 1335 Value *Idx = EI->getOperand(1); 1336 1337 // Insert 'freeze' for poison indexes. 1338 auto It = NeedFreeze.find(EI); 1339 if (It != NeedFreeze.end()) 1340 It->second.freeze(Builder, *cast<Instruction>(Idx)); 1341 1342 Builder.SetInsertPoint(EI); 1343 Value *GEP = 1344 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx}); 1345 auto *NewLoad = cast<LoadInst>(Builder.CreateLoad( 1346 VecTy->getElementType(), GEP, EI->getName() + ".scalar")); 1347 1348 Align ScalarOpAlignment = computeAlignmentAfterScalarization( 1349 LI->getAlign(), VecTy->getElementType(), Idx, DL); 1350 NewLoad->setAlignment(ScalarOpAlignment); 1351 1352 replaceValue(*EI, *NewLoad); 1353 } 1354 1355 FailureGuard.release(); 1356 return true; 1357} 1358 1359/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into 1360/// "binop (shuffle), (shuffle)". 1361bool VectorCombine::foldShuffleOfBinops(Instruction &I) { 1362 auto *VecTy = cast<FixedVectorType>(I.getType()); 1363 BinaryOperator *B0, *B1; 1364 ArrayRef<int> Mask; 1365 if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)), 1366 m_Mask(Mask))) || 1367 B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy) 1368 return false; 1369 1370 // Try to replace a binop with a shuffle if the shuffle is not costly. 1371 // The new shuffle will choose from a single, common operand, so it may be 1372 // cheaper than the existing two-operand shuffle. 1373 SmallVector<int> UnaryMask = createUnaryMask(Mask, Mask.size()); 1374 Instruction::BinaryOps Opcode = B0->getOpcode(); 1375 InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy); 1376 InstructionCost ShufCost = TTI.getShuffleCost( 1377 TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask); 1378 if (ShufCost > BinopCost) 1379 return false; 1380 1381 // If we have something like "add X, Y" and "add Z, X", swap ops to match. 1382 Value *X = B0->getOperand(0), *Y = B0->getOperand(1); 1383 Value *Z = B1->getOperand(0), *W = B1->getOperand(1); 1384 if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W) 1385 std::swap(X, Y); 1386 1387 Value *Shuf0, *Shuf1; 1388 if (X == Z) { 1389 // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W) 1390 Shuf0 = Builder.CreateShuffleVector(X, UnaryMask); 1391 Shuf1 = Builder.CreateShuffleVector(Y, W, Mask); 1392 } else if (Y == W) { 1393 // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y) 1394 Shuf0 = Builder.CreateShuffleVector(X, Z, Mask); 1395 Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask); 1396 } else { 1397 return false; 1398 } 1399 1400 Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1); 1401 // Intersect flags from the old binops. 1402 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) { 1403 NewInst->copyIRFlags(B0); 1404 NewInst->andIRFlags(B1); 1405 } 1406 replaceValue(I, *NewBO); 1407 return true; 1408} 1409 1410/// Given a commutative reduction, the order of the input lanes does not alter 1411/// the results. We can use this to remove certain shuffles feeding the 1412/// reduction, removing the need to shuffle at all. 1413bool VectorCombine::foldShuffleFromReductions(Instruction &I) { 1414 auto *II = dyn_cast<IntrinsicInst>(&I); 1415 if (!II) 1416 return false; 1417 switch (II->getIntrinsicID()) { 1418 case Intrinsic::vector_reduce_add: 1419 case Intrinsic::vector_reduce_mul: 1420 case Intrinsic::vector_reduce_and: 1421 case Intrinsic::vector_reduce_or: 1422 case Intrinsic::vector_reduce_xor: 1423 case Intrinsic::vector_reduce_smin: 1424 case Intrinsic::vector_reduce_smax: 1425 case Intrinsic::vector_reduce_umin: 1426 case Intrinsic::vector_reduce_umax: 1427 break; 1428 default: 1429 return false; 1430 } 1431 1432 // Find all the inputs when looking through operations that do not alter the 1433 // lane order (binops, for example). Currently we look for a single shuffle, 1434 // and can ignore splat values. 1435 std::queue<Value *> Worklist; 1436 SmallPtrSet<Value *, 4> Visited; 1437 ShuffleVectorInst *Shuffle = nullptr; 1438 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0))) 1439 Worklist.push(Op); 1440 1441 while (!Worklist.empty()) { 1442 Value *CV = Worklist.front(); 1443 Worklist.pop(); 1444 if (Visited.contains(CV)) 1445 continue; 1446 1447 // Splats don't change the order, so can be safely ignored. 1448 if (isSplatValue(CV)) 1449 continue; 1450 1451 Visited.insert(CV); 1452 1453 if (auto *CI = dyn_cast<Instruction>(CV)) { 1454 if (CI->isBinaryOp()) { 1455 for (auto *Op : CI->operand_values()) 1456 Worklist.push(Op); 1457 continue; 1458 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) { 1459 if (Shuffle && Shuffle != SV) 1460 return false; 1461 Shuffle = SV; 1462 continue; 1463 } 1464 } 1465 1466 // Anything else is currently an unknown node. 1467 return false; 1468 } 1469 1470 if (!Shuffle) 1471 return false; 1472 1473 // Check all uses of the binary ops and shuffles are also included in the 1474 // lane-invariant operations (Visited should be the list of lanewise 1475 // instructions, including the shuffle that we found). 1476 for (auto *V : Visited) 1477 for (auto *U : V->users()) 1478 if (!Visited.contains(U) && U != &I) 1479 return false; 1480 1481 FixedVectorType *VecType = 1482 dyn_cast<FixedVectorType>(II->getOperand(0)->getType()); 1483 if (!VecType) 1484 return false; 1485 FixedVectorType *ShuffleInputType = 1486 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType()); 1487 if (!ShuffleInputType) 1488 return false; 1489 unsigned NumInputElts = ShuffleInputType->getNumElements(); 1490 1491 // Find the mask from sorting the lanes into order. This is most likely to 1492 // become a identity or concat mask. Undef elements are pushed to the end. 1493 SmallVector<int> ConcatMask; 1494 Shuffle->getShuffleMask(ConcatMask); 1495 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; }); 1496 // In the case of a truncating shuffle it's possible for the mask 1497 // to have an index greater than the size of the resulting vector. 1498 // This requires special handling. 1499 bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts; 1500 bool UsesSecondVec = 1501 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; }); 1502 1503 FixedVectorType *VecTyForCost = 1504 (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType; 1505 InstructionCost OldCost = TTI.getShuffleCost( 1506 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, 1507 VecTyForCost, Shuffle->getShuffleMask()); 1508 InstructionCost NewCost = TTI.getShuffleCost( 1509 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, 1510 VecTyForCost, ConcatMask); 1511 1512 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle 1513 << "\n"); 1514 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost 1515 << "\n"); 1516 if (NewCost < OldCost) { 1517 Builder.SetInsertPoint(Shuffle); 1518 Value *NewShuffle = Builder.CreateShuffleVector( 1519 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask); 1520 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n"); 1521 replaceValue(*Shuffle, *NewShuffle); 1522 } 1523 1524 // See if we can re-use foldSelectShuffle, getting it to reduce the size of 1525 // the shuffle into a nicer order, as it can ignore the order of the shuffles. 1526 return foldSelectShuffle(*Shuffle, true); 1527} 1528 1529/// This method looks for groups of shuffles acting on binops, of the form: 1530/// %x = shuffle ... 1531/// %y = shuffle ... 1532/// %a = binop %x, %y 1533/// %b = binop %x, %y 1534/// shuffle %a, %b, selectmask 1535/// We may, especially if the shuffle is wider than legal, be able to convert 1536/// the shuffle to a form where only parts of a and b need to be computed. On 1537/// architectures with no obvious "select" shuffle, this can reduce the total 1538/// number of operations if the target reports them as cheaper. 1539bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { 1540 auto *SVI = cast<ShuffleVectorInst>(&I); 1541 auto *VT = cast<FixedVectorType>(I.getType()); 1542 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0)); 1543 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1)); 1544 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || 1545 VT != Op0->getType()) 1546 return false; 1547 1548 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0)); 1549 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1)); 1550 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0)); 1551 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1)); 1552 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); 1553 auto checkSVNonOpUses = [&](Instruction *I) { 1554 if (!I || I->getOperand(0)->getType() != VT) 1555 return true; 1556 return any_of(I->users(), [&](User *U) { 1557 return U != Op0 && U != Op1 && 1558 !(isa<ShuffleVectorInst>(U) && 1559 (InputShuffles.contains(cast<Instruction>(U)) || 1560 isInstructionTriviallyDead(cast<Instruction>(U)))); 1561 }); 1562 }; 1563 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || 1564 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) 1565 return false; 1566 1567 // Collect all the uses that are shuffles that we can transform together. We 1568 // may not have a single shuffle, but a group that can all be transformed 1569 // together profitably. 1570 SmallVector<ShuffleVectorInst *> Shuffles; 1571 auto collectShuffles = [&](Instruction *I) { 1572 for (auto *U : I->users()) { 1573 auto *SV = dyn_cast<ShuffleVectorInst>(U); 1574 if (!SV || SV->getType() != VT) 1575 return false; 1576 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) || 1577 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1)) 1578 return false; 1579 if (!llvm::is_contained(Shuffles, SV)) 1580 Shuffles.push_back(SV); 1581 } 1582 return true; 1583 }; 1584 if (!collectShuffles(Op0) || !collectShuffles(Op1)) 1585 return false; 1586 // From a reduction, we need to be processing a single shuffle, otherwise the 1587 // other uses will not be lane-invariant. 1588 if (FromReduction && Shuffles.size() > 1) 1589 return false; 1590 1591 // Add any shuffle uses for the shuffles we have found, to include them in our 1592 // cost calculations. 1593 if (!FromReduction) { 1594 for (ShuffleVectorInst *SV : Shuffles) { 1595 for (auto *U : SV->users()) { 1596 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U); 1597 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT) 1598 Shuffles.push_back(SSV); 1599 } 1600 } 1601 } 1602 1603 // For each of the output shuffles, we try to sort all the first vector 1604 // elements to the beginning, followed by the second array elements at the 1605 // end. If the binops are legalized to smaller vectors, this may reduce total 1606 // number of binops. We compute the ReconstructMask mask needed to convert 1607 // back to the original lane order. 1608 SmallVector<std::pair<int, int>> V1, V2; 1609 SmallVector<SmallVector<int>> OrigReconstructMasks; 1610 int MaxV1Elt = 0, MaxV2Elt = 0; 1611 unsigned NumElts = VT->getNumElements(); 1612 for (ShuffleVectorInst *SVN : Shuffles) { 1613 SmallVector<int> Mask; 1614 SVN->getShuffleMask(Mask); 1615 1616 // Check the operands are the same as the original, or reversed (in which 1617 // case we need to commute the mask). 1618 Value *SVOp0 = SVN->getOperand(0); 1619 Value *SVOp1 = SVN->getOperand(1); 1620 if (isa<UndefValue>(SVOp1)) { 1621 auto *SSV = cast<ShuffleVectorInst>(SVOp0); 1622 SVOp0 = SSV->getOperand(0); 1623 SVOp1 = SSV->getOperand(1); 1624 for (unsigned I = 0, E = Mask.size(); I != E; I++) { 1625 if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size())) 1626 return false; 1627 Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]); 1628 } 1629 } 1630 if (SVOp0 == Op1 && SVOp1 == Op0) { 1631 std::swap(SVOp0, SVOp1); 1632 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); 1633 } 1634 if (SVOp0 != Op0 || SVOp1 != Op1) 1635 return false; 1636 1637 // Calculate the reconstruction mask for this shuffle, as the mask needed to 1638 // take the packed values from Op0/Op1 and reconstructing to the original 1639 // order. 1640 SmallVector<int> ReconstructMask; 1641 for (unsigned I = 0; I < Mask.size(); I++) { 1642 if (Mask[I] < 0) { 1643 ReconstructMask.push_back(-1); 1644 } else if (Mask[I] < static_cast<int>(NumElts)) { 1645 MaxV1Elt = std::max(MaxV1Elt, Mask[I]); 1646 auto It = find_if(V1, [&](const std::pair<int, int> &A) { 1647 return Mask[I] == A.first; 1648 }); 1649 if (It != V1.end()) 1650 ReconstructMask.push_back(It - V1.begin()); 1651 else { 1652 ReconstructMask.push_back(V1.size()); 1653 V1.emplace_back(Mask[I], V1.size()); 1654 } 1655 } else { 1656 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts); 1657 auto It = find_if(V2, [&](const std::pair<int, int> &A) { 1658 return Mask[I] - static_cast<int>(NumElts) == A.first; 1659 }); 1660 if (It != V2.end()) 1661 ReconstructMask.push_back(NumElts + It - V2.begin()); 1662 else { 1663 ReconstructMask.push_back(NumElts + V2.size()); 1664 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size()); 1665 } 1666 } 1667 } 1668 1669 // For reductions, we know that the lane ordering out doesn't alter the 1670 // result. In-order can help simplify the shuffle away. 1671 if (FromReduction) 1672 sort(ReconstructMask); 1673 OrigReconstructMasks.push_back(std::move(ReconstructMask)); 1674 } 1675 1676 // If the Maximum element used from V1 and V2 are not larger than the new 1677 // vectors, the vectors are already packes and performing the optimization 1678 // again will likely not help any further. This also prevents us from getting 1679 // stuck in a cycle in case the costs do not also rule it out. 1680 if (V1.empty() || V2.empty() || 1681 (MaxV1Elt == static_cast<int>(V1.size()) - 1 && 1682 MaxV2Elt == static_cast<int>(V2.size()) - 1)) 1683 return false; 1684 1685 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a 1686 // shuffle of another shuffle, or not a shuffle (that is treated like a 1687 // identity shuffle). 1688 auto GetBaseMaskValue = [&](Instruction *I, int M) { 1689 auto *SV = dyn_cast<ShuffleVectorInst>(I); 1690 if (!SV) 1691 return M; 1692 if (isa<UndefValue>(SV->getOperand(1))) 1693 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) 1694 if (InputShuffles.contains(SSV)) 1695 return SSV->getMaskValue(SV->getMaskValue(M)); 1696 return SV->getMaskValue(M); 1697 }; 1698 1699 // Attempt to sort the inputs my ascending mask values to make simpler input 1700 // shuffles and push complex shuffles down to the uses. We sort on the first 1701 // of the two input shuffle orders, to try and get at least one input into a 1702 // nice order. 1703 auto SortBase = [&](Instruction *A, std::pair<int, int> X, 1704 std::pair<int, int> Y) { 1705 int MXA = GetBaseMaskValue(A, X.first); 1706 int MYA = GetBaseMaskValue(A, Y.first); 1707 return MXA < MYA; 1708 }; 1709 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) { 1710 return SortBase(SVI0A, A, B); 1711 }); 1712 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) { 1713 return SortBase(SVI1A, A, B); 1714 }); 1715 // Calculate our ReconstructMasks from the OrigReconstructMasks and the 1716 // modified order of the input shuffles. 1717 SmallVector<SmallVector<int>> ReconstructMasks; 1718 for (const auto &Mask : OrigReconstructMasks) { 1719 SmallVector<int> ReconstructMask; 1720 for (int M : Mask) { 1721 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) { 1722 auto It = find_if(V, [M](auto A) { return A.second == M; }); 1723 assert(It != V.end() && "Expected all entries in Mask"); 1724 return std::distance(V.begin(), It); 1725 }; 1726 if (M < 0) 1727 ReconstructMask.push_back(-1); 1728 else if (M < static_cast<int>(NumElts)) { 1729 ReconstructMask.push_back(FindIndex(V1, M)); 1730 } else { 1731 ReconstructMask.push_back(NumElts + FindIndex(V2, M)); 1732 } 1733 } 1734 ReconstructMasks.push_back(std::move(ReconstructMask)); 1735 } 1736 1737 // Calculate the masks needed for the new input shuffles, which get padded 1738 // with undef 1739 SmallVector<int> V1A, V1B, V2A, V2B; 1740 for (unsigned I = 0; I < V1.size(); I++) { 1741 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first)); 1742 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first)); 1743 } 1744 for (unsigned I = 0; I < V2.size(); I++) { 1745 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first)); 1746 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first)); 1747 } 1748 while (V1A.size() < NumElts) { 1749 V1A.push_back(PoisonMaskElem); 1750 V1B.push_back(PoisonMaskElem); 1751 } 1752 while (V2A.size() < NumElts) { 1753 V2A.push_back(PoisonMaskElem); 1754 V2B.push_back(PoisonMaskElem); 1755 } 1756 1757 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { 1758 auto *SV = dyn_cast<ShuffleVectorInst>(I); 1759 if (!SV) 1760 return C; 1761 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1)) 1762 ? TTI::SK_PermuteSingleSrc 1763 : TTI::SK_PermuteTwoSrc, 1764 VT, SV->getShuffleMask()); 1765 }; 1766 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { 1767 return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask); 1768 }; 1769 1770 // Get the costs of the shuffles + binops before and after with the new 1771 // shuffle masks. 1772 InstructionCost CostBefore = 1773 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) + 1774 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); 1775 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), 1776 InstructionCost(0), AddShuffleCost); 1777 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), 1778 InstructionCost(0), AddShuffleCost); 1779 1780 // The new binops will be unused for lanes past the used shuffle lengths. 1781 // These types attempt to get the correct cost for that from the target. 1782 FixedVectorType *Op0SmallVT = 1783 FixedVectorType::get(VT->getScalarType(), V1.size()); 1784 FixedVectorType *Op1SmallVT = 1785 FixedVectorType::get(VT->getScalarType(), V2.size()); 1786 InstructionCost CostAfter = 1787 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) + 1788 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT); 1789 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(), 1790 InstructionCost(0), AddShuffleMaskCost); 1791 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B}); 1792 CostAfter += 1793 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), 1794 InstructionCost(0), AddShuffleMaskCost); 1795 1796 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n"); 1797 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore 1798 << " vs CostAfter: " << CostAfter << "\n"); 1799 if (CostBefore <= CostAfter) 1800 return false; 1801 1802 // The cost model has passed, create the new instructions. 1803 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * { 1804 auto *SV = dyn_cast<ShuffleVectorInst>(I); 1805 if (!SV) 1806 return I; 1807 if (isa<UndefValue>(SV->getOperand(1))) 1808 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) 1809 if (InputShuffles.contains(SSV)) 1810 return SSV->getOperand(Op); 1811 return SV->getOperand(Op); 1812 }; 1813 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef()); 1814 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0), 1815 GetShuffleOperand(SVI0A, 1), V1A); 1816 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef()); 1817 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0), 1818 GetShuffleOperand(SVI0B, 1), V1B); 1819 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef()); 1820 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0), 1821 GetShuffleOperand(SVI1A, 1), V2A); 1822 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef()); 1823 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0), 1824 GetShuffleOperand(SVI1B, 1), V2B); 1825 Builder.SetInsertPoint(Op0); 1826 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(), 1827 NSV0A, NSV0B); 1828 if (auto *I = dyn_cast<Instruction>(NOp0)) 1829 I->copyIRFlags(Op0, true); 1830 Builder.SetInsertPoint(Op1); 1831 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(), 1832 NSV1A, NSV1B); 1833 if (auto *I = dyn_cast<Instruction>(NOp1)) 1834 I->copyIRFlags(Op1, true); 1835 1836 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) { 1837 Builder.SetInsertPoint(Shuffles[S]); 1838 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]); 1839 replaceValue(*Shuffles[S], *NSV); 1840 } 1841 1842 Worklist.pushValue(NSV0A); 1843 Worklist.pushValue(NSV0B); 1844 Worklist.pushValue(NSV1A); 1845 Worklist.pushValue(NSV1B); 1846 for (auto *S : Shuffles) 1847 Worklist.add(S); 1848 return true; 1849} 1850 1851/// This is the entry point for all transforms. Pass manager differences are 1852/// handled in the callers of this function. 1853bool VectorCombine::run() { 1854 if (DisableVectorCombine) 1855 return false; 1856 1857 // Don't attempt vectorization if the target does not support vectors. 1858 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true))) 1859 return false; 1860 1861 bool MadeChange = false; 1862 auto FoldInst = [this, &MadeChange](Instruction &I) { 1863 Builder.SetInsertPoint(&I); 1864 bool IsFixedVectorType = isa<FixedVectorType>(I.getType()); 1865 auto Opcode = I.getOpcode(); 1866 1867 // These folds should be beneficial regardless of when this pass is run 1868 // in the optimization pipeline. 1869 // The type checking is for run-time efficiency. We can avoid wasting time 1870 // dispatching to folding functions if there's no chance of matching. 1871 if (IsFixedVectorType) { 1872 switch (Opcode) { 1873 case Instruction::InsertElement: 1874 MadeChange |= vectorizeLoadInsert(I); 1875 break; 1876 case Instruction::ShuffleVector: 1877 MadeChange |= widenSubvectorLoad(I); 1878 break; 1879 default: 1880 break; 1881 } 1882 } 1883 1884 // This transform works with scalable and fixed vectors 1885 // TODO: Identify and allow other scalable transforms 1886 if (isa<VectorType>(I.getType())) { 1887 MadeChange |= scalarizeBinopOrCmp(I); 1888 MadeChange |= scalarizeLoadExtract(I); 1889 MadeChange |= scalarizeVPIntrinsic(I); 1890 } 1891 1892 if (Opcode == Instruction::Store) 1893 MadeChange |= foldSingleElementStore(I); 1894 1895 // If this is an early pipeline invocation of this pass, we are done. 1896 if (TryEarlyFoldsOnly) 1897 return; 1898 1899 // Otherwise, try folds that improve codegen but may interfere with 1900 // early IR canonicalizations. 1901 // The type checking is for run-time efficiency. We can avoid wasting time 1902 // dispatching to folding functions if there's no chance of matching. 1903 if (IsFixedVectorType) { 1904 switch (Opcode) { 1905 case Instruction::InsertElement: 1906 MadeChange |= foldInsExtFNeg(I); 1907 break; 1908 case Instruction::ShuffleVector: 1909 MadeChange |= foldShuffleOfBinops(I); 1910 MadeChange |= foldSelectShuffle(I); 1911 break; 1912 case Instruction::BitCast: 1913 MadeChange |= foldBitcastShuffle(I); 1914 break; 1915 } 1916 } else { 1917 switch (Opcode) { 1918 case Instruction::Call: 1919 MadeChange |= foldShuffleFromReductions(I); 1920 break; 1921 case Instruction::ICmp: 1922 case Instruction::FCmp: 1923 MadeChange |= foldExtractExtract(I); 1924 break; 1925 default: 1926 if (Instruction::isBinaryOp(Opcode)) { 1927 MadeChange |= foldExtractExtract(I); 1928 MadeChange |= foldExtractedCmps(I); 1929 } 1930 break; 1931 } 1932 } 1933 }; 1934 1935 for (BasicBlock &BB : F) { 1936 // Ignore unreachable basic blocks. 1937 if (!DT.isReachableFromEntry(&BB)) 1938 continue; 1939 // Use early increment range so that we can erase instructions in loop. 1940 for (Instruction &I : make_early_inc_range(BB)) { 1941 if (I.isDebugOrPseudoInst()) 1942 continue; 1943 FoldInst(I); 1944 } 1945 } 1946 1947 while (!Worklist.isEmpty()) { 1948 Instruction *I = Worklist.removeOne(); 1949 if (!I) 1950 continue; 1951 1952 if (isInstructionTriviallyDead(I)) { 1953 eraseInstruction(*I); 1954 continue; 1955 } 1956 1957 FoldInst(*I); 1958 } 1959 1960 return MadeChange; 1961} 1962 1963PreservedAnalyses VectorCombinePass::run(Function &F, 1964 FunctionAnalysisManager &FAM) { 1965 auto &AC = FAM.getResult<AssumptionAnalysis>(F); 1966 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F); 1967 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); 1968 AAResults &AA = FAM.getResult<AAManager>(F); 1969 VectorCombine Combiner(F, TTI, DT, AA, AC, TryEarlyFoldsOnly); 1970 if (!Combiner.run()) 1971 return PreservedAnalyses::all(); 1972 PreservedAnalyses PA; 1973 PA.preserveSet<CFGAnalyses>(); 1974 return PA; 1975} 1976