1236769Sobrien//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===// 2236769Sobrien// 3236769Sobrien// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4236769Sobrien// See https://llvm.org/LICENSE.txt for license information. 5236769Sobrien// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6236769Sobrien// 7236769Sobrien//===----------------------------------------------------------------------===// 8236769Sobrien/// 9236769Sobrien/// \file 10236769Sobrien/// This file implements a set of utility VPlan to VPlan transformations. 11236769Sobrien/// 12236769Sobrien//===----------------------------------------------------------------------===// 13236769Sobrien 14236769Sobrien#include "VPlanTransforms.h" 15236769Sobrien#include "VPRecipeBuilder.h" 16236769Sobrien#include "VPlanAnalysis.h" 17236769Sobrien#include "VPlanCFG.h" 18236769Sobrien#include "VPlanDominatorTree.h" 19236769Sobrien#include "llvm/ADT/PostOrderIterator.h" 20236769Sobrien#include "llvm/ADT/STLExtras.h" 21236769Sobrien#include "llvm/ADT/SetVector.h" 22236769Sobrien#include "llvm/Analysis/IVDescriptors.h" 23236769Sobrien#include "llvm/Analysis/VectorUtils.h" 24236769Sobrien#include "llvm/IR/Intrinsics.h" 25236769Sobrien#include "llvm/IR/PatternMatch.h" 26236769Sobrien 27236769Sobrienusing namespace llvm; 28236769Sobrien 29236769Sobrienusing namespace llvm::PatternMatch; 30236769Sobrien 31236769Sobrienvoid VPlanTransforms::VPInstructionsToVPRecipes( 32236769Sobrien VPlanPtr &Plan, 33236769Sobrien function_ref<const InductionDescriptor *(PHINode *)> 34236769Sobrien GetIntOrFpInductionDescriptor, 35236769Sobrien ScalarEvolution &SE, const TargetLibraryInfo &TLI) { 36236769Sobrien 37236769Sobrien ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( 38236769Sobrien Plan->getEntry()); 39236769Sobrien for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { 40236769Sobrien VPRecipeBase *Term = 
VPBB->getTerminator(); 41236769Sobrien auto EndIter = Term ? Term->getIterator() : VPBB->end(); 42236769Sobrien // Introduce each ingredient into VPlan. 43236769Sobrien for (VPRecipeBase &Ingredient : 44236769Sobrien make_early_inc_range(make_range(VPBB->begin(), EndIter))) { 45236769Sobrien 46236769Sobrien VPValue *VPV = Ingredient.getVPSingleValue(); 47236769Sobrien Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue()); 48236769Sobrien 49236769Sobrien VPRecipeBase *NewRecipe = nullptr; 50236769Sobrien if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) { 51236769Sobrien auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue()); 52236769Sobrien if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) { 53236769Sobrien VPValue *Start = Plan->getVPValueOrAddLiveIn(II->getStartValue()); 54236769Sobrien VPValue *Step = 55236769Sobrien vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); 56236769Sobrien NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II); 57236769Sobrien } else { 58236769Sobrien Plan->addVPValue(Phi, VPPhi); 59236769Sobrien continue; 60236769Sobrien } 61236769Sobrien } else { 62236769Sobrien assert(isa<VPInstruction>(&Ingredient) && 63236769Sobrien "only VPInstructions expected here"); 64236769Sobrien assert(!isa<PHINode>(Inst) && "phis should be handled above"); 65236769Sobrien // Create VPWidenMemoryInstructionRecipe for loads and stores. 
66236769Sobrien if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) { 67236769Sobrien NewRecipe = new VPWidenMemoryInstructionRecipe( 68236769Sobrien *Load, Ingredient.getOperand(0), nullptr /*Mask*/, 69236769Sobrien false /*Consecutive*/, false /*Reverse*/); 70236769Sobrien } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) { 71236769Sobrien NewRecipe = new VPWidenMemoryInstructionRecipe( 72236769Sobrien *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), 73236769Sobrien nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/); 74236769Sobrien } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { 75236769Sobrien NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); 76236769Sobrien } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) { 77236769Sobrien NewRecipe = new VPWidenCallRecipe( 78236769Sobrien *CI, drop_end(Ingredient.operands()), 79236769Sobrien getVectorIntrinsicIDForCall(CI, &TLI), CI->getDebugLoc()); 80236769Sobrien } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) { 81236769Sobrien NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands()); 82236769Sobrien } else if (auto *CI = dyn_cast<CastInst>(Inst)) { 83236769Sobrien NewRecipe = new VPWidenCastRecipe( 84236769Sobrien CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI); 85236769Sobrien } else { 86236769Sobrien NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands()); 87236769Sobrien } 88236769Sobrien } 89236769Sobrien 90236769Sobrien NewRecipe->insertBefore(&Ingredient); 91236769Sobrien if (NewRecipe->getNumDefinedValues() == 1) 92236769Sobrien VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue()); 93236769Sobrien else 94236769Sobrien assert(NewRecipe->getNumDefinedValues() == 0 && 95236769Sobrien "Only recpies with zero or one defined values expected"); 96236769Sobrien Ingredient.eraseFromParent(); 97236769Sobrien } 98236769Sobrien } 99236769Sobrien} 100236769Sobrien 101236769Sobrienstatic bool sinkScalarOperands(VPlan &Plan) { 
102236769Sobrien auto Iter = vp_depth_first_deep(Plan.getEntry()); 103236769Sobrien bool Changed = false; 104236769Sobrien // First, collect the operands of all recipes in replicate blocks as seeds for 105236769Sobrien // sinking. 106 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; 107 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { 108 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); 109 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) 110 continue; 111 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]); 112 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) 113 continue; 114 for (auto &Recipe : *VPBB) { 115 for (VPValue *Op : Recipe.operands()) 116 if (auto *Def = 117 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) 118 WorkList.insert(std::make_pair(VPBB, Def)); 119 } 120 } 121 122 bool ScalarVFOnly = Plan.hasScalarVFOnly(); 123 // Try to sink each replicate or scalar IV steps recipe in the worklist. 124 for (unsigned I = 0; I != WorkList.size(); ++I) { 125 VPBasicBlock *SinkTo; 126 VPSingleDefRecipe *SinkCandidate; 127 std::tie(SinkTo, SinkCandidate) = WorkList[I]; 128 if (SinkCandidate->getParent() == SinkTo || 129 SinkCandidate->mayHaveSideEffects() || 130 SinkCandidate->mayReadOrWriteMemory()) 131 continue; 132 if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) { 133 if (!ScalarVFOnly && RepR->isUniform()) 134 continue; 135 } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate)) 136 continue; 137 138 bool NeedsDuplicating = false; 139 // All recipe users of the sink candidate must be in the same block SinkTo 140 // or all users outside of SinkTo must be uniform-after-vectorization ( 141 // i.e., only first lane is used) . In the latter case, we need to duplicate 142 // SinkCandidate. 
143 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, 144 SinkCandidate](VPUser *U) { 145 auto *UI = dyn_cast<VPRecipeBase>(U); 146 if (!UI) 147 return false; 148 if (UI->getParent() == SinkTo) 149 return true; 150 NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate); 151 // We only know how to duplicate VPRecipeRecipes for now. 152 return NeedsDuplicating && isa<VPReplicateRecipe>(SinkCandidate); 153 }; 154 if (!all_of(SinkCandidate->users(), CanSinkWithUser)) 155 continue; 156 157 if (NeedsDuplicating) { 158 if (ScalarVFOnly) 159 continue; 160 Instruction *I = cast<Instruction>( 161 cast<VPReplicateRecipe>(SinkCandidate)->getUnderlyingValue()); 162 auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true); 163 // TODO: add ".cloned" suffix to name of Clone's VPValue. 164 165 Clone->insertBefore(SinkCandidate); 166 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) { 167 return cast<VPRecipeBase>(&U)->getParent() != SinkTo; 168 }); 169 } 170 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); 171 for (VPValue *Op : SinkCandidate->operands()) 172 if (auto *Def = 173 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) 174 WorkList.insert(std::make_pair(SinkTo, Def)); 175 Changed = true; 176 } 177 return Changed; 178} 179 180/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return 181/// the mask. 182VPValue *getPredicatedMask(VPRegionBlock *R) { 183 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry()); 184 if (!EntryBB || EntryBB->size() != 1 || 185 !isa<VPBranchOnMaskRecipe>(EntryBB->begin())) 186 return nullptr; 187 188 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0); 189} 190 191/// If \p R is a triangle region, return the 'then' block of the triangle. 
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  // A triangle has exactly one successor between the two blocks: the 'then'
  // block branches to the other one.
  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions in their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SetVector<VPRegionBlock *> DeletedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks, to process up front. This is to
  // avoid iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both region must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    // Region1 may already have been merged away in an earlier iteration.
    if (DeletedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        auto *UI = dyn_cast<VPRecipeBase>(&U);
        return UI && UI->getParent() == Then2;
      });

      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Finally, remove the first region: re-route its predecessors to the
    // middle block, which now directly precedes Region2.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    DeletedRegions.insert(Region1);
  }

  for (VPRegionBlock *ToDelete : DeletedRegions)
    delete ToDelete;
  return !DeletedRegions.empty();
}

// Build a triangular if-then replicate region around \p PredRecipe,
// replacing it with an unmasked replicate recipe guarded by a
// VPBranchOnMaskRecipe, plus a VPPredInstPHIRecipe in the continue block if
// the recipe has users.
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region. The mask is the last operand and is
  // dropped here.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(),
      make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
      PredRecipe->isUniform());
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask);
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

// Wrap each predicated VPReplicateRecipe in its own replicate region by
// splitting its block and inserting the region between the two halves.
static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPBlockBase *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::disconnectBlocks(CurrentBlock, SplitBlock);
    VPBlockUtils::connectBlocks(CurrentBlock, Region);
    VPBlockUtils::connectBlocks(Region, SplitBlock);
  }
}

void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  // Iterate the cleanups to a fixed point, as each transform may enable the
  // others.
  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(Plan);
  }
}

// Fold basic blocks into their single predecessor when that predecessor has
// no other successors. Returns true if any block was merged.
bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (PredVPBB && PredVPBB->getNumSuccessors() == 1)
      WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    // If VPBB was a region's exiting block, the predecessor takes that role.
    auto *ParentRegion = cast_or_null<VPRegionBlock>(VPBB->getParent());
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    delete VPBB;
  }
  return !WorkList.empty();
}

void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    auto &Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

// Remove a VPWidenCanonicalIVRecipe if an existing widened induction already
// provides the same values.
void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical() ||
        WidenOriginalIV->getScalarType() != WidenNewIV->getScalarType())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (any_of(WidenOriginalIV->users(),
               [WidenOriginalIV](VPUser *U) {
                 return !U->usesScalars(WidenOriginalIV);
               }) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());

  for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      // A user keeps R alive:
      if (any_of(R.definedValues(),
                 [](VPValue *V) { return V->getNumUsers(); }))
        continue;

      // Having side effects keeps R alive, but do remove conditional assume
      // instructions as their conditions may be flattened.
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      bool IsConditionalAssume =
          RepR && RepR->isPredicated() &&
          match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
      if (R.mayHaveSideEffects() && !IsConditionalAssume)
        continue;

      R.eraseFromParent();
    }
  }
}

// Insert VPScalarIVStepsRecipe (and, if needed, a VPDerivedIVRecipe) at the
// start of the header block, producing per-lane scalar steps for the
// induction described by \p ID. Returns the steps value.
static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
                                    ScalarEvolution &SE, Instruction *TruncI,
                                    Type *IVTy, VPValue *StartV,
                                    VPValue *Step) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  auto IP = HeaderVPBB->getFirstNonPhi();
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  Type *TruncTy = TruncI ? TruncI->getType() : IVTy;
  VPValue *BaseIV = CanonicalIV;
  // Derive the IV from the canonical IV unless they already coincide.
  if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, TruncTy)) {
    BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step,
                                   TruncI ? TruncI->getType() : nullptr);
    HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
  }

  VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
  HeaderVPBB->insert(Steps, IP);
  return Steps;
}

// For widened inductions with scalar users, add scalar IV steps and redirect
// those users to them, so the vector IV need not be kept for scalar uses.
void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
  SmallVector<VPRecipeBase *> ToRemove;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!WideIV)
      continue;
    // With only vector VFs, skip inductions that have no scalar users.
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPValue *Steps = createScalarIVSteps(
        Plan, ID, SE, WideIV->getTruncInst(), WideIV->getPHINode()->getType(),
        WideIV->getStartValue(), WideIV->getStepValue());

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs)
      WideIV->replaceAllUsesWith(Steps);
    else
      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
        return U.usesScalars(WideIV);
      });
  }
}

// Deduplicate VPExpandSCEVRecipes in the preheader that expand the same SCEV.
void VPlanTransforms::removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
    if (I.second)
      continue;
    ExpR->replaceAllUsesWith(I.first->second);
    ExpR->eraseFromParent();
  }
}

// Returns true if \p Term is a BranchOnCond whose condition is
// Not(ActiveLaneMask), i.e. a terminator that can be simplified once the trip
// count is known to fit in VF * UF.
static bool canSimplifyBranchOnCond(VPInstruction *Term) {
  VPInstruction *Not = dyn_cast<VPInstruction>(Term->getOperand(0));
  if (!Not || Not->getOpcode() != VPInstruction::Not)
    return false;

  VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0));
  return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask;
}

void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                         unsigned BestUF,
                                         PredicatedScalarEvolution &PSE) {
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
  VPBasicBlock *ExitingVPBB =
      Plan.getVectorLoopRegion()->getExitingBasicBlock();
  auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
  // Try to simplify the branch condition if TC <= VF * UF when preparing to
  // execute the plan for the main vector loop. We only do this if the
  // terminator is:
  //  1. BranchOnCount, or
  //  2. BranchOnCond where the input is Not(ActiveLaneMask).
  if (!Term || (Term->getOpcode() != VPInstruction::BranchOnCount &&
                (Term->getOpcode() != VPInstruction::BranchOnCond ||
                 !canSimplifyBranchOnCond(Term))))
    return;

  Type *IdxTy =
      Plan.getCanonicalIV()->getStartValue()->getLiveInIRValue()->getType();
  const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE);
  ScalarEvolution &SE = *PSE.getSE();
  const SCEV *C =
      SE.getConstant(TripCount->getType(), BestVF.getKnownMinValue() * BestUF);
  // Only simplify when the loop provably executes a single vector iteration.
  if (TripCount->isZero() ||
      !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
    return;

  LLVMContext &Ctx = SE.getContext();
  // Replace the terminator with an unconditional branch-on-true.
  auto *BOC = new VPInstruction(
      VPInstruction::BranchOnCond,
      {Plan.getVPValueOrAddLiveIn(ConstantInt::getTrue(Ctx))});
  Term->eraseFromParent();
  ExitingVPBB->appendRecipe(BOC);
  Plan.setVF(BestVF);
  Plan.setUF(BestUF);
  // TODO: Further simplifications are possible
  //      1. Replace inductions with constants.
  //      2. Replace vector loop region with VPBasicBlock.
}

#ifndef NDEBUG
// Debug-only helper: return the enclosing replicate region of \p R, if any.
static VPRegionBlock *GetReplicateRegion(VPRecipeBase *R) {
  auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
  if (Region && Region->isReplicator()) {
    assert(Region->getNumSuccessors() == 1 &&
           Region->getNumPredecessors() == 1 && "Expected SESE region!");
    assert(R->getParent()->size() == 1 &&
           "A recipe in an original replicator region must be the only "
           "recipe in its block");
    return Region;
  }
  return nullptr;
}
#endif

// Returns true if recipe \p A properly dominates recipe \p B, using recipe
// order within a block and the VPlan dominator tree across blocks.
static bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B,
                              VPDominatorTree &VPDT) {
  if (A == B)
    return false;

  auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) {
    for (auto &R : *A->getParent()) {
      if (&R == A)
        return true;
      if (&R == B)
        return false;
    }
    llvm_unreachable("recipe not found");
  };
  const VPBlockBase *ParentA = A->getParent();
  const VPBlockBase *ParentB = B->getParent();
  if (ParentA == ParentB)
    return LocalComesBefore(A, B);

  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(A)) &&
         "No replicate regions expected at this point");
  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(B)) &&
         "No replicate regions expected at this point");
  return VPDT.properlyDominates(ParentA, ParentB);
}

/// Sink users of \p FOR after the recipe defining the previous value \p
/// Previous of the recurrence. \returns true if all users of \p FOR could be
/// re-arranged as needed or false if it is not possible.
static bool
sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
                                 VPRecipeBase *Previous,
                                 VPDominatorTree &VPDT) {
  // Collect recipes that need sinking.
  SmallVector<VPRecipeBase *> WorkList;
  SmallPtrSet<VPRecipeBase *, 8> Seen;
  Seen.insert(Previous);
  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi. In
    // that case, FOR is not a fixed order recurrence.
    if (SinkCandidate == Previous)
      return false;

    // Nothing to do if the candidate is a header phi, was already visited, or
    // is already placed after Previous.
    if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
        !Seen.insert(SinkCandidate).second ||
        properlyDominates(Previous, SinkCandidate, VPDT))
      return true;

    if (SinkCandidate->mayHaveSideEffects())
      return false;

    WorkList.push_back(SinkCandidate);
    return true;
  };

  // Recursively sink users of FOR after Previous.
  WorkList.push_back(FOR);
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPRecipeBase *Current = WorkList[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");

    for (VPUser *User : Current->getVPSingleValue()->users()) {
      if (auto *R = dyn_cast<VPRecipeBase>(User))
        if (!TryToPushSinkCandidate(R))
          return false;
    }
  }

  // Keep recipes to sink ordered by dominance so earlier instructions are
  // processed first.
  sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return properlyDominates(A, B, VPDT);
  });

  for (VPRecipeBase *SinkCandidate : WorkList) {
    if (SinkCandidate == FOR)
      continue;

    SinkCandidate->moveAfter(Previous);
    Previous = SinkCandidate;
  }
  return true;
}

bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
                                                 VPBuilder &Builder) {
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);

  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      RecurrencePhis.push_back(FOR);

  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
    // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
    // to terminate.
    while (auto *PrevPhi =
               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
      assert(PrevPhi->getParent() == FOR->getParent());
      assert(SeenPhis.insert(PrevPhi).second);
      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
    }

    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT))
      return false;

    // Introduce a recipe to combine the incoming and previous values of a
    // fixed-order recurrence.
    VPBasicBlock *InsertBlock = Previous->getParent();
    if (isa<VPHeaderPHIRecipe>(Previous))
      Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
    else
      Builder.setInsertPoint(InsertBlock, std::next(Previous->getIterator()));

    auto *RecurSplice = cast<VPInstruction>(
        Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
                             {FOR, FOR->getBackedgeValue()}));

    FOR->replaceAllUsesWith(RecurSplice);
    // Set the first operand of RecurSplice to FOR again, after replacing
    // all users.
    RecurSplice->setOperand(0, FOR);
  }
  return true;
}

void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR)
      continue;
    // Only integer add/mul reductions carry wrap flags worth clearing.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    RecurKind RK = RdxDesc.getRecurrenceKind();
    if (RK != RecurKind::Add && RK != RecurKind::Mul)
      continue;

    // Walk the def-use chain starting at the reduction phi and drop poison
    // generating flags (nuw/nsw) from every recipe reached.
    SmallSetVector<VPValue *, 8> Worklist;
    Worklist.insert(PhiR);

    for (unsigned I = 0; I != Worklist.size(); ++I) {
      VPValue *Cur = Worklist[I];
      if (auto *RecWithFlags =
              dyn_cast<VPRecipeWithIRFlags>(Cur->getDefiningRecipe())) {
        RecWithFlags->dropPoisonGeneratingFlags();
      }

      for (VPUser *U : Cur->users()) {
        auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
        if (!UserRecipe)
          continue;
        for (VPValue *V : UserRecipe->definedValues())
          Worklist.insert(V);
      }
    }
  }
}

/// Returns true if \p V is a live-in constant one.
static bool isConstantOne(VPValue *V) {
  if (!V->isLiveIn())
    return false;
  auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue());
  return C && C->isOne();
}

/// Returns the llvm::Instruction opcode for \p R.
static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
  // Widened and replicated recipes carry an underlying IR instruction; casts
  // and VPInstructions store their opcode directly. Returns 0 when \p R has
  // no corresponding IR opcode.
  if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R))
    return WidenR->getUnderlyingInstr()->getOpcode();
  if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R))
    return WidenC->getOpcode();
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
    return RepR->getUnderlyingInstr()->getOpcode();
  if (auto *VPI = dyn_cast<VPInstruction>(&R))
    return VPI->getOpcode();
  return 0;
}

/// Try to simplify recipe \p R. Currently folds multiplications by one and
/// redundant (zext/sext (trunc ...)) chains.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
  switch (getOpcodeForRecipe(R)) {
  case Instruction::Mul: {
    // Fold x * 1 and 1 * x to x.
    VPValue *A = R.getOperand(0);
    VPValue *B = R.getOperand(1);
    if (isConstantOne(A))
      return R.getVPSingleValue()->replaceAllUsesWith(B);
    if (isConstantOne(B))
      return R.getVPSingleValue()->replaceAllUsesWith(A);
    break;
  }
  case Instruction::Trunc: {
    // Simplify (trunc (zext/sext A)): depending on the relative widths of A
    // and the trunc result this is either A itself, a shorter extend, or a
    // shorter truncate.
    VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe();
    if (!Ext)
      break;
    unsigned ExtOpcode = getOpcodeForRecipe(*Ext);
    if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
      break;
    VPValue *A = Ext->getOperand(0);
    VPValue *Trunc = R.getVPSingleValue();
    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Trunc->replaceAllUsesWith(A);
    } else {
      // Don't replace a scalarizing recipe with a widened cast.
      if (isa<VPReplicateRecipe>(&R))
        break;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
        // A is narrower than the trunc result: extend A directly to TruncTy,
        // preserving the original extension kind (zext vs sext).
        auto *VPC =
            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        // A is wider than the trunc result: truncate A directly to TruncTy.
        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      }
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(TypeInfo.getContext());
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = dyn_cast<VPRecipeBase>(U);
      if (!R)
        continue;
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
    }
#endif
    break;
  }
  default:
    break;
  }
}

/// Try to simplify the recipes in \p Plan.
static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  VPTypeAnalysis TypeInfo(Ctx);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      simplifyRecipe(R, TypeInfo);
    }
  }
}

/// Narrow each recipe whose underlying instruction has an entry in \p MinBWs
/// to the recorded bit width: operands are truncated to the new type and the
/// result is zero-extended back to its original type, so all users keep
/// seeing correctly-typed values. Redundant casts created here are expected
/// to be cleaned up by later recipe simplification.
void VPlanTransforms::truncateToMinimalBitwidths(
    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
    LLVMContext &Ctx) {
#ifndef NDEBUG
  // Count the processed recipes and cross check the count later with MinBWs
  // size, to make sure all entries in MinBWs have been handled.
  unsigned NumProcessedRecipes = 0;
#endif
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make other
  // uses have different types for their operands, making them invalidly
  // typed.
  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
  VPTypeAnalysis TypeInfo(Ctx);
  // Truncates of live-ins are appended to the entry (preheader) block.
  VPBasicBlock *PH = Plan.getEntry();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
               VPWidenSelectRecipe, VPWidenMemoryInstructionRecipe>(&R))
        continue;
      // Stores produce no result and cannot be narrowed.
      if (isa<VPWidenMemoryInstructionRecipe>(&R) &&
          cast<VPWidenMemoryInstructionRecipe>(&R)->isStore())
        continue;

      VPValue *ResultVPV = R.getVPSingleValue();
      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
      unsigned NewResSizeInBits = MinBWs.lookup(UI);
      if (!NewResSizeInBits)
        continue;

#ifndef NDEBUG
      NumProcessedRecipes++;
#endif
      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here, after incrementing NumProcessedRecipes. Also
      // skip casts which do not need to be handled explicitly here, as
      // redundant casts will be removed during recipe simplification.
      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
#ifndef NDEBUG
        // If any of the operands is a live-in and not used by VPWidenRecipe or
        // VPWidenSelectRecipe, but in MinBWs, make sure it is counted as
        // processed as well. When MinBWs is currently constructed, there is no
        // information about whether recipes are widened or replicated and in
        // case they are replicated the operands are not truncated. Counting
        // them here ensures we do not miss any recipes in MinBWs.
        // TODO: Remove once the analysis is done on VPlan.
        for (VPValue *Op : R.operands()) {
          if (!Op->isLiveIn())
            continue;
          auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
          if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
              all_of(Op->users(), [](VPUser *U) {
                return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U);
              })) {
            // Add an entry to ProcessedTruncs to avoid counting the same
            // operand multiple times.
            ProcessedTruncs[Op] = nullptr;
            NumProcessedRecipes += 1;
          }
        }
#endif
        continue;
      }

      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
      assert(OldResTy->isIntegerTy() && "only integer types supported");
      if (OldResSizeInBits == NewResSizeInBits)
        continue;
      assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
      (void)OldResSizeInBits;

      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior. So, we can't unconditionally copy
      // arithmetic wrapping flags to VPW.
      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
        VPW->dropPoisonGeneratingFlags();

      // Extend result to original width. RAUW also rewrites the new Ext's own
      // operand, so point it back at the (to-be-narrowed) result afterwards.
      auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
      Ext->insertAfter(&R);
      ResultVPV->replaceAllUsesWith(Ext);
      Ext->setOperand(0, ResultVPV);

      if (isa<VPWidenMemoryInstructionRecipe>(&R)) {
        assert(!cast<VPWidenMemoryInstructionRecipe>(&R)->isStore() &&
               "stores cannot be narrowed");
        continue;
      }

      // Shrink operands by introducing truncates as needed. For selects, skip
      // the condition (operand 0); only the two select values are narrowed.
      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
        auto *Op = R.getOperand(Idx);
        unsigned OpSizeInBits =
            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
        if (OpSizeInBits == NewResSizeInBits)
          continue;
        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
        // Re-use an already-created truncate for this operand if possible.
        auto [ProcessedIter, IterIsEmpty] =
            ProcessedTruncs.insert({Op, nullptr});
        VPWidenCastRecipe *NewOp =
            IterIsEmpty
                ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
                : ProcessedIter->second;
        R.setOperand(Idx, NewOp);
        if (!IterIsEmpty)
          continue;
        ProcessedIter->second = NewOp;
        if (!Op->isLiveIn()) {
          NewOp->insertBefore(&R);
        } else {
          PH->appendRecipe(NewOp);
#ifndef NDEBUG
          auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
          bool IsContained = MinBWs.contains(OpInst);
          NumProcessedRecipes += IsContained;
#endif
        }
      }

    }
  }

  assert(MinBWs.size() == NumProcessedRecipes &&
         "some entries in MinBWs haven't been processed");
}

/// Run the VPlan-to-VPlan cleanup pipeline: redundancy elimination, recipe
/// simplification, dead-recipe removal, replicate-region optimization and
/// block merging.
void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
  removeRedundantCanonicalIVs(Plan);
  removeRedundantInductionCasts(Plan);

  optimizeInductions(Plan, SE);
  simplifyRecipes(Plan, SE.getContext());
  removeDeadRecipes(Plan);

  createAndOptimizeReplicateRegions(Plan);

  removeRedundantExpandSCEVRecipes(Plan);
  mergeBlocksIntoPredecessors(Plan);
}

// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
// the loop terminator with a branch-on-cond recipe with the negated
// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced, all other existing
// recipes/users remain unchanged, except for poison-generating flags being
// dropped from the canonical IV increment. Return the created
// VPActiveLaneMaskPHIRecipe.
1029// 1030// The function uses the following definitions: 1031// 1032// %TripCount = DataWithControlFlowWithoutRuntimeCheck ? 1033// calculate-trip-count-minus-VF (original TC) : original TC 1034// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ? 1035// CanonicalIVPhi : CanonicalIVIncrement 1036// %StartV is the canonical induction start value. 1037// 1038// The function adds the following recipes: 1039// 1040// vector.ph: 1041// %TripCount = calculate-trip-count-minus-VF (original TC) 1042// [if DataWithControlFlowWithoutRuntimeCheck] 1043// %EntryInc = canonical-iv-increment-for-part %StartV 1044// %EntryALM = active-lane-mask %EntryInc, %TripCount 1045// 1046// vector.body: 1047// ... 1048// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ] 1049// ... 1050// %InLoopInc = canonical-iv-increment-for-part %IncrementValue 1051// %ALM = active-lane-mask %InLoopInc, TripCount 1052// %Negated = Not %ALM 1053// branch-on-cond %Negated 1054// 1055static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( 1056 VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { 1057 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 1058 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 1059 auto *CanonicalIVPHI = Plan.getCanonicalIV(); 1060 VPValue *StartV = CanonicalIVPHI->getStartValue(); 1061 1062 auto *CanonicalIVIncrement = 1063 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue()); 1064 // TODO: Check if dropping the flags is needed if 1065 // !DataAndControlFlowWithoutRuntimeCheck. 1066 CanonicalIVIncrement->dropPoisonGeneratingFlags(); 1067 DebugLoc DL = CanonicalIVIncrement->getDebugLoc(); 1068 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since 1069 // we have to take unrolling into account. 
Each part needs to start at 1070 // Part * VF 1071 auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor()); 1072 VPBuilder Builder(VecPreheader); 1073 1074 // Create the ActiveLaneMask instruction using the correct start values. 1075 VPValue *TC = Plan.getTripCount(); 1076 1077 VPValue *TripCount, *IncrementValue; 1078 if (!DataAndControlFlowWithoutRuntimeCheck) { 1079 // When the loop is guarded by a runtime overflow check for the loop 1080 // induction variable increment by VF, we can increment the value before 1081 // the get.active.lane mask and use the unmodified tripcount. 1082 IncrementValue = CanonicalIVIncrement; 1083 TripCount = TC; 1084 } else { 1085 // When avoiding a runtime check, the active.lane.mask inside the loop 1086 // uses a modified trip count and the induction variable increment is 1087 // done after the active.lane.mask intrinsic is called. 1088 IncrementValue = CanonicalIVPHI; 1089 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF, 1090 {TC}, DL); 1091 } 1092 auto *EntryIncrement = Builder.createOverflowingOp( 1093 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL, 1094 "index.part.next"); 1095 1096 // Create the active lane mask instruction in the VPlan preheader. 1097 auto *EntryALM = 1098 Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC}, 1099 DL, "active.lane.mask.entry"); 1100 1101 // Now create the ActiveLaneMaskPhi recipe in the main loop using the 1102 // preheader ActiveLaneMask instruction. 1103 auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); 1104 LaneMaskPhi->insertAfter(CanonicalIVPHI); 1105 1106 // Create the active lane mask for the next iteration of the loop before the 1107 // original terminator. 
1108 VPRecipeBase *OriginalTerminator = EB->getTerminator(); 1109 Builder.setInsertPoint(OriginalTerminator); 1110 auto *InLoopIncrement = 1111 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, 1112 {IncrementValue}, {false, false}, DL); 1113 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, 1114 {InLoopIncrement, TripCount}, DL, 1115 "active.lane.mask.next"); 1116 LaneMaskPhi->addOperand(ALM); 1117 1118 // Replace the original terminator with BranchOnCond. We have to invert the 1119 // mask here because a true condition means jumping to the exit block. 1120 auto *NotMask = Builder.createNot(ALM, DL); 1121 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL); 1122 OriginalTerminator->eraseFromParent(); 1123 return LaneMaskPhi; 1124} 1125 1126void VPlanTransforms::addActiveLaneMask( 1127 VPlan &Plan, bool UseActiveLaneMaskForControlFlow, 1128 bool DataAndControlFlowWithoutRuntimeCheck) { 1129 assert((!DataAndControlFlowWithoutRuntimeCheck || 1130 UseActiveLaneMaskForControlFlow) && 1131 "DataAndControlFlowWithoutRuntimeCheck implies " 1132 "UseActiveLaneMaskForControlFlow"); 1133 1134 auto FoundWidenCanonicalIVUser = 1135 find_if(Plan.getCanonicalIV()->users(), 1136 [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); 1137 assert(FoundWidenCanonicalIVUser && 1138 "Must have widened canonical IV when tail folding!"); 1139 auto *WideCanonicalIV = 1140 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser); 1141 VPSingleDefRecipe *LaneMask; 1142 if (UseActiveLaneMaskForControlFlow) { 1143 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( 1144 Plan, DataAndControlFlowWithoutRuntimeCheck); 1145 } else { 1146 LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask, 1147 {WideCanonicalIV, Plan.getTripCount()}, 1148 nullptr, "active.lane.mask"); 1149 LaneMask->insertAfter(WideCanonicalIV); 1150 } 1151 1152 // Walk users of WideCanonicalIV and replace all compares of the form 1153 // (ICMP_ULE, WideCanonicalIV, 
backedge-taken-count) with an 1154 // active-lane-mask. 1155 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); 1156 for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) { 1157 auto *CompareToReplace = dyn_cast<VPInstruction>(U); 1158 if (!CompareToReplace || 1159 CompareToReplace->getOpcode() != Instruction::ICmp || 1160 CompareToReplace->getPredicate() != CmpInst::ICMP_ULE || 1161 CompareToReplace->getOperand(1) != BTC) 1162 continue; 1163 1164 assert(CompareToReplace->getOperand(0) == WideCanonicalIV && 1165 "WidenCanonicalIV must be the first operand of the compare"); 1166 CompareToReplace->replaceAllUsesWith(LaneMask); 1167 CompareToReplace->eraseFromParent(); 1168 } 1169} 1170