//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;

using namespace llvm::PatternMatch;

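/// Replace VPInstructions and widen-PHI recipes created during initial VPlan
/// construction with concrete widening recipes (memory operations, GEPs,
/// calls, selects, casts and int/FP inductions), based on the underlying
/// ingredient instructions.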
void VPlanTransforms::VPInstructionsToVPRecipes(
    VPlanPtr &Plan,
    function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
    ScalarEvolution &SE, const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {

      VPValue *VPV = Ingredient.getVPSingleValue();
      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());

      VPRecipeBase *NewRecipe = nullptr;
      if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
        auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
        if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) {
          VPValue *Start = Plan->getVPValueOrAddLiveIn(II->getStartValue());
          VPValue *Step =
              vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
          NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II);
        } else {
          Plan->addVPValue(Phi, VPPhi);
          continue;
        }
      } else {
        assert(isa<VPInstruction>(&Ingredient) &&
               "only VPInstructions expected here");
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryInstructionRecipe for loads and stores.
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenMemoryInstructionRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/);
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          NewRecipe = new VPWidenMemoryInstructionRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/);
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          NewRecipe = new VPWidenCallRecipe(
              *CI, drop_end(Ingredient.operands()),
              getVectorIntrinsicIDForCall(CI, &TLI), CI->getDebugLoc());
        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
          NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
        }
      }

      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
}

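/// Sink recipes defining operands of recipes inside replicate regions into
/// the regions themselves, where legal. A sink candidate whose users outside
/// the target block only use its first lane is duplicated rather than moved.
/// Returns true if any recipe was sunk or duplicated.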
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool Changed = false;
  // First, collect the operands of all recipes in replicate blocks as seeds for
  // sinking.
  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB) {
      for (VPValue *Op : Recipe.operands())
        if (auto *Def =
                dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
          WorkList.insert(std::make_pair(VPBB, Def));
    }
  }

  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];
    if (SinkCandidate->getParent() == SinkTo ||
        SinkCandidate->mayHaveSideEffects() ||
        SinkCandidate->mayReadOrWriteMemory())
      continue;
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
      if (!ScalarVFOnly && RepR->isUniform())
        continue;
    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
      continue;

    bool NeedsDuplicating = false;
    // All recipe users of the sink candidate must be in the same block SinkTo
    // or all users outside of SinkTo must be uniform-after-vectorization
    // (i.e., only the first lane is used). In the latter case, we need to
    // duplicate SinkCandidate.
    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
                            SinkCandidate](VPUser *U) {
      auto *UI = dyn_cast<VPRecipeBase>(U);
      if (!UI)
        return false;
      if (UI->getParent() == SinkTo)
        return true;
      NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
      // We only know how to duplicate VPReplicateRecipes for now.
      return NeedsDuplicating && isa<VPReplicateRecipe>(SinkCandidate);
    };
    if (!all_of(SinkCandidate->users(), CanSinkWithUser))
      continue;

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      Instruction *I = cast<Instruction>(
          cast<VPReplicateRecipe>(SinkCandidate)->getUnderlyingValue());
      auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true);
      // TODO: add ".cloned" suffix to name of Clone's VPValue.

      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    for (VPValue *Op : SinkCandidate->operands())
      if (auto *Def =
              dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
        WorkList.insert(std::make_pair(SinkTo, Def));
    Changed = true;
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions into their successor regions, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SetVector<VPRegionBlock *> DeletedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks, to process up front. This avoids
  // iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both regions must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    if (DeletedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        auto *UI = dyn_cast<VPRecipeBase>(&U);
        return UI && UI->getParent() == Then2;
      });

      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    DeletedRegions.insert(Region1);
  }

  for (VPRegionBlock *ToDelete : DeletedRegions)
    delete ToDelete;
  return !DeletedRegions.empty();
}

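/// Build a triangular if-then replicate region for predicated recipe \p
/// PredRecipe: an entry block branching on the recipe's mask, an "if" block
/// holding an unmasked clone of the recipe, and a "continue" block with an
/// optional VPPredInstPHIRecipe merging the result back.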
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(),
      make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
      PredRecipe->isUniform());
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask);
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

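/// Wrap each predicated VPReplicateRecipe in \p Plan in its own if-then
/// replicate region, splitting the recipe's parent block around it.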
static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPBlockBase *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::disconnectBlocks(CurrentBlock, SplitBlock);
    VPBlockUtils::connectBlocks(CurrentBlock, Region);
    VPBlockUtils::connectBlocks(Region, SplitBlock);
  }
}

void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(Plan);
  }
}

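/// Merge each VPBasicBlock into its single predecessor, if the predecessor
/// has a single successor, updating the enclosing region's exiting block
/// where necessary. Returns true if any blocks were merged.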
bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (PredVPBB && PredVPBB->getNumSuccessors() == 1)
      WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    auto *ParentRegion = cast_or_null<VPRegionBlock>(VPBB->getParent());
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    delete VPBB;
  }
  return !WorkList.empty();
}

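/// Bypass the cast chains recorded for widened int/FP inductions by replacing
/// all users of the last cast in each chain with the induction recipe itself;
/// the dead casts left behind are cleaned up separately.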
void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    auto &Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

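/// Try to replace a VPWidenCanonicalIVRecipe with an existing widened
/// induction recipe that is canonical and has the same scalar type, avoiding
/// materializing a redundant widened IV.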
void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical() ||
        WidenOriginalIV->getScalarType() != WidenNewIV->getScalarType())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (any_of(WidenOriginalIV->users(),
               [WidenOriginalIV](VPUser *U) {
                 return !U->usesScalars(WidenOriginalIV);
               }) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

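/// Erase recipes whose results have no users and which have no side effects.
/// Conditional assumes are treated as dead as well, since their conditions
/// may be flattened during vectorization.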
void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());

  for (VPBasicBlock *VPBB :
       reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      // A user keeps R alive:
      if (any_of(R.definedValues(),
                 [](VPValue *V) { return V->getNumUsers(); }))
        continue;

      // Having side effects keeps R alive, but do remove conditional assume
      // instructions as their conditions may be flattened.
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      bool IsConditionalAssume =
          RepR && RepR->isPredicated() &&
          match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
      if (R.mayHaveSideEffects() && !IsConditionalAssume)
        continue;

      R.eraseFromParent();
    }
  }
}

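/// Create a recipe computing the scalar steps for induction \p ID at the
/// start of the vector loop header, inserting a VPDerivedIVRecipe first if
/// the canonical IV does not match the induction's kind, start and step.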
static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
                                    ScalarEvolution &SE, Instruction *TruncI,
                                    Type *IVTy, VPValue *StartV,
                                    VPValue *Step) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  auto IP = HeaderVPBB->getFirstNonPhi();
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  Type *TruncTy = TruncI ? TruncI->getType() : IVTy;
  VPValue *BaseIV = CanonicalIV;
  if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, TruncTy)) {
    BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step,
                                   TruncI ? TruncI->getType() : nullptr);
    HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
  }

  VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
  HeaderVPBB->insert(Steps, IP);
  return Steps;
}

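/// For each widened induction with scalar users, create a scalar-steps recipe
/// and redirect those scalar users to it, so the widened induction can be
/// removed later if it becomes dead.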
void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
  SmallVector<VPRecipeBase *> ToRemove;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!WideIV)
      continue;
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPValue *Steps = createScalarIVSteps(
        Plan, ID, SE, WideIV->getTruncInst(), WideIV->getPHINode()->getType(),
        WideIV->getStartValue(), WideIV->getStepValue());

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs)
      WideIV->replaceAllUsesWith(Steps);
    else
      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
        return U.usesScalars(WideIV);
      });
  }
}

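/// De-duplicate VPExpandSCEVRecipes in the plan's entry block that expand the
/// same SCEV expression.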
void VPlanTransforms::removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
    if (I.second)
      continue;
    ExpR->replaceAllUsesWith(I.first->second);
    ExpR->eraseFromParent();
  }
}

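/// Return true if the BranchOnCond terminator \p Term branches on a
/// Not(ActiveLaneMask) condition and can therefore be simplified once the
/// trip count is known not to exceed VF * UF.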
static bool canSimplifyBranchOnCond(VPInstruction *Term) {
  VPInstruction *Not = dyn_cast<VPInstruction>(Term->getOperand(0));
  if (!Not || Not->getOpcode() != VPInstruction::Not)
    return false;

  VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0));
  return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask;
}

void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                         unsigned BestUF,
                                         PredicatedScalarEvolution &PSE) {
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
  VPBasicBlock *ExitingVPBB =
      Plan.getVectorLoopRegion()->getExitingBasicBlock();
  auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
  // Try to simplify the branch condition if TC <= VF * UF when preparing to
  // execute the plan for the main vector loop. We only do this if the
  // terminator is:
  //  1. BranchOnCount, or
  //  2. BranchOnCond where the input is Not(ActiveLaneMask).
  if (!Term || (Term->getOpcode() != VPInstruction::BranchOnCount &&
                (Term->getOpcode() != VPInstruction::BranchOnCond ||
                 !canSimplifyBranchOnCond(Term))))
    return;

  Type *IdxTy =
      Plan.getCanonicalIV()->getStartValue()->getLiveInIRValue()->getType();
  const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE);
  ScalarEvolution &SE = *PSE.getSE();
  const SCEV *C =
      SE.getConstant(TripCount->getType(), BestVF.getKnownMinValue() * BestUF);
  if (TripCount->isZero() ||
      !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
    return;

  LLVMContext &Ctx = SE.getContext();
  auto *BOC = new VPInstruction(
      VPInstruction::BranchOnCond,
      {Plan.getVPValueOrAddLiveIn(ConstantInt::getTrue(Ctx))});
  Term->eraseFromParent();
  ExitingVPBB->appendRecipe(BOC);
  Plan.setVF(BestVF);
  Plan.setUF(BestUF);
  // TODO: Further simplifications are possible
  //      1. Replace inductions with constants.
  //      2. Replace vector loop region with VPBasicBlock.
}

#ifndef NDEBUG
static VPRegionBlock *GetReplicateRegion(VPRecipeBase *R) {
  auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
  if (Region && Region->isReplicator()) {
    assert(Region->getNumSuccessors() == 1 &&
           Region->getNumPredecessors() == 1 && "Expected SESE region!");
    assert(R->getParent()->size() == 1 &&
           "A recipe in an original replicator region must be the only "
           "recipe in its block");
    return Region;
  }
  return nullptr;
}
#endif

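/// Return true if recipe \p A properly dominates recipe \p B, using \p VPDT
/// for distinct blocks and falling back to the local recipe order when both
/// recipes are in the same basic block.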
static bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B,
                              VPDominatorTree &VPDT) {
  if (A == B)
    return false;

  auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) {
    for (auto &R : *A->getParent()) {
      if (&R == A)
        return true;
      if (&R == B)
        return false;
    }
    llvm_unreachable("recipe not found");
  };
  const VPBlockBase *ParentA = A->getParent();
  const VPBlockBase *ParentB = B->getParent();
  if (ParentA == ParentB)
    return LocalComesBefore(A, B);

  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(A)) &&
         "No replicate regions expected at this point");
  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(B)) &&
         "No replicate regions expected at this point");
  return VPDT.properlyDominates(ParentA, ParentB);
}

/// Sink users of \p FOR after the recipe defining the previous value \p
/// Previous of the recurrence. \returns true if all users of \p FOR could be
/// re-arranged as needed or false if it is not possible.
static bool
sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
                                 VPRecipeBase *Previous,
                                 VPDominatorTree &VPDT) {
  // Collect recipes that need sinking.
  SmallVector<VPRecipeBase *> WorkList;
  SmallPtrSet<VPRecipeBase *, 8> Seen;
  Seen.insert(Previous);
  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi;
    // if it did, FOR would not be a fixed-order recurrence.
    if (SinkCandidate == Previous)
      return false;

    if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
        !Seen.insert(SinkCandidate).second ||
        properlyDominates(Previous, SinkCandidate, VPDT))
      return true;

    if (SinkCandidate->mayHaveSideEffects())
      return false;

    WorkList.push_back(SinkCandidate);
    return true;
  };

  // Recursively sink users of FOR after Previous.
  WorkList.push_back(FOR);
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPRecipeBase *Current = WorkList[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");

    for (VPUser *User : Current->getVPSingleValue()->users()) {
      if (auto *R = dyn_cast<VPRecipeBase>(User))
        if (!TryToPushSinkCandidate(R))
          return false;
    }
  }

  // Keep recipes to sink ordered by dominance so earlier instructions are
  // processed first.
  sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return properlyDominates(A, B, VPDT);
  });

  for (VPRecipeBase *SinkCandidate : WorkList) {
    if (SinkCandidate == FOR)
      continue;

    SinkCandidate->moveAfter(Previous);
    Previous = SinkCandidate;
  }
  return true;
}

bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
                                                  VPBuilder &Builder) {
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);

  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      RecurrencePhis.push_back(FOR);

  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
    // Fixed-order recurrences do not contain cycles, so this loop is
    // guaranteed to terminate.
    while (auto *PrevPhi =
               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
      assert(PrevPhi->getParent() == FOR->getParent());
      assert(SeenPhis.insert(PrevPhi).second);
      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
    }

    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT))
      return false;

    // Introduce a recipe to combine the incoming and previous values of a
    // fixed-order recurrence.
    VPBasicBlock *InsertBlock = Previous->getParent();
    if (isa<VPHeaderPHIRecipe>(Previous))
      Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
    else
      Builder.setInsertPoint(InsertBlock, std::next(Previous->getIterator()));

    auto *RecurSplice = cast<VPInstruction>(
        Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
                             {FOR, FOR->getBackedgeValue()}));

    FOR->replaceAllUsesWith(RecurSplice);
    // Set the first operand of RecurSplice to FOR again, after replacing
    // all users.
    RecurSplice->setOperand(0, FOR);
  }
  return true;
}

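/// Drop poison-generating flags from recipes in add and mul reduction chains,
/// walking users transitively starting from each reduction phi.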
void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR)
      continue;
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    RecurKind RK = RdxDesc.getRecurrenceKind();
    if (RK != RecurKind::Add && RK != RecurKind::Mul)
      continue;

    SmallSetVector<VPValue *, 8> Worklist;
    Worklist.insert(PhiR);

    for (unsigned I = 0; I != Worklist.size(); ++I) {
      VPValue *Cur = Worklist[I];
      if (auto *RecWithFlags =
              dyn_cast<VPRecipeWithIRFlags>(Cur->getDefiningRecipe())) {
        RecWithFlags->dropPoisonGeneratingFlags();
      }

      for (VPUser *U : Cur->users()) {
        auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
        if (!UserRecipe)
          continue;
        for (VPValue *V : UserRecipe->definedValues())
          Worklist.insert(V);
      }
    }
  }
}

/// Returns true if \p V is constant one.
static bool isConstantOne(VPValue *V) {
  if (!V->isLiveIn())
    return false;
  auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue());
  return C && C->isOne();
}

/// Returns the llvm::Instruction opcode for \p R.
static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
  if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R))
    return WidenR->getUnderlyingInstr()->getOpcode();
  if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R))
    return WidenC->getOpcode();
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
    return RepR->getUnderlyingInstr()->getOpcode();
  if (auto *VPI = dyn_cast<VPInstruction>(&R))
    return VPI->getOpcode();
  return 0;
}

/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
  switch (getOpcodeForRecipe(R)) {
  case Instruction::Mul: {
    VPValue *A = R.getOperand(0);
    VPValue *B = R.getOperand(1);
    if (isConstantOne(A))
      return R.getVPSingleValue()->replaceAllUsesWith(B);
    if (isConstantOne(B))
      return R.getVPSingleValue()->replaceAllUsesWith(A);
    break;
  }
  case Instruction::Trunc: {
    VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe();
    if (!Ext)
      break;
    unsigned ExtOpcode = getOpcodeForRecipe(*Ext);
    if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
      break;
    VPValue *A = Ext->getOperand(0);
    VPValue *Trunc = R.getVPSingleValue();
    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Trunc->replaceAllUsesWith(A);
    } else {
      // Don't replace a scalarizing recipe with a widened cast.
      if (isa<VPReplicateRecipe>(&R))
        break;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
        auto *VPC =
            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      }
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(TypeInfo.getContext());
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = dyn_cast<VPRecipeBase>(U);
      if (!R)
        continue;
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
    }
#endif
    break;
  }
  default:
    break;
  }
}

/// Try to simplify the recipes in \p Plan.
static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  VPTypeAnalysis TypeInfo(Ctx);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      simplifyRecipe(R, TypeInfo);
    }
  }
}

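/// Narrow the recipes listed in \p MinBWs to their minimal bitwidth,
/// truncating operands as needed and zero-extending each narrowed result back
/// to its original type for existing users.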
void VPlanTransforms::truncateToMinimalBitwidths(
    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
    LLVMContext &Ctx) {
#ifndef NDEBUG
  // Count the processed recipes and cross check the count later with MinBWs
  // size, to make sure all entries in MinBWs have been handled.
  unsigned NumProcessedRecipes = 0;
#endif
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make
  // other uses have different types for their operands, making them invalidly
  // typed.
  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
  VPTypeAnalysis TypeInfo(Ctx);
  VPBasicBlock *PH = Plan.getEntry();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
               VPWidenSelectRecipe, VPWidenMemoryInstructionRecipe>(&R))
        continue;
      if (isa<VPWidenMemoryInstructionRecipe>(&R) &&
          cast<VPWidenMemoryInstructionRecipe>(&R)->isStore())
        continue;

      VPValue *ResultVPV = R.getVPSingleValue();
      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
      unsigned NewResSizeInBits = MinBWs.lookup(UI);
      if (!NewResSizeInBits)
        continue;

#ifndef NDEBUG
      NumProcessedRecipes++;
#endif
      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here, after incrementing NumProcessedRecipes. Also
      // skip casts which do not need to be handled explicitly here, as
      // redundant casts will be removed during recipe simplification.
      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
#ifndef NDEBUG
        // If any of the operands is a live-in and not used by VPWidenRecipe or
        // VPWidenSelectRecipe, but is in MinBWs, make sure it is counted as
        // processed as well. When MinBWs is constructed, there is currently no
        // information about whether recipes are widened or replicated, and in
        // case they are replicated the operands are not truncated. Counting
        // them here ensures we do not miss any recipes in MinBWs.
        // TODO: Remove once the analysis is done on VPlan.
        for (VPValue *Op : R.operands()) {
          if (!Op->isLiveIn())
            continue;
          auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
          if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
              all_of(Op->users(), [](VPUser *U) {
                return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U);
              })) {
            // Add an entry to ProcessedTruncs to avoid counting the same
            // operand multiple times.
            ProcessedTruncs[Op] = nullptr;
            NumProcessedRecipes += 1;
          }
        }
#endif
        continue;
      }

      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
      assert(OldResTy->isIntegerTy() && "only integer types supported");
      if (OldResSizeInBits == NewResSizeInBits)
        continue;
      assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
      (void)OldResSizeInBits;

      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior. So, we can't unconditionally copy
      // arithmetic wrapping flags to VPW.
      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
        VPW->dropPoisonGeneratingFlags();

      // Extend result to original width.
      auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
      Ext->insertAfter(&R);
      ResultVPV->replaceAllUsesWith(Ext);
      Ext->setOperand(0, ResultVPV);

      if (isa<VPWidenMemoryInstructionRecipe>(&R)) {
        assert(!cast<VPWidenMemoryInstructionRecipe>(&R)->isStore() &&
               "stores cannot be narrowed");
        continue;
      }

      // Shrink operands by introducing truncates as needed.
      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
        auto *Op = R.getOperand(Idx);
        unsigned OpSizeInBits =
            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
        if (OpSizeInBits == NewResSizeInBits)
          continue;
        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
        auto [ProcessedIter, IterIsEmpty] =
            ProcessedTruncs.insert({Op, nullptr});
        VPWidenCastRecipe *NewOp =
            IterIsEmpty
                ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
                : ProcessedIter->second;
        R.setOperand(Idx, NewOp);
        if (!IterIsEmpty)
          continue;
        ProcessedIter->second = NewOp;
        if (!Op->isLiveIn()) {
          NewOp->insertBefore(&R);
        } else {
          PH->appendRecipe(NewOp);
#ifndef NDEBUG
          auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
          bool IsContained = MinBWs.contains(OpInst);
          NumProcessedRecipes += IsContained;
#endif
        }
      }
    }
  }

  assert(MinBWs.size() == NumProcessedRecipes &&
         "some entries in MinBWs haven't been processed");
}

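/// Run the sequence of VPlan-to-VPlan cleanups and optimizations: removal of
/// redundant IVs and casts, induction optimization, recipe simplification,
/// dead recipe removal, replicate region formation and block merging.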
void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
  removeRedundantCanonicalIVs(Plan);
  removeRedundantInductionCasts(Plan);

  optimizeInductions(Plan, SE);
  simplifyRecipes(Plan, SE.getContext());
  removeDeadRecipes(Plan);

  createAndOptimizeReplicateRegions(Plan);

  removeRedundantExpandSCEVRecipes(Plan);
  mergeBlocksIntoPredecessors(Plan);
}

// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
// the loop terminator with a branch-on-cond recipe with the negated
// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced, all other
// existing recipes/users remain unchanged, except for poison-generating flags
// being dropped from the canonical IV increment. Return the created
// VPActiveLaneMaskPHIRecipe.
//
// The function uses the following definitions:
//
//  %TripCount = DataAndControlFlowWithoutRuntimeCheck ?
//    calculate-trip-count-minus-VF (original TC) : original TC
//  %IncrementValue = DataAndControlFlowWithoutRuntimeCheck ?
//     CanonicalIVPhi : CanonicalIVIncrement
//  %StartV is the canonical induction start value.
//
// The function adds the following recipes:
//
// vector.ph:
//   %TripCount = calculate-trip-count-minus-VF (original TC)
//       [if DataAndControlFlowWithoutRuntimeCheck]
//   %EntryInc = canonical-iv-increment-for-part %StartV
//   %EntryALM = active-lane-mask %EntryInc, %TripCount
//
// vector.body:
//   ...
//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
//   ...
//   %InLoopInc = canonical-iv-increment-for-part %IncrementValue
//   %ALM = active-lane-mask %InLoopInc, %TripCount
//   %Negated = Not %ALM
//   branch-on-cond %Negated
//
static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
  auto *CanonicalIVPHI = Plan.getCanonicalIV();
  VPValue *StartV = CanonicalIVPHI->getStartValue();

  auto *CanonicalIVIncrement =
      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
  // TODO: Check if dropping the flags is needed if
  // !DataAndControlFlowWithoutRuntimeCheck.
  CanonicalIVIncrement->dropPoisonGeneratingFlags();
  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
  // we have to take unrolling into account. Each part needs to start at
  //   Part * VF
  auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
  VPBuilder Builder(VecPreheader);

  // Create the ActiveLaneMask instruction using the correct start values.
  VPValue *TC = Plan.getTripCount();

  VPValue *TripCount, *IncrementValue;
  if (!DataAndControlFlowWithoutRuntimeCheck) {
    // When the loop is guarded by a runtime overflow check for the loop
    // induction variable increment by VF, we can increment the value before
    // the get.active.lane mask and use the unmodified tripcount.
    IncrementValue = CanonicalIVIncrement;
    TripCount = TC;
  } else {
    // When avoiding a runtime check, the active.lane.mask inside the loop
    // uses a modified trip count and the induction variable increment is
    // done after the active.lane.mask intrinsic is called.
    IncrementValue = CanonicalIVPHI;
    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
                                     {TC}, DL);
  }
  auto *EntryIncrement = Builder.createOverflowingOp(
      VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
      "index.part.next");

  // Create the active lane mask instruction in the VPlan preheader.
  auto *EntryALM =
      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
                           DL, "active.lane.mask.entry");

  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
  // preheader ActiveLaneMask instruction.
  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
  LaneMaskPhi->insertAfter(CanonicalIVPHI);

  // Create the active lane mask for the next iteration of the loop before the
  // original terminator.
  VPRecipeBase *OriginalTerminator = EB->getTerminator();
  Builder.setInsertPoint(OriginalTerminator);
  auto *InLoopIncrement =
      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                  {IncrementValue}, {false, false}, DL);
  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
                                   {InLoopIncrement, TripCount}, DL,
                                   "active.lane.mask.next");
  LaneMaskPhi->addOperand(ALM);

  // Replace the original terminator with BranchOnCond. We have to invert the
  // mask here because a true condition means jumping to the exit block.
  auto *NotMask = Builder.createNot(ALM, DL);
  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
  OriginalTerminator->eraseFromParent();
  return LaneMaskPhi;
}

void VPlanTransforms::addActiveLaneMask(
    VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
    bool DataAndControlFlowWithoutRuntimeCheck) {
  assert((!DataAndControlFlowWithoutRuntimeCheck ||
          UseActiveLaneMaskForControlFlow) &&
         "DataAndControlFlowWithoutRuntimeCheck implies "
         "UseActiveLaneMaskForControlFlow");

  auto FoundWidenCanonicalIVUser =
      find_if(Plan.getCanonicalIV()->users(),
              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
  assert(FoundWidenCanonicalIVUser &&
         "Must have widened canonical IV when tail folding!");
  auto *WideCanonicalIV =
      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
  VPSingleDefRecipe *LaneMask;
  if (UseActiveLaneMaskForControlFlow) {
    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
        Plan, DataAndControlFlowWithoutRuntimeCheck);
  } else {
    LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
                                 {WideCanonicalIV, Plan.getTripCount()},
                                 nullptr, "active.lane.mask");
    LaneMask->insertAfter(WideCanonicalIV);
  }

  // Walk users of WideCanonicalIV and replace all compares of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
  // active-lane-mask.
  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
  for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
    auto *CompareToReplace = dyn_cast<VPInstruction>(U);
    if (!CompareToReplace ||
        CompareToReplace->getOpcode() != Instruction::ICmp ||
        CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
        CompareToReplace->getOperand(1) != BTC)
      continue;

    assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
           "WidenCanonicalIV must be the first operand of the compare");
    CompareToReplace->replaceAllUsesWith(LaneMask);
    CompareToReplace->eraseFromParent();
  }
}
1170