//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
// even if we match a reduction but do not vectorize in the end.
static cl::opt<bool> AllowHorRdxIdenityOptimization(
    "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this is used less frequently, hence
// the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important things to filter here are types which are invalid in
/// LLVM vectors. We also filter target-specific types which have absolutely no
/// meaningful vectorization path, such as x86_fp80 and ppc_fp128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of the vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
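/// For example, `extractelement <4 x float> %v, i32 1` qualifies (fixed vector
/// operand, constant index), while an extractelement with a variable index
/// does not.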
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug output.
static std::string shortBundleName(ArrayRef<Value *> VL) {
  std::string Result;
  raw_string_ostream OS(Result);
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  OS.flush();
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (!II)
      return false;

    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal integer/FP
  // constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
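/// For example, {%x, undef, %x, undef} is a splat of %x, while an all-undef
/// list is not considered a splat.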
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}

/// \returns True if \p I is commutative; handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative();
  // TODO: This should check for generic Instruction::isCommutative(), but
  //       we need to confirm that the caller code correctly handles Intrinsics,
  //       for example (they do not necessarily have 2 operands).
  return false;
}

/// \returns the insertion index of an InsertElement or InsertValue instruction,
/// using \p Offset as the base offset for the index.
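/// For example, for `insertelement <4 x i32> %v, i32 %x, i32 3` with Offset 0
/// this returns 3, and for an insertvalue into {[2 x i32], [2 x i32]} at
/// indices (1, 0) the flattened index is 1 * 2 + 0 = 2.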
static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
                                              unsigned Offset = 0) {
  int Index = Offset;
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }

  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}

namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
            ///< check for the mask elements for the first argument (mask
            ///< indices are in range [0:VF)).
  SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
             ///< for the mask elements for the second argument (mask indices
             ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as used. Non-undef
               ///< elements are considered unused since they are already
               ///< marked as used in the mask.
};
} // namespace

/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
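/// Note that bits are *cleared* for the elements that the mask actually reads;
/// e.g., for VF = 4 and Mask = {0, 5, 1, 7}, UseMask::FirstArg clears bits 0
/// and 1, while UseMask::SecondArg clears bits 1 and 3.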
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}

/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getInsertIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
///                                                         i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// \p Mask will contain the shuffle mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It =
      find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
  if (It == VL.end())
    return std::nullopt;
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
    return std::nullopt;
  unsigned Size =
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
      return std::nullopt;
    if (isa<UndefValue>(EI->getIndexOperand()))
      continue;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (!Idx)
      return std::nullopt;
    // Undefined behavior if Idx is negative or >= Size.
    if (Idx->getValue().uge(Size))
      continue;
    unsigned IntIdx = Idx->getValue().getZExtValue();
    Mask[I] = IntIdx;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (IntIdx != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector; otherwise
  // we have a permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}

namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main/alternate instruction.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const {
    return MainOp ? MainOp->getOpcode() : 0;
  }

  unsigned getAltOpcode() const {
    return AltOp ? AltOp->getOpcode() : 0;
  }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};

} // end anonymous namespace

/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}

/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// Example of unsupported opcode is SDIV that can potentially cause UB if the
/// "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {
  if (Instruction::isIntDivRem(Opcode))
    return false;

  return true;
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex = 0);

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
         getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
}

/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
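/// For example, `icmp sgt i32 %a, %b` matches `icmp sgt i32 %c, %d` as is,
/// and matches `icmp slt i32 %d, %c` with the operands and predicate swapped,
/// provided the corresponding operands are compatible.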
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}

/// \returns an analysis of the instructions in \p VL described by
/// InstructionsState: the opcode with which we suppose the whole list
/// could be vectorized, even if its structure is diverse.
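/// For example, {add, sub, add, sub} yields MainOp add and AltOp sub, while a
/// list mixing add with sdiv yields no common opcode, since integer division
/// is not a valid alternate opcode.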
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
  CmpInst::Predicate BasePred =
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
              : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if (E == 2 &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        CallBase->op_begin() +
                            CallBase->getBundleOperandsStartIndex()))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        }
      }
      continue;
    }
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}

/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL.front()->getType();
  return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
}

/// \returns True if in-tree use also needs extract. This refers to a
/// possible scalar operand in a vectorized instruction.
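/// For example, if \p Scalar is the pointer operand of a vectorized load or
/// store, the scalar address is still required and must be extracted.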
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
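/// For example, Mask = {3, 2, 1, 0} combined with SubMask = {1, 0, 3, 2}
/// produces {2, 3, 0, 1}, since the result is NewMask[I] = Mask[SubMask[I]].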
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() &&
        std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                    [](int Idx) { return Idx == PoisonMaskElem; }))) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}

/// Order may have elements assigned the special value (size), which is out of
/// bounds. Such indices appear only in places that correspond to undef values
/// (see canReuseExtract for details) and are used to avoid undef values having
/// an effect on operand ordering.
/// The first loop below simply finds all unused indices and then the next loop
/// nest assigns these indices to the undef value positions.
/// In the example below, Order has two undef positions, which are assigned the
/// values 3 and 7 respectively:
/// before:  6 9 5 4 9 2 1 0
/// after:   6 3 5 4 7 2 1 0
static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}

namespace llvm {

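/// Builds the inverse of the permutation given by \p Indices, i.e. a mask such
/// that Mask[Indices[I]] == I; e.g., Indices = {2, 0, 1} produces
/// Mask = {1, 2, 0}.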
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}

/// Reorders the list of scalars in accordance with the given \p Mask.
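/// For example, Scalars = {a, b, c} with Mask = {2, 0, 1} becomes {b, c, a},
/// since each element is moved as Scalars[Mask[I]] = Prev[I].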
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            UndefValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all operands are either not instructions
/// or phi nodes or instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
    all_of(I->operands(), [I](Value *V) {
      auto *IO = dyn_cast<Instruction>(V);
      if (!IO)
        return true;
      return isa<PHINode>(IO) || IO->getParent() != I->getParent();
    });
}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all users are phi nodes or instructions
/// from different blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  constexpr int UsesLimit = 8;
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be scheduled
/// in the current basic block.
static bool doesNotNeedToBeScheduled(Value *V) {
  return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
}

/// Checks if the specified array of instructions does not require scheduling.
/// This is the case if either all the instructions have operands that do not
/// require scheduling, or all their users do not require scheduling (they are
/// PHIs or live in other basic blocks).
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}

namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  struct ScheduleData;
  class ShuffleCostEstimator;
  class ShuffleInstructionBuilder;

public:
  using ValueList = SmallVector<Value *, 8>;
  using InstrList = SmallVector<Instruction *, 16>;
  using ValueSet = SmallPtrSet<Value *, 16>;
  using StoreList = SmallVector<StoreInst *, 8>;
  using ExtraValueToDebugLocsMap =
      MapVector<Value *, SmallVector<Instruction *, 2>>;
  using OrdersType = SmallVector<unsigned, 4>;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
        DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    //       data type rather than just register size. For example, x86 AVX has
    //       256-bit registers, but it does not support integer operations
    //       at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize =
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }

  /// Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
  /// \param ReplacedExternals contains the list of replaced external values
  /// {scalar, replace} after emitting extractelement for external uses.
  Value *
  vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
                SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
                Instruction *ReductionRoot = nullptr);

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  InstructionCost getSpillCost() const;

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 const SmallDenseSet<Value *> &UserIgnoreLst);

  /// Construct a vectorizable tree that starts at \p Roots.
  void buildTree(ArrayRef<Value *> Roots);

  /// Returns whether the root node has in-tree uses.
  bool doesRootHaveInTreeUses() const {
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }

  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Builds external uses of the vectorized scalars, i.e. the list of
  /// vectorized scalars to be extracted, their lanes and their scalar users. \p
  /// ExternallyUsedValues contains additional list of external uses to handle
  /// vectorization of reductions.
  void
  buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    MustGather.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }

  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// Checks if the specified gather tree entry \p TE can be represented as a
  /// shuffled vector entry + (possibly) permutation with other gathers. It
  /// implements the checks only for possibly ordered scalars (Loads,
  /// ExtractElement, ExtractValue), which can be part of the graph.
  std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);

  /// Sort loads into increasing pointer offsets to allow greater clustering.
  std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);

  /// Gets reordering data for the given tree entry. If the entry is vectorized
  /// - just return ReorderIndices, otherwise check if the scalars can be
  /// reordered and return the most optimal order.
  /// \return std::nullopt if ordering is not important, empty order, if
  /// identity order is important, or the actual order.
  /// \param TopToBottom If true, include the order of vectorized stores and
  /// insertelement nodes, otherwise skip them.
  std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
                                              bool TopToBottom);

  /// Reorders the current graph to the most profitable order starting from the
  /// root node to the leaf nodes. The best order is chosen only from the nodes
  /// of the same size (vectorization factor). Smaller nodes are considered
  /// parts of a subgraph with a smaller VF and are reordered independently. We
  /// can do this because we still need to extend smaller nodes to the wider VF
  /// and we can merge reordering shuffles with the widening shuffles.
  void reorderTopToBottom();

  /// Reorders the current graph to the most profitable order starting from
  /// the leaves to the root. It allows rotating small subgraphs and reduces the
  /// number of reshuffles if the leaf nodes use the same order. In this case we
  /// can merge the orders and just shuffle the user node instead of shuffling
  /// its operands. Moreover, even if the leaf nodes have different orders, it
  /// allows sinking reordering in the graph closer to the root node and merging
  /// it later during analysis.
  void reorderBottomToTop(bool IgnoreReorder = false);

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V);

  /// Compute the minimum type sizes required to represent the entries in a
  /// vectorizable tree.
  void computeMinimumValueSizes();

  // \returns maximum vector register size as set by TTI or overridden by cl::opt.
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  // \returns minimum vector register size as set by cl::opt.
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

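  /// \returns the minimum vectorization factor for elements of \p Sz bits;
  /// e.g., 4 for 32-bit elements when the minimum vector register size is 128
  /// bits.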
  unsigned getMinVF(unsigned Sz) const {
    return std::max(2U, getMinVecRegSize() / Sz);
  }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
      MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

  /// Check if homogeneous aggregate is isomorphic to some VectorType.
  /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
  /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
  /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
  ///
  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
  unsigned canMapToVector(Type *T) const;

  /// \returns True if the VectorizableTree is both tiny and not fully
  /// vectorizable. We do not vectorize such trees.
  bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;

  /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
  /// can be load combined in the backend. Load combining may not be allowed in
  /// the IR optimizer, so we do not want to alter the pattern. For example,
  /// partially transforming a scalar bswap() pattern into vector code is
  /// effectively impossible for the backend to undo.
  /// TODO: If load combining is allowed in the IR optimizer, this analysis
  ///       may not be necessary.
  bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;

  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
  /// can be load combined in the backend. Load combining may not be allowed in
  /// the IR optimizer, so we do not want to alter the pattern. For example,
  /// partially transforming a scalar bswap() pattern into vector code is
  /// effectively impossible for the backend to undo.
  /// TODO: If load combining is allowed in the IR optimizer, this analysis
  ///       may not be necessary.
  bool isLoadCombineCandidate() const;

  OptimizationRemarkEmitter *getORE() { return ORE; }

  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec(). We keep track of:
  /// (i) the user TreeEntry index, and
  /// (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
    friend inline raw_ostream &operator<<(raw_ostream &OS,
                                          const BoUpSLP::EdgeInfo &EI) {
      EI.dump(OS);
      return OS;
    }
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
    bool operator == (const EdgeInfo &Other) const {
      return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
    }
  };

  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    int NumLanes; // Total number of lanes (aka vectorization factor).
    int MaxLevel; // The maximum recursion depth for accumulating score.

  public:
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // The hard-coded scores listed here are not very important, though they
    // should be higher for better matches to improve the resulting cost. When
    // computing the scores of matching one sub-tree with another, we are
    // basically counting the number of values that are matching. So even if all
    // scores are set to 1, we would still get a decent matching result.
    // However, sometimes we have to break ties. For example we may have to
    // choose between matching loads vs matching opcodes. This is what these
    // scores are helping us with: they provide the order of preference. Also,
    // this is important if the scalar is externally used or used in another
    // tree entry node in a different lane.
1232
1233    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1234    static const int ScoreConsecutiveLoads = 4;
1235    /// The same load multiple times. This should have a better score than
1236    /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
1237    /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1238    /// a vector load and 1.0 for a broadcast.
1239    static const int ScoreSplatLoads = 3;
1240    /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1241    static const int ScoreReversedLoads = 3;
1242    /// A load candidate for masked gather.
1243    static const int ScoreMaskedGatherCandidate = 1;
1244    /// ExtractElementInst from same vector and consecutive indexes.
1245    static const int ScoreConsecutiveExtracts = 4;
1246    /// ExtractElementInst from same vector and reversed indices.
1247    static const int ScoreReversedExtracts = 3;
1248    /// Constants.
1249    static const int ScoreConstants = 2;
1250    /// Instructions with the same opcode.
1251    static const int ScoreSameOpcode = 2;
1252    /// Instructions with alt opcodes (e.g, add + sub).
1253    static const int ScoreAltOpcodes = 1;
1254    /// Identical instructions (a.k.a. splat or broadcast).
1255    static const int ScoreSplat = 1;
1256    /// Matching with an undef is preferable to failing.
1257    static const int ScoreUndef = 1;
1258    /// Score for failing to find a decent match.
1259    static const int ScoreFail = 0;
1260    /// Score if all users are vectorized.
1261    static const int ScoreAllUserVectorized = 1;
1262
1263    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1264    /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1265    /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1266    /// MainAltOps.
1267    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1268                        ArrayRef<Value *> MainAltOps) const {
1269      if (!isValidElementType(V1->getType()) ||
1270          !isValidElementType(V2->getType()))
1271        return LookAheadHeuristics::ScoreFail;
1272
1273      if (V1 == V2) {
1274        if (isa<LoadInst>(V1)) {
1275          // Retruns true if the users of V1 and V2 won't need to be extracted.
1276          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1277            // Bail out if we have too many uses to save compilation time.
1278            static constexpr unsigned Limit = 8;
1279            if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
1280              return false;
1281
1282            auto AllUsersVectorized = [U1, U2, this](Value *V) {
1283              return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1284                return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1285              });
1286            };
1287            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1288          };
1289          // A broadcast of a load can be cheaper on some targets.
1290          if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1291                                          ElementCount::getFixed(NumLanes)) &&
1292              ((int)V1->getNumUses() == NumLanes ||
1293               AllUsersAreInternal(V1, V2)))
1294            return LookAheadHeuristics::ScoreSplatLoads;
1295        }
1296        return LookAheadHeuristics::ScoreSplat;
1297      }
1298
1299      auto *LI1 = dyn_cast<LoadInst>(V1);
1300      auto *LI2 = dyn_cast<LoadInst>(V2);
1301      if (LI1 && LI2) {
1302        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1303            !LI2->isSimple())
1304          return LookAheadHeuristics::ScoreFail;
1305
1306        std::optional<int> Dist = getPointersDiff(
1307            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1308            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1309        if (!Dist || *Dist == 0) {
1310          if (getUnderlyingObject(LI1->getPointerOperand()) ==
1311                  getUnderlyingObject(LI2->getPointerOperand()) &&
1312              R.TTI->isLegalMaskedGather(
1313                  FixedVectorType::get(LI1->getType(), NumLanes),
1314                  LI1->getAlign()))
1315            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1316          return LookAheadHeuristics::ScoreFail;
1317        }
1318        // The distance is too large - still may be profitable to use masked
1319        // loads/gathers.
1320        if (std::abs(*Dist) > NumLanes / 2)
1321          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1322        // This will still detect consecutive loads, but we might have "holes"
1323        // in some cases. It is ok for non-power-of-2 vectorization and may produce
1324        // better results. It should not affect current vectorization.
1325        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1326                           : LookAheadHeuristics::ScoreReversedLoads;
1327      }
1328
1329      auto *C1 = dyn_cast<Constant>(V1);
1330      auto *C2 = dyn_cast<Constant>(V2);
1331      if (C1 && C2)
1332        return LookAheadHeuristics::ScoreConstants;
1333
1334      // Extracts from consecutive indexes of the same vector get a better score
1335      // as the extracts could be optimized away.
1336      Value *EV1;
1337      ConstantInt *Ex1Idx;
1338      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1339        // Undefs are always profitable for extractelements.
1340        // Compiler can easily combine poison and extractelement <non-poison> or
1341        // undef and extractelement <poison>. But combining undef +
1342        // extractelement <non-poison-but-may-produce-poison> requires some
1343        // extra operations.
1344        if (isa<UndefValue>(V2))
1345          return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1346                     ? LookAheadHeuristics::ScoreConsecutiveExtracts
1347                     : LookAheadHeuristics::ScoreSameOpcode;
1348        Value *EV2 = nullptr;
1349        ConstantInt *Ex2Idx = nullptr;
1350        if (match(V2,
1351                  m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1352                                                         m_Undef())))) {
1353          // Undefs are always profitable for extractelements.
1354          if (!Ex2Idx)
1355            return LookAheadHeuristics::ScoreConsecutiveExtracts;
1356          if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1357            return LookAheadHeuristics::ScoreConsecutiveExtracts;
1358          if (EV2 == EV1) {
1359            int Idx1 = Ex1Idx->getZExtValue();
1360            int Idx2 = Ex2Idx->getZExtValue();
1361            int Dist = Idx2 - Idx1;
1362            // The distance is too large - still may be profitable to use
1363            // shuffles.
1364            if (std::abs(Dist) == 0)
1365              return LookAheadHeuristics::ScoreSplat;
1366            if (std::abs(Dist) > NumLanes / 2)
1367              return LookAheadHeuristics::ScoreSameOpcode;
1368            return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1369                              : LookAheadHeuristics::ScoreReversedExtracts;
1370          }
1371          return LookAheadHeuristics::ScoreAltOpcodes;
1372        }
1373        return LookAheadHeuristics::ScoreFail;
1374      }
1375
1376      auto *I1 = dyn_cast<Instruction>(V1);
1377      auto *I2 = dyn_cast<Instruction>(V2);
1378      if (I1 && I2) {
1379        if (I1->getParent() != I2->getParent())
1380          return LookAheadHeuristics::ScoreFail;
1381        SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1382        Ops.push_back(I1);
1383        Ops.push_back(I2);
1384        InstructionsState S = getSameOpcode(Ops, TLI);
1385        // Note: Only consider instructions with <= 2 operands to avoid
1386        // complexity explosion.
1387        if (S.getOpcode() &&
1388            (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1389             !S.isAltShuffle()) &&
1390            all_of(Ops, [&S](Value *V) {
1391              return cast<Instruction>(V)->getNumOperands() ==
1392                     S.MainOp->getNumOperands();
1393            }))
1394          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1395                                  : LookAheadHeuristics::ScoreSameOpcode;
1396      }
1397
1398      if (isa<UndefValue>(V2))
1399        return LookAheadHeuristics::ScoreUndef;
1400
1401      return LookAheadHeuristics::ScoreFail;
1402    }
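    // For illustration only (hypothetical loads from a single array A, assumed
    // to be simple and in one basic block, with NumLanes == 4):
    //   getShallowScore(load A[0], load A[1]) -> Dist ==  1 -> ScoreConsecutiveLoads
    //   getShallowScore(load A[1], load A[0]) -> Dist == -1 -> ScoreReversedLoads
    //   getShallowScore(load A[0], load A[5]) -> |Dist| > NumLanes / 2
    //                                         -> ScoreMaskedGatherCandidate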
1403
1404    /// Go through the operands of \p LHS and \p RHS recursively until
1405    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1406    /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1407    /// of \p U1 and \p U2), except at the beginning of the recursion where
1408    /// these are set to nullptr.
1409    ///
1410    /// For example:
1411    /// \verbatim
1412    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
1413    ///     \ /         \ /         \ /        \ /
1414    ///      +           +           +          +
1415    ///     G1          G2          G3         G4
1416    /// \endverbatim
1417    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1418    /// each level recursively, accumulating the score. It starts from matching
1419    /// the additions at level 0, then moves on to the loads (level 1). The
1420    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1421    /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1422    /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1423    /// Please note that the order of the operands does not matter, as we
1424    /// evaluate the score of all profitable combinations of operands. In
1425    /// other words the score of G1 and G4 is the same as G1 and G2. This
1426    /// heuristic is based on ideas described in:
1427    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
1428    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1429    ///   Luís F. W. Góes
1430    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1431                           Instruction *U2, int CurrLevel,
1432                           ArrayRef<Value *> MainAltOps) const {
1433
1434      // Get the shallow score of LHS and RHS.
1435      int ShallowScoreAtThisLevel =
1436          getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1437
1438      // If we have reached MaxLevel,
1439      //  or if LHS and RHS are not instructions,
1440      //  or if they are identical (SPLAT),
1441      //  or if the shallow score failed (ScoreFail),
1442      //  or if it is already profitable to vectorize the loads or
1443      //  extractelements, early return the current score.
1444      auto *I1 = dyn_cast<Instruction>(LHS);
1445      auto *I2 = dyn_cast<Instruction>(RHS);
1446      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1447          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1448          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1449            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1450            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1451           ShallowScoreAtThisLevel))
1452        return ShallowScoreAtThisLevel;
1453      assert(I1 && I2 && "Should have early exited.");
1454
1455      // Contains the I2 operand indexes that got matched with I1 operands.
1456      SmallSet<unsigned, 4> Op2Used;
1457
1458      // Recursion towards the operands of I1 and I2. We are trying all possible
1459      // operand pairs, and keeping track of the best score.
1460      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1461           OpIdx1 != NumOperands1; ++OpIdx1) {
1462        // Try to pair the operand at OpIdx1 of I1 with the best operand of I2.
1463        int MaxTmpScore = 0;
1464        unsigned MaxOpIdx2 = 0;
1465        bool FoundBest = false;
1466        // If I2 is commutative try all combinations.
1467        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1468        unsigned ToIdx = isCommutative(I2)
1469                             ? I2->getNumOperands()
1470                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
1471        assert(FromIdx <= ToIdx && "Bad index");
1472        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1473          // Skip operands already paired with OpIdx1.
1474          if (Op2Used.count(OpIdx2))
1475            continue;
1476          // Recursively calculate the cost at each level
1477          int TmpScore =
1478              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1479                                 I1, I2, CurrLevel + 1, std::nullopt);
1480          // Look for the best score.
1481          if (TmpScore > LookAheadHeuristics::ScoreFail &&
1482              TmpScore > MaxTmpScore) {
1483            MaxTmpScore = TmpScore;
1484            MaxOpIdx2 = OpIdx2;
1485            FoundBest = true;
1486          }
1487        }
1488        if (FoundBest) {
1489          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1490          Op2Used.insert(MaxOpIdx2);
1491          ShallowScoreAtThisLevel += MaxTmpScore;
1492        }
1493      }
1494      return ShallowScoreAtThisLevel;
1495    }
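    // For illustration only, using the hypothetical G1..G4 trees from the
    // comment above and assuming MaxLevel >= 2 with simple, consecutive loads:
    //   getScoreAtLevelRec(G1, G2) = ScoreSameOpcode (2) for the two additions
    //     + ScoreConsecutiveLoads (4) for {A[0], A[1]}
    //     + ScoreConsecutiveLoads (4) for {B[0], B[1]} = 10.
    //   getScoreAtLevelRec(G1, G3) = ScoreSameOpcode (2) only, because the load
    //     pairs {A[0], C[0]} and {B[0], D[0]} score ScoreFail and add nothing.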
1496  };
1497  /// A helper data structure to hold the operands of a vector of instructions.
1498  /// This supports a fixed vector length for all operand vectors.
1499  class VLOperands {
1500    /// For each operand we need (i) the value, and (ii) the opcode that it
1501    /// would be attached to if the expression was in a left-linearized form.
1502    /// This is required to avoid illegal operand reordering.
1503    /// For example:
1504    /// \verbatim
1505    ///                         0 Op1
1506    ///                         |/
1507    /// Op1 Op2   Linearized    + Op2
1508    ///   \ /     ---------->   |/
1509    ///    -                    -
1510    ///
1511    /// Op1 - Op2            (0 + Op1) - Op2
1512    /// \endverbatim
1513    ///
1514    /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1515    ///
1516    /// Another way to think of this is to track all the operations across the
1517    /// path from the operand all the way to the root of the tree and to
1518    /// calculate the operation that corresponds to this path. For example, the
1519    /// path from Op2 to the root crosses the RHS of the '-', therefore the
1520    /// corresponding operation is a '-' (which matches the one in the
1521    /// linearized tree, as shown above).
1522    ///
1523    /// For lack of a better term, we refer to this operation as Accumulated
1524    /// Path Operation (APO).
1525    struct OperandData {
1526      OperandData() = default;
1527      OperandData(Value *V, bool APO, bool IsUsed)
1528          : V(V), APO(APO), IsUsed(IsUsed) {}
1529      /// The operand value.
1530      Value *V = nullptr;
1531      /// TreeEntries only allow a single opcode, or an alternate sequence of
1532      /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1533      /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1534      /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1535      /// (e.g., Add/Mul)
1536      bool APO = false;
1537      /// Helper data for the reordering function.
1538      bool IsUsed = false;
1539    };
1540
1541    /// During operand reordering, we are trying to select the operand at lane
1542    /// that matches best with the operand at the neighboring lane. Our
1543    /// selection is based on the type of value we are looking for. For example,
1544    /// if the neighboring lane has a load, we need to look for a load that is
1545    /// accessing a consecutive address. These strategies are summarized in the
1546    /// 'ReorderingMode' enumerator.
1547    enum class ReorderingMode {
1548      Load,     ///< Matching loads to consecutive memory addresses
1549      Opcode,   ///< Matching instructions based on opcode (same or alternate)
1550      Constant, ///< Matching constants
1551      Splat,    ///< Matching the same instruction multiple times (broadcast)
1552      Failed,   ///< We failed to create a vectorizable group
1553    };
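    // For illustration only (hypothetical operands): if the first visited lane
    // has a load at some operand index, that index is put into Load mode and we
    // look for loads from consecutive addresses in the other lanes; a constant
    // selects Constant mode, a value that can fill the whole vector selects
    // Splat mode, and any other instruction selects Opcode mode (see the mode
    // initialization in reorder() below).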
1554
1555    using OperandDataVec = SmallVector<OperandData, 2>;
1556
1557    /// A vector of operand vectors.
1558    SmallVector<OperandDataVec, 4> OpsVec;
1559
1560    const TargetLibraryInfo &TLI;
1561    const DataLayout &DL;
1562    ScalarEvolution &SE;
1563    const BoUpSLP &R;
1564
1565    /// \returns the operand data at \p OpIdx and \p Lane.
1566    OperandData &getData(unsigned OpIdx, unsigned Lane) {
1567      return OpsVec[OpIdx][Lane];
1568    }
1569
1570    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1571    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1572      return OpsVec[OpIdx][Lane];
1573    }
1574
1575    /// Clears the used flag for all entries.
1576    void clearUsed() {
1577      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1578           OpIdx != NumOperands; ++OpIdx)
1579        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1580             ++Lane)
1581          OpsVec[OpIdx][Lane].IsUsed = false;
1582    }
1583
1584    /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1585    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1586      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1587    }
1588
1589    /// \param Lane lane of the operands under analysis.
1590    /// \param OpIdx operand index in lane \p Lane for which we're looking for
1591    /// the best candidate.
1592    /// \param Idx operand index of the current candidate value.
1593    /// \returns The additional score due to possible broadcasting of the
1594    /// elements in the lane. It is more profitable to have a power-of-2 number
1595    /// of unique elements in the lane, as it will be vectorized with higher
1596    /// probability after removing duplicates. Currently the SLP vectorizer only
1597    /// supports vectorization of a power-of-2 number of unique scalars.
1598    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1599      Value *IdxLaneV = getData(Idx, Lane).V;
1600      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1601        return 0;
1602      SmallPtrSet<Value *, 4> Uniques;
1603      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1604        if (Ln == Lane)
1605          continue;
1606        Value *OpIdxLnV = getData(OpIdx, Ln).V;
1607        if (!isa<Instruction>(OpIdxLnV))
1608          return 0;
1609        Uniques.insert(OpIdxLnV);
1610      }
1611      int UniquesCount = Uniques.size();
1612      int UniquesCntWithIdxLaneV =
1613          Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1614      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1615      int UniquesCntWithOpIdxLaneV =
1616          Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1617      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1618        return 0;
1619      return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1620              UniquesCntWithOpIdxLaneV) -
1621             (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1622    }
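    // For illustration only (hypothetical values): with 4 lanes where the other
    // three lanes at OpIdx hold {A, B, A}, Uniques == {A, B}. If the candidate
    // IdxLaneV is A (already unique) and the current OpIdxLaneV is a new value
    // C, the result is (PowerOf2Ceil(3) - 3) - (PowerOf2Ceil(2) - 2) == +1: the
    // candidate is rewarded for keeping the number of unique scalars at a
    // power of 2. With the roles swapped the result would be -1.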
1623
1624    /// \param Lane lane of the operands under analysis.
1625    /// \param OpIdx operand index in lane \p Lane for which we're looking for
1626    /// the best candidate.
1627    /// \param Idx operand index of the current candidate value.
1628    /// \returns The additional score for the scalar whose users are all
1629    /// vectorized.
1630    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1631      Value *IdxLaneV = getData(Idx, Lane).V;
1632      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1633      // Do not care about number of uses for vector-like instructions
1634      // (extractelement/extractvalue with constant indices), they are extracts
1635      // themselves and already externally used. Vectorization of such
1636      // instructions does not add extra extractelement instruction, just may
1637      // remove it.
1638      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1639          isVectorLikeInstWithConstOps(OpIdxLaneV))
1640        return LookAheadHeuristics::ScoreAllUserVectorized;
1641      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1642      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1643        return 0;
1644      return R.areAllUsersVectorized(IdxLaneI)
1645                 ? LookAheadHeuristics::ScoreAllUserVectorized
1646                 : 0;
1647    }
1648
1649    /// Score scaling factor for fully compatible instructions but with a
1650    /// different number of external uses. Allows better selection of
1651    /// instructions with fewer external uses.
1652    static const int ScoreScaleFactor = 10;
1653
1654    /// \Returns the look-ahead score, which tells us how much the sub-trees
1655    /// rooted at \p LHS and \p RHS match, the more they match the higher the
1656    /// score. This helps break ties in an informed way when we cannot decide on
1657    /// the order of the operands by just considering the immediate
1658    /// predecessors.
1659    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1660                          int Lane, unsigned OpIdx, unsigned Idx,
1661                          bool &IsUsed) {
1662      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1663                                    LookAheadMaxDepth);
1664      // Keep track of the instruction stack as we recurse into the operands
1665      // during the look-ahead score exploration.
1666      int Score =
1667          LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1668                                       /*CurrLevel=*/1, MainAltOps);
1669      if (Score) {
1670        int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1671        if (Score <= -SplatScore) {
1672          // Set the minimum score for splat-like sequence to avoid setting
1673          // failed state.
1674          Score = 1;
1675        } else {
1676          Score += SplatScore;
1677          // Scale score to see the difference between different operands
1678          // and similar operands but all vectorized/not all vectorized
1679          // uses. It does not affect actual selection of the best
1680          // compatible operand in general, just allows to select the
1681          // operand with all vectorized uses.
1682          Score *= ScoreScaleFactor;
1683          Score += getExternalUseScore(Lane, OpIdx, Idx);
1684          IsUsed = true;
1685        }
1686      }
1687      return Score;
1688    }
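    // For illustration only (hypothetical values): a base look-ahead score of 4
    // (consecutive loads) with a splat score of +1 becomes (4 + 1) * 10 == 50,
    // plus 1 more if all users of the candidate are already vectorized. The
    // scaling keeps the external-use bonus from overriding a genuinely better
    // operand match.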
1689
1690    /// Best defined scores per lanes between the passes. Used to choose the
1691    /// best operand (with the highest score) between the passes.
1692    /// The key - {Operand Index, Lane}.
1693    /// The value - the best score between the passes for the lane and the
1694    /// operand.
1695    SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1696        BestScoresPerLanes;
1697
1698    // Search all operands in Ops[*][Lane] for the one that best matches
1699    // Ops[OpIdx][LastLane] and return its operand index.
1700    // If no good match can be found, return std::nullopt.
1701    std::optional<unsigned>
1702    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1703                   ArrayRef<ReorderingMode> ReorderingModes,
1704                   ArrayRef<Value *> MainAltOps) {
1705      unsigned NumOperands = getNumOperands();
1706
1707      // The operand of the previous lane at OpIdx.
1708      Value *OpLastLane = getData(OpIdx, LastLane).V;
1709
1710      // Our strategy mode for OpIdx.
1711      ReorderingMode RMode = ReorderingModes[OpIdx];
1712      if (RMode == ReorderingMode::Failed)
1713        return std::nullopt;
1714
1715      // The linearized opcode of the operand at OpIdx, Lane.
1716      bool OpIdxAPO = getData(OpIdx, Lane).APO;
1717
1718      // The best operand index and its score.
1719      // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1720      // are using the score to differentiate between the two.
1721      struct BestOpData {
1722        std::optional<unsigned> Idx;
1723        unsigned Score = 0;
1724      } BestOp;
1725      BestOp.Score =
1726          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1727              .first->second;
1728
1729      // Track if the operand must be marked as used. If the operand is set to
1730      // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
1731      // want to re-estimate the operands again on the following iterations.
1732      bool IsUsed =
1733          RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1734      // Iterate through all unused operands and look for the best.
1735      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1736        // Get the operand at Idx and Lane.
1737        OperandData &OpData = getData(Idx, Lane);
1738        Value *Op = OpData.V;
1739        bool OpAPO = OpData.APO;
1740
1741        // Skip already selected operands.
1742        if (OpData.IsUsed)
1743          continue;
1744
1745        // Skip if we are trying to move the operand to a position with a
1746        // different opcode in the linearized tree form. This would break the
1747        // semantics.
1748        if (OpAPO != OpIdxAPO)
1749          continue;
1750
1751        // Look for an operand that matches the current mode.
1752        switch (RMode) {
1753        case ReorderingMode::Load:
1754        case ReorderingMode::Constant:
1755        case ReorderingMode::Opcode: {
1756          bool LeftToRight = Lane > LastLane;
1757          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1758          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1759          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1760                                        OpIdx, Idx, IsUsed);
1761          if (Score > static_cast<int>(BestOp.Score)) {
1762            BestOp.Idx = Idx;
1763            BestOp.Score = Score;
1764            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1765          }
1766          break;
1767        }
1768        case ReorderingMode::Splat:
1769          if (Op == OpLastLane)
1770            BestOp.Idx = Idx;
1771          break;
1772        case ReorderingMode::Failed:
1773          llvm_unreachable("Not expected Failed reordering mode.");
1774        }
1775      }
1776
1777      if (BestOp.Idx) {
1778        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1779        return BestOp.Idx;
1780      }
1781      // If we could not find a good match return std::nullopt.
1782      return std::nullopt;
1783    }
1784
1785    /// Helper for reorderOperandVecs.
1786    /// \returns the lane that we should start reordering from. This is the one
1787    /// with the fewest operands that can freely move about, or the least
1788    /// profitable one because it already has the most optimal set of operands.
1789    unsigned getBestLaneToStartReordering() const {
1790      unsigned Min = UINT_MAX;
1791      unsigned SameOpNumber = 0;
1792      // std::pair<unsigned, unsigned> is used to implement a simple voting
1793      // algorithm and choose the lane with the fewest operands that can freely
1794      // move about, or the least profitable lane because it already has the
1795      // most optimal set of operands. The first unsigned of the pair counts the
1796      // votes for a given operand-order hash, and the second unsigned is the
1797      // index of the lane that produced that hash.
1798      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
1799      // Try to be closer to the original results, if we have multiple lanes
1800      // with same cost. If 2 lanes have the same cost, use the one with the
1801      // lowest index.
1802      for (int I = getNumLanes(); I > 0; --I) {
1803        unsigned Lane = I - 1;
1804        OperandsOrderData NumFreeOpsHash =
1805            getMaxNumOperandsThatCanBeReordered(Lane);
1806        // Compare the number of operands that can move and choose the one with
1807        // the least number.
1808        if (NumFreeOpsHash.NumOfAPOs < Min) {
1809          Min = NumFreeOpsHash.NumOfAPOs;
1810          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1811          HashMap.clear();
1812          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1813        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1814                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1815          // Select the most optimal lane in terms of number of operands that
1816          // should be moved around.
1817          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1818          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1819        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1820                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1821          auto *It = HashMap.find(NumFreeOpsHash.Hash);
1822          if (It == HashMap.end())
1823            HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1824          else
1825            ++It->second.first;
1826        }
1827      }
1828      // Select the lane with the minimum counter.
1829      unsigned BestLane = 0;
1830      unsigned CntMin = UINT_MAX;
1831      for (const auto &Data : reverse(HashMap)) {
1832        if (Data.second.first < CntMin) {
1833          CntMin = Data.second.first;
1834          BestLane = Data.second.second;
1835        }
1836      }
1837      return BestLane;
1838    }
1839
1840    /// Data structure that helps to reorder operands.
1841    struct OperandsOrderData {
1842      /// The best number of operands with the same APOs, which can be
1843      /// reordered.
1844      unsigned NumOfAPOs = UINT_MAX;
1845      /// Number of operands with the same/alternate instruction opcode and
1846      /// parent.
1847      unsigned NumOpsWithSameOpcodeParent = 0;
1848      /// Hash for the actual operands ordering.
1849      /// It combines each operand's position id and value id. It is used in
1850      /// the voting mechanism to find the lane with the fewest operands that
1851      /// can freely move about, or the least profitable lane because it already
1852      /// has the most optimal set of operands. Could be replaced with a
1853      /// SmallVector<unsigned> instead, but the hash code is faster and
1854      /// requires less memory.
1855      unsigned Hash = 0;
1856    };
1857    /// \returns the maximum number of operands that are allowed to be reordered
1858    /// for \p Lane and the number of compatible instructions (with the same
1859    /// parent/opcode). This is used as a heuristic for selecting the first lane
1860    /// to start operand reordering.
1861    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1862      unsigned CntTrue = 0;
1863      unsigned NumOperands = getNumOperands();
1864      // Operands with the same APO can be reordered. We therefore need to count
1865      // how many of them we have for each APO, like this: Cnt[APO] = x.
1866      // Since we only have two APOs, namely true and false, we can avoid using
1867      // a map. Instead we can simply count the number of operands that
1868      // correspond to one of them (in this case the 'true' APO), and calculate
1869      // the other by subtracting it from the total number of operands.
1870      // Operands with the same instruction opcode and parent are more
1871      // profitable since we don't need to move them in many cases, with a high
1872      // probability such lane already can be vectorized effectively.
1873      bool AllUndefs = true;
1874      unsigned NumOpsWithSameOpcodeParent = 0;
1875      Instruction *OpcodeI = nullptr;
1876      BasicBlock *Parent = nullptr;
1877      unsigned Hash = 0;
1878      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1879        const OperandData &OpData = getData(OpIdx, Lane);
1880        if (OpData.APO)
1881          ++CntTrue;
1882        // Use Boyer-Moore majority voting for finding the majority opcode and
1883        // the number of times it occurs.
1884        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1885          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
1886              I->getParent() != Parent) {
1887            if (NumOpsWithSameOpcodeParent == 0) {
1888              NumOpsWithSameOpcodeParent = 1;
1889              OpcodeI = I;
1890              Parent = I->getParent();
1891            } else {
1892              --NumOpsWithSameOpcodeParent;
1893            }
1894          } else {
1895            ++NumOpsWithSameOpcodeParent;
1896          }
1897        }
1898        Hash = hash_combine(
1899            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1900        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1901      }
1902      if (AllUndefs)
1903        return {};
1904      OperandsOrderData Data;
1905      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
1906      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
1907      Data.Hash = Hash;
1908      return Data;
1909    }
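    // For illustration only (hypothetical two-operand lanes): an 'add' lane has
    // both operands with APO == false, so CntTrue == 0 and NumOfAPOs ==
    // max(0, 2) == 2, while a 'sub' lane has one operand with APO == true, so
    // NumOfAPOs == max(1, 1) == 1. The subtraction lane therefore wins the vote
    // in getBestLaneToStartReordering() above, matching the example in
    // reorder() below where reordering starts at a '-' lane.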
1910
1911    /// Go through the instructions in VL and append their operands.
1912    void appendOperandsOfVL(ArrayRef<Value *> VL) {
1913      assert(!VL.empty() && "Bad VL");
1914      assert((empty() || VL.size() == getNumLanes()) &&
1915             "Expected same number of lanes");
1916      assert(isa<Instruction>(VL[0]) && "Expected instruction");
1917      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1918      OpsVec.resize(NumOperands);
1919      unsigned NumLanes = VL.size();
1920      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1921        OpsVec[OpIdx].resize(NumLanes);
1922        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1923          assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
1924          // Our tree has just 3 nodes: the root and two operands.
1925          // It is therefore trivial to get the APO. We only need to check the
1926          // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
1927          // RHS operand. The LHS operand of both add and sub is never attached
1928          // to an inverse operation in the linearized form, therefore its APO
1929          // is false. The RHS is true only if VL[Lane] is an inverse operation.
1930
1931          // Since operand reordering is performed on groups of commutative
1932          // operations or alternating sequences (e.g., +, -), we can safely
1933          // tell the inverse operations by checking commutativity.
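          // For illustration only (hypothetical scalars): for
          // VL = {a0 + b0, a1 - b1}, the operand at OpIdx == 0 gets
          // APO == false in both lanes, while the operand at OpIdx == 1 gets
          // APO == false in lane 0 (the 'add') and APO == true in lane 1 (the
          // non-commutative 'sub').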
1934          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
1935          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
1936          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1937                                 APO, false};
1938        }
1939      }
1940    }
1941
1942    /// \returns the number of operands.
1943    unsigned getNumOperands() const { return OpsVec.size(); }
1944
1945    /// \returns the number of lanes.
1946    unsigned getNumLanes() const { return OpsVec[0].size(); }
1947
1948    /// \returns the operand value at \p OpIdx and \p Lane.
1949    Value *getValue(unsigned OpIdx, unsigned Lane) const {
1950      return getData(OpIdx, Lane).V;
1951    }
1952
1953    /// \returns true if the data structure is empty.
1954    bool empty() const { return OpsVec.empty(); }
1955
1956    /// Clears the data.
1957    void clear() { OpsVec.clear(); }
1958
1959    /// \Returns true if there are enough operands identical to \p Op to fill
1960    /// the whole vector.
1961    /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
1962    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
1963      bool OpAPO = getData(OpIdx, Lane).APO;
1964      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
1965        if (Ln == Lane)
1966          continue;
1967        // This is set to true if we found a candidate for broadcast at Lane.
1968        bool FoundCandidate = false;
1969        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
1970          OperandData &Data = getData(OpI, Ln);
1971          if (Data.APO != OpAPO || Data.IsUsed)
1972            continue;
1973          if (Data.V == Op) {
1974            FoundCandidate = true;
1975            Data.IsUsed = true;
1976            break;
1977          }
1978        }
1979        if (!FoundCandidate)
1980          return false;
1981      }
1982      return true;
1983    }
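    // For illustration only (hypothetical operands): with lanes
    // {A + X, A + Y, A + Z}, shouldBroadcast(A, /*OpIdx=*/0, /*Lane=*/0)
    // returns true because an unused operand equal to A (with the same APO) is
    // found in every other lane, so A could fill the whole vector as a splat.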
1984
1985  public:
1986    /// Initialize with all the operands of the instruction vector \p RootVL.
1987    VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI,
1988               const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
1989        : TLI(TLI), DL(DL), SE(SE), R(R) {
1990      // Append all the operands of RootVL.
1991      appendOperandsOfVL(RootVL);
1992    }
1993
1994    /// \Returns a value vector with the operands across all lanes for the
1995    /// operand at \p OpIdx.
1996    ValueList getVL(unsigned OpIdx) const {
1997      ValueList OpVL(OpsVec[OpIdx].size());
1998      assert(OpsVec[OpIdx].size() == getNumLanes() &&
1999             "Expected same num of lanes across all operands");
2000      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2001        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2002      return OpVL;
2003    }
2004
2005    // Performs operand reordering for 2 or more operands.
2006    // The operands are reordered in place in OpsVec[OpIdx][Lane], swapping
2007    // entries within each lane so that matching operands line up across lanes.
2008    void reorder() {
2009      unsigned NumOperands = getNumOperands();
2010      unsigned NumLanes = getNumLanes();
2011      // Each operand has its own mode. We are using this mode to help us select
2012      // the instructions for each lane, so that they match best with the ones
2013      // we have selected so far.
2014      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2015
2016      // This is a greedy single-pass algorithm. We are going over each lane
2017      // once and deciding on the best order right away with no back-tracking.
2018      // However, in order to increase its effectiveness, we start with the lane
2019      // that has operands that can move the least. For example, given the
2020      // following lanes:
2021      //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
2022      //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
2023      //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
2024      //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
2025      // we will start at Lane 1, since the operands of the subtraction cannot
2026      // be reordered. Then we will visit the rest of the lanes in a circular
2027      // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
2028
2029      // Find the first lane that we will start our search from.
2030      unsigned FirstLane = getBestLaneToStartReordering();
2031
2032      // Initialize the modes.
2033      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2034        Value *OpLane0 = getValue(OpIdx, FirstLane);
2035        // Keep track if we have instructions with all the same opcode on one
2036        // side.
2037        if (isa<LoadInst>(OpLane0))
2038          ReorderingModes[OpIdx] = ReorderingMode::Load;
2039        else if (isa<Instruction>(OpLane0)) {
2040          // Check if OpLane0 should be broadcast.
2041          if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2042            ReorderingModes[OpIdx] = ReorderingMode::Splat;
2043          else
2044            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2045        }
2046        else if (isa<Constant>(OpLane0))
2047          ReorderingModes[OpIdx] = ReorderingMode::Constant;
2048        else if (isa<Argument>(OpLane0))
2049          // Our best hope is a Splat. It may save some cost in some cases.
2050          ReorderingModes[OpIdx] = ReorderingMode::Splat;
2051        else
2052          // NOTE: This should be unreachable.
2053          ReorderingModes[OpIdx] = ReorderingMode::Failed;
2054      }
2055
2056      // Check that we don't have the same operands. There is no need to reorder
2057      // if the operands are just a perfect diamond or shuffled diamond match;
2058      // reordering is still done for possible broadcasts or a non-power-of-2
2059      // number of scalars (just for now).
2060      auto &&SkipReordering = [this]() {
2061        SmallPtrSet<Value *, 4> UniqueValues;
2062        ArrayRef<OperandData> Op0 = OpsVec.front();
2063        for (const OperandData &Data : Op0)
2064          UniqueValues.insert(Data.V);
2065        for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2066          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2067                return !UniqueValues.contains(Data.V);
2068              }))
2069            return false;
2070        }
2071        // TODO: Check if we can remove a check for non-power-2 number of
2072        // scalars after full support of non-power-2 vectorization.
2073        return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2074      };
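      // For illustration only (hypothetical scalars): with 4 lanes whose
      // operand columns are Op0 == {A, B, C, D} and Op1 == {B, A, D, C}, every
      // value of Op1 already appears in Op0 and the 4 unique scalars are a
      // power of 2, so SkipReordering() returns true and the shuffled diamond
      // match is left as-is.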
2075
2076      // If the initial strategy fails for any of the operand indexes, then we
2077      // perform reordering again in a second pass. This helps avoid assigning
2078      // high priority to the failed strategy, and should improve reordering for
2079      // the non-failed operand indexes.
2080      for (int Pass = 0; Pass != 2; ++Pass) {
2081        // Check if there is no need to reorder operands since they are a perfect or
2082        // shuffled diamond match.
2083        // Need to do it to avoid extra external use cost counting for
2084        // shuffled matches, which may cause regressions.
2085        if (SkipReordering())
2086          break;
2087        // Skip the second pass if the first pass did not fail.
2088        bool StrategyFailed = false;
2089        // Mark all operand data as free to use.
2090        clearUsed();
2091        // We keep the original operand order for the FirstLane, so reorder the
2092        // rest of the lanes. We are visiting the nodes in a circular fashion,
2093        // using FirstLane as the center point and increasing the radius
2094        // distance.
2095        SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2096        for (unsigned I = 0; I < NumOperands; ++I)
2097          MainAltOps[I].push_back(getData(I, FirstLane).V);
2098
2099        for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2100          // Visit the lane on the right and then the lane on the left.
2101          for (int Direction : {+1, -1}) {
2102            int Lane = FirstLane + Direction * Distance;
2103            if (Lane < 0 || Lane >= (int)NumLanes)
2104              continue;
2105            int LastLane = Lane - Direction;
2106            assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2107                   "Out of bounds");
2108            // Look for a good match for each operand.
2109            for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2110              // Search for the operand that best matches the one chosen for OpIdx in LastLane.
2111              std::optional<unsigned> BestIdx = getBestOperand(
2112                  OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2113              // By not selecting a value, we allow the operands that follow to
2114              // select a better matching value. We will get a non-null value in
2115              // the next run of getBestOperand().
2116              if (BestIdx) {
2117                // Swap the current operand with the one returned by
2118                // getBestOperand().
2119                swap(OpIdx, *BestIdx, Lane);
2120              } else {
2121                // We failed to find a best operand, set mode to 'Failed'.
2122                ReorderingModes[OpIdx] = ReorderingMode::Failed;
2123                // Enable the second pass.
2124                StrategyFailed = true;
2125              }
2126              // Try to get the alternate opcode and follow it during analysis.
2127              if (MainAltOps[OpIdx].size() != 2) {
2128                OperandData &AltOp = getData(OpIdx, Lane);
2129                InstructionsState OpS =
2130                    getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2131                if (OpS.getOpcode() && OpS.isAltShuffle())
2132                  MainAltOps[OpIdx].push_back(AltOp.V);
2133              }
2134            }
2135          }
2136        }
2137        // Skip second pass if the strategy did not fail.
2138        if (!StrategyFailed)
2139          break;
2140      }
2141    }
2142
2143#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2144    LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2145      switch (RMode) {
2146      case ReorderingMode::Load:
2147        return "Load";
2148      case ReorderingMode::Opcode:
2149        return "Opcode";
2150      case ReorderingMode::Constant:
2151        return "Constant";
2152      case ReorderingMode::Splat:
2153        return "Splat";
2154      case ReorderingMode::Failed:
2155        return "Failed";
2156      }
2157      llvm_unreachable("Unimplemented Reordering Type");
2158    }
2159
2160    LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2161                                                   raw_ostream &OS) {
2162      return OS << getModeStr(RMode);
2163    }
2164
2165    /// Debug print.
2166    LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2167      printMode(RMode, dbgs());
2168    }
2169
2170    friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2171      return printMode(RMode, OS);
2172    }
2173
2174    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2175      const unsigned Indent = 2;
2176      unsigned Cnt = 0;
2177      for (const OperandDataVec &OpDataVec : OpsVec) {
2178        OS << "Operand " << Cnt++ << "\n";
2179        for (const OperandData &OpData : OpDataVec) {
2180          OS.indent(Indent) << "{";
2181          if (Value *V = OpData.V)
2182            OS << *V;
2183          else
2184            OS << "null";
2185          OS << ", APO:" << OpData.APO << "}\n";
2186        }
2187        OS << "\n";
2188      }
2189      return OS;
2190    }
2191
2192    /// Debug print.
2193    LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2194#endif
2195  };
2196
2197  /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2198  /// of the pair with the highest score, deemed to have the best chance to form
2199  /// the root of a profitable tree to vectorize. Return std::nullopt if no
2200  /// candidate scored above LookAheadHeuristics::ScoreFail.
2201  /// \param Limit Lower bound of the score; only candidates scoring above it count.
2202  std::optional<int>
2203  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2204                   int Limit = LookAheadHeuristics::ScoreFail) {
2205    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2206                                  RootLookAheadMaxDepth);
2207    int BestScore = Limit;
2208    std::optional<int> Index;
2209    for (int I : seq<int>(0, Candidates.size())) {
2210      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2211                                               Candidates[I].second,
2212                                               /*U1=*/nullptr, /*U2=*/nullptr,
2213                                               /*Level=*/1, std::nullopt);
2214      if (Score > BestScore) {
2215        BestScore = Score;
2216        Index = I;
2217      }
2218    }
2219    return Index;
2220  }
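  // For illustration only (hypothetical candidates): given
  //   Candidates == {{load A[0], load A[1]}, {load A[0], load B[0]}},
  // the first pair scores ScoreConsecutiveLoads while the second scores
  // ScoreFail, so findBestRootPair() returns 0; if no candidate scores above
  // Limit, it returns std::nullopt.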
2221
2222  /// Checks if the instruction is marked for deletion.
2223  bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2224
2225  /// Removes an instruction from its block and eventually deletes it.
2226  /// It's like Instruction::eraseFromParent() except that the actual deletion
2227  /// is delayed until BoUpSLP is destructed.
2228  void eraseInstruction(Instruction *I) {
2229    DeletedInstructions.insert(I);
2230  }
2231
2232  /// Checks if the instruction was already analyzed for being a possible
2233  /// reduction root.
2234  bool isAnalyzedReductionRoot(Instruction *I) const {
2235    return AnalyzedReductionsRoots.count(I);
2236  }
2237  /// Register the given instruction as already analyzed for being a possible
2238  /// reduction root.
2239  void analyzedReductionRoot(Instruction *I) {
2240    AnalyzedReductionsRoots.insert(I);
2241  }
2242  /// Checks if the provided list of reduced values was checked already for
2243  /// vectorization.
2244  bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2245    return AnalyzedReductionVals.contains(hash_value(VL));
2246  }
2247  /// Adds the list of reduced values to list of already checked values for the
2248  /// vectorization.
2249  void analyzedReductionVals(ArrayRef<Value *> VL) {
2250    AnalyzedReductionVals.insert(hash_value(VL));
2251  }
2252  /// Clear the list of the analyzed reduction root instructions.
2253  void clearReductionData() {
2254    AnalyzedReductionsRoots.clear();
2255    AnalyzedReductionVals.clear();
2256  }
2257  /// Checks if the given value is gathered in one of the nodes.
2258  bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2259    return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2260  }
2261
2262  /// Check if the value is vectorized in the tree.
2263  bool isVectorized(Value *V) const { return getTreeEntry(V); }
2264
2265  ~BoUpSLP();
2266
2267private:
2268  /// Determine if a vectorized value \p V can be demoted to
2269  /// a smaller type with a truncation. We collect the values that will be
2270  /// demoted in ToDemote and additional roots that require investigating in
2271  /// Roots.
2272  /// \param DemotedConsts list of Instruction/OperandIndex pairs that are
2273  /// constant and to be demoted. Required to correctly identify constant nodes
2274  /// to be demoted.
2275  bool collectValuesToDemote(
2276      Value *V, SmallVectorImpl<Value *> &ToDemote,
2277      DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
2278      SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
2279
2280  /// Check if the operands on the edges \p Edges of the \p UserTE allow
2281  /// reordering (i.e. the operands can be reordered because they have only one
2282  /// user and are reorderable).
2283  /// \param ReorderableGathers List of all gather nodes that require reordering
2284  /// (e.g., gather of extractelements or partially vectorizable loads).
2285  /// \param GatherOps List of gather operand nodes for \p UserTE that require
2286  /// reordering, subset of \p NonVectorized.
2287  bool
2288  canReorderOperands(TreeEntry *UserTE,
2289                     SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2290                     ArrayRef<TreeEntry *> ReorderableGathers,
2291                     SmallVectorImpl<TreeEntry *> &GatherOps);
2292
2293  /// Checks if the given \p TE is a gather node with clustered reused scalars
2294  /// and reorders it per given \p Mask.
2295  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2296
2297  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2298  /// if any. If it is not vectorized (gather node), returns nullptr.
2299  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2300    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2301    TreeEntry *TE = nullptr;
2302    const auto *It = find_if(VL, [&](Value *V) {
2303      TE = getTreeEntry(V);
2304      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2305        return true;
2306      auto It = MultiNodeScalars.find(V);
2307      if (It != MultiNodeScalars.end()) {
2308        for (TreeEntry *E : It->second) {
2309          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2310            TE = E;
2311            return true;
2312          }
2313        }
2314      }
2315      return false;
2316    });
2317    if (It != VL.end()) {
2318      assert(TE->isSame(VL) && "Expected same scalars.");
2319      return TE;
2320    }
2321    return nullptr;
2322  }
2323
2324  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2325  /// if any. If it is not vectorized (gather node), returns nullptr.
2326  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2327                                        unsigned OpIdx) const {
2328    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2329        const_cast<TreeEntry *>(UserTE), OpIdx);
2330  }
2331
2332  /// Checks if all users of \p I are the part of the vectorization tree.
2333  bool areAllUsersVectorized(
2334      Instruction *I,
2335      const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2336
2337  /// Return information about the vector formed for the specified index
2338  /// of a vector of (the same) instruction.
2339  TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2340
2341  /// \returns the graph entry for the \p Idx operand of the \p E entry.
2342  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2343
2344  /// \returns the cost of the vectorizable entry.
2345  InstructionCost getEntryCost(const TreeEntry *E,
2346                               ArrayRef<Value *> VectorizedVals,
2347                               SmallPtrSetImpl<Value *> &CheckedExtracts);
2348
2349  /// This is the recursive part of buildTree.
2350  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2351                     const EdgeInfo &EI);
2352
2353  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2354  /// be vectorized to use the original vector (or aggregate "bitcast" to a
2355  /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2356  /// returns false, setting \p CurrentOrder to either an empty vector or a
2357  /// non-identity permutation that allows to reuse extract instructions.
2358  /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2359  /// extract order.
2360  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2361                       SmallVectorImpl<unsigned> &CurrentOrder,
2362                       bool ResizeAllowed = false) const;
2363
2364  /// Vectorize a single entry in the tree.
2365  /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2366  /// avoid issues with def-use order.
2367  Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2368
2369  /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2370  /// \p E.
2371  /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2372  /// avoid issues with def-use order.
2373  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2374
2375  /// Create a new vector from a list of scalar values.  Produces a sequence
2376  /// which exploits values reused across lanes, and arranges the inserts
2377  /// for ease of later optimization.
2378  template <typename BVTy, typename ResTy, typename... Args>
2379  ResTy processBuildVector(const TreeEntry *E, Args &...Params);
2380
2381  /// Create a new vector from a list of scalar values.  Produces a sequence
2382  /// which exploits values reused across lanes, and arranges the inserts
2383  /// for ease of later optimization.
2384  Value *createBuildVector(const TreeEntry *E);
2385
2386  /// Returns the instruction in the bundle, which can be used as a base point
2387  /// for scheduling. Usually it is the last instruction in the bundle, except
2388  /// for the case when all operands are external (in this case, it is the first
2389  /// instruction in the list).
2390  Instruction &getLastInstructionInBundle(const TreeEntry *E);
2391
2392  /// Tries to find extractelement instructions with constant indices from fixed
2393  /// vector type and gather such instructions into a bunch, which highly likely
2394  /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2395  /// was successful, the matched scalars are replaced by poison values in \p VL
2396  /// for future analysis.
2397  std::optional<TargetTransformInfo::ShuffleKind>
2398  tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2399                                           SmallVectorImpl<int> &Mask) const;
2400
2401  /// Tries to find extractelement instructions with constant indices from fixed
2402  /// vector type and gather such instructions into a bunch, which highly likely
2403  /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2404  /// was successful, the matched scalars are replaced by poison values in \p VL
2405  /// for future analysis.
2406  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2407  tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2408                             SmallVectorImpl<int> &Mask,
2409                             unsigned NumParts) const;
2410
2411  /// Checks if the gathered \p VL can be represented as a single register
2412  /// shuffle(s) of previous tree entries.
2413  /// \param TE Tree entry checked for permutation.
2414  /// \param VL List of scalars (a subset of the TE scalar), checked for
2415  /// permutations. Must form single-register vector.
2416  /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2417  /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2418  std::optional<TargetTransformInfo::ShuffleKind>
2419  isGatherShuffledSingleRegisterEntry(
2420      const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2421      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part);
2422
2423  /// Checks if the gathered \p VL can be represented as multi-register
2424  /// shuffle(s) of previous tree entries.
2425  /// \param TE Tree entry checked for permutation.
2426  /// \param VL List of scalars (a subset of the TE scalar), checked for
2427  /// permutations.
2428  /// \returns per-register series of ShuffleKind, if gathered values can be
2429  /// represented as shuffles of previous tree entries. \p Mask is filled with
2430  /// the shuffle mask (also on per-register base).
2431  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2432  isGatherShuffledEntry(
2433      const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2434      SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2435      unsigned NumParts);
2436
2437  /// \returns the scalarization cost for this list of values. Assuming that
2438  /// this subtree gets vectorized, we may need to extract the values from the
2439  /// roots. This method calculates the cost of extracting the values.
2440  /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2441  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
2442
2443  /// Set the Builder insert point to one after the last instruction in
2444  /// the bundle
2445  void setInsertPointAfterBundle(const TreeEntry *E);
2446
2447  /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
2448  /// specified, the starting vector value is poison.
2449  Value *gather(ArrayRef<Value *> VL, Value *Root);
2450
2451  /// \returns whether the VectorizableTree is fully vectorizable and will
2452  /// be beneficial even if the tree height is tiny.
2453  bool isFullyVectorizableTinyTree(bool ForReduction) const;
2454
2455  /// Reorder commutative or alt operands to get better probability of
2456  /// generating vectorized code.
2457  static void reorderInputsAccordingToOpcode(
2458      ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
2459      SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
2460      const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);
2461
2462  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2463  /// users of \p TE and collects the stores. It returns the map from the store
2464  /// pointers to the collected stores.
2465  DenseMap<Value *, SmallVector<StoreInst *>>
2466  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2467
2468  /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2469  /// stores in \p StoresVec can form a vector instruction. If so it returns
2470  /// true and populates \p ReorderIndices with the shuffle indices of the
2471  /// stores when compared to the sorted vector.
2472  bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2473                     OrdersType &ReorderIndices) const;
2474
2475  /// Iterates through the users of \p TE, looking for scalar stores that can be
2476  /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2477  /// their order and builds an order index vector for each store bundle. It
2478  /// returns all these order vectors found.
2479  /// We run this after the tree has formed, otherwise we may come across user
2480  /// instructions that are not yet in the tree.
2481  SmallVector<OrdersType, 1>
2482  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2483
2484  struct TreeEntry {
2485    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2486    TreeEntry(VecTreeTy &Container) : Container(Container) {}
2487
2488    /// \returns Common mask for reorder indices and reused scalars.
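    /// For illustration only (values chosen arbitrarily, not taken from any
    /// particular input): with ReorderIndices = {2, 0, 1, 3} the inverse
    /// permutation is {1, 2, 0, 3}; applying
    /// ReuseShuffleIndices = {0, 0, 1, 2, 3, 3} on top of it yields the common
    /// mask {1, 1, 2, 0, 3, 3}.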
2489    SmallVector<int> getCommonMask() const {
2490      SmallVector<int> Mask;
2491      inversePermutation(ReorderIndices, Mask);
2492      ::addMask(Mask, ReuseShuffleIndices);
2493      return Mask;
2494    }
2495
2496    /// \returns true if the scalars in VL are equal to this entry.
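    /// For illustration only (hypothetical scalars, assuming ReorderIndices is
    /// empty): with Scalars = {a, b} and ReuseShuffleIndices = {0, 1, 0, 1},
    /// the list VL = {a, b, a, b} is considered the same as this entry.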
2497    bool isSame(ArrayRef<Value *> VL) const {
2498      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2499        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2500          return std::equal(VL.begin(), VL.end(), Scalars.begin());
2501        return VL.size() == Mask.size() &&
2502               std::equal(VL.begin(), VL.end(), Mask.begin(),
2503                          [Scalars](Value *V, int Idx) {
2504                            return (isa<UndefValue>(V) &&
2505                                    Idx == PoisonMaskElem) ||
2506                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
2507                          });
2508      };
2509      if (!ReorderIndices.empty()) {
2510        // TODO: implement matching if the nodes are just reordered, still can
2511        // treat the vector as the same if the list of scalars matches VL
2512        // directly, without reordering.
2513        SmallVector<int> Mask;
2514        inversePermutation(ReorderIndices, Mask);
2515        if (VL.size() == Scalars.size())
2516          return IsSame(Scalars, Mask);
2517        if (VL.size() == ReuseShuffleIndices.size()) {
2518          ::addMask(Mask, ReuseShuffleIndices);
2519          return IsSame(Scalars, Mask);
2520        }
2521        return false;
2522      }
2523      return IsSame(Scalars, ReuseShuffleIndices);
2524    }
2525
2526    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2527      return State == TreeEntry::NeedToGather &&
2528             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2529             UserTreeIndices.front().UserTE == UserEI.UserTE;
2530    }
2531
    /// \returns true if the current entry has the same operands as \p TE.
2533    bool hasEqualOperands(const TreeEntry &TE) const {
2534      if (TE.getNumOperands() != getNumOperands())
2535        return false;
2536      SmallBitVector Used(getNumOperands());
2537      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2538        unsigned PrevCount = Used.count();
2539        for (unsigned K = 0; K < E; ++K) {
2540          if (Used.test(K))
2541            continue;
2542          if (getOperand(K) == TE.getOperand(I)) {
2543            Used.set(K);
2544            break;
2545          }
2546        }
2547        // Check if we actually found the matching operand.
2548        if (PrevCount == Used.count())
2549          return false;
2550      }
2551      return true;
2552    }
2553
    /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those used several times in the
    /// entry and counted in the \a ReuseShuffleIndices, if any.
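    /// For illustration only (hypothetical values): an entry with 3 scalars
    /// and ReuseShuffleIndices = {0, 1, 2, 0} has a vector factor of 4, while
    /// the same entry without reuse indices has a vector factor of 3.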
2557    unsigned getVectorFactor() const {
2558      if (!ReuseShuffleIndices.empty())
2559        return ReuseShuffleIndices.size();
2560      return Scalars.size();
    }
2562
2563    /// A vector of scalars.
2564    ValueList Scalars;
2565
2566    /// The Scalars are vectorized into this value. It is initialized to Null.
2567    WeakTrackingVH VectorizedValue = nullptr;
2568
    /// The new vector phi instruction emitted for a vectorized phi node.
2570    PHINode *PHI = nullptr;
2571
2572    /// Do we need to gather this sequence or vectorize it
2573    /// (either with vector instruction or with scatter/gather
2574    /// intrinsics for store/load)?
2575    enum EntryState {
2576      Vectorize,
2577      ScatterVectorize,
2578      PossibleStridedVectorize,
2579      NeedToGather
2580    };
2581    EntryState State;
2582
2583    /// Does this sequence require some shuffling?
2584    SmallVector<int, 4> ReuseShuffleIndices;
2585
2586    /// Does this entry require reordering?
2587    SmallVector<unsigned, 4> ReorderIndices;
2588
2589    /// Points back to the VectorizableTree.
2590    ///
2591    /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
2592    /// to be a pointer and needs to be able to initialize the child iterator.
2593    /// Thus we need a reference back to the container to translate the indices
2594    /// to entries.
2595    VecTreeTy &Container;
2596
2597    /// The TreeEntry index containing the user of this entry.  We can actually
2598    /// have multiple users so the data structure is not truly a tree.
2599    SmallVector<EdgeInfo, 1> UserTreeIndices;
2600
    /// The index of this TreeEntry in VectorizableTree.
2602    int Idx = -1;
2603
2604  private:
    /// The operands of each instruction in each lane: Operands[op_index][lane].
2606    /// Note: This helps avoid the replication of the code that performs the
2607    /// reordering of operands during buildTree_rec() and vectorizeTree().
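    /// For illustration only (hypothetical scalars): for a bundle of two adds,
    /// {add a0, b0} and {add a1, b1}, Operands[0] = {a0, a1} and
    /// Operands[1] = {b0, b1}.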
2608    SmallVector<ValueList, 2> Operands;
2609
2610    /// The main/alternate instruction.
2611    Instruction *MainOp = nullptr;
2612    Instruction *AltOp = nullptr;
2613
2614  public:
2615    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2616    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2617      if (Operands.size() < OpIdx + 1)
2618        Operands.resize(OpIdx + 1);
2619      assert(Operands[OpIdx].empty() && "Already resized?");
2620      assert(OpVL.size() <= Scalars.size() &&
2621             "Number of operands is greater than the number of scalars.");
2622      Operands[OpIdx].resize(OpVL.size());
2623      copy(OpVL, Operands[OpIdx].begin());
2624    }
2625
2626    /// Set the operands of this bundle in their original order.
2627    void setOperandsInOrder() {
2628      assert(Operands.empty() && "Already initialized?");
2629      auto *I0 = cast<Instruction>(Scalars[0]);
2630      Operands.resize(I0->getNumOperands());
2631      unsigned NumLanes = Scalars.size();
2632      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2633           OpIdx != NumOperands; ++OpIdx) {
2634        Operands[OpIdx].resize(NumLanes);
2635        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2636          auto *I = cast<Instruction>(Scalars[Lane]);
2637          assert(I->getNumOperands() == NumOperands &&
2638                 "Expected same number of operands");
2639          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2640        }
2641      }
2642    }
2643
2644    /// Reorders operands of the node to the given mask \p Mask.
2645    void reorderOperands(ArrayRef<int> Mask) {
2646      for (ValueList &Operand : Operands)
2647        reorderScalars(Operand, Mask);
2648    }
2649
2650    /// \returns the \p OpIdx operand of this TreeEntry.
2651    ValueList &getOperand(unsigned OpIdx) {
2652      assert(OpIdx < Operands.size() && "Off bounds");
2653      return Operands[OpIdx];
2654    }
2655
2656    /// \returns the \p OpIdx operand of this TreeEntry.
2657    ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2658      assert(OpIdx < Operands.size() && "Off bounds");
2659      return Operands[OpIdx];
2660    }
2661
2662    /// \returns the number of operands.
2663    unsigned getNumOperands() const { return Operands.size(); }
2664
2665    /// \return the single \p OpIdx operand.
2666    Value *getSingleOperand(unsigned OpIdx) const {
2667      assert(OpIdx < Operands.size() && "Off bounds");
2668      assert(!Operands[OpIdx].empty() && "No operand available");
2669      return Operands[OpIdx][0];
2670    }
2671
2672    /// Some of the instructions in the list have alternate opcodes.
2673    bool isAltShuffle() const { return MainOp != AltOp; }
2674
2675    bool isOpcodeOrAlt(Instruction *I) const {
2676      unsigned CheckedOpcode = I->getOpcode();
2677      return (getOpcode() == CheckedOpcode ||
2678              getAltOpcode() == CheckedOpcode);
2679    }
2680
2681    /// Chooses the correct key for scheduling data. If \p Op has the same (or
2682    /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2683    /// \p OpValue.
2684    Value *isOneOf(Value *Op) const {
2685      auto *I = dyn_cast<Instruction>(Op);
2686      if (I && isOpcodeOrAlt(I))
2687        return Op;
2688      return MainOp;
2689    }
2690
2691    void setOperations(const InstructionsState &S) {
2692      MainOp = S.MainOp;
2693      AltOp = S.AltOp;
2694    }
2695
2696    Instruction *getMainOp() const {
2697      return MainOp;
2698    }
2699
2700    Instruction *getAltOp() const {
2701      return AltOp;
2702    }
2703
2704    /// The main/alternate opcodes for the list of instructions.
2705    unsigned getOpcode() const {
2706      return MainOp ? MainOp->getOpcode() : 0;
2707    }
2708
2709    unsigned getAltOpcode() const {
2710      return AltOp ? AltOp->getOpcode() : 0;
2711    }
2712
    /// When ReorderIndices and ReuseShuffleIndices are empty, it just returns
    /// the position of \p V within the vector of Scalars. Otherwise, it remaps
    /// the position through the reorder and reuse indices.
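    /// For illustration only (values chosen arbitrarily): with
    /// Scalars = {a, b, c, d}, V = c is found at position 2; with
    /// ReorderIndices = {3, 2, 1, 0} this is remapped to lane 1, and with
    /// ReuseShuffleIndices = {3, 2, 1, 0} lane 1 is found at position 2, which
    /// is the returned lane.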
2715    int findLaneForValue(Value *V) const {
2716      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2717      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2718      if (!ReorderIndices.empty())
2719        FoundLane = ReorderIndices[FoundLane];
2720      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2721      if (!ReuseShuffleIndices.empty()) {
2722        FoundLane = std::distance(ReuseShuffleIndices.begin(),
2723                                  find(ReuseShuffleIndices, FoundLane));
2724      }
2725      return FoundLane;
2726    }
2727
2728    /// Build a shuffle mask for graph entry which represents a merge of main
2729    /// and alternate operations.
2730    void
2731    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2732                          SmallVectorImpl<int> &Mask,
2733                          SmallVectorImpl<Value *> *OpScalars = nullptr,
2734                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2735
2736#ifndef NDEBUG
2737    /// Debug printer.
2738    LLVM_DUMP_METHOD void dump() const {
2739      dbgs() << Idx << ".\n";
2740      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2741        dbgs() << "Operand " << OpI << ":\n";
2742        for (const Value *V : Operands[OpI])
2743          dbgs().indent(2) << *V << "\n";
2744      }
2745      dbgs() << "Scalars: \n";
2746      for (Value *V : Scalars)
2747        dbgs().indent(2) << *V << "\n";
2748      dbgs() << "State: ";
2749      switch (State) {
2750      case Vectorize:
2751        dbgs() << "Vectorize\n";
2752        break;
2753      case ScatterVectorize:
2754        dbgs() << "ScatterVectorize\n";
2755        break;
2756      case PossibleStridedVectorize:
2757        dbgs() << "PossibleStridedVectorize\n";
2758        break;
2759      case NeedToGather:
2760        dbgs() << "NeedToGather\n";
2761        break;
2762      }
2763      dbgs() << "MainOp: ";
2764      if (MainOp)
2765        dbgs() << *MainOp << "\n";
2766      else
2767        dbgs() << "NULL\n";
2768      dbgs() << "AltOp: ";
2769      if (AltOp)
2770        dbgs() << *AltOp << "\n";
2771      else
2772        dbgs() << "NULL\n";
2773      dbgs() << "VectorizedValue: ";
2774      if (VectorizedValue)
2775        dbgs() << *VectorizedValue << "\n";
2776      else
2777        dbgs() << "NULL\n";
2778      dbgs() << "ReuseShuffleIndices: ";
2779      if (ReuseShuffleIndices.empty())
2780        dbgs() << "Empty";
2781      else
2782        for (int ReuseIdx : ReuseShuffleIndices)
2783          dbgs() << ReuseIdx << ", ";
2784      dbgs() << "\n";
2785      dbgs() << "ReorderIndices: ";
2786      for (unsigned ReorderIdx : ReorderIndices)
2787        dbgs() << ReorderIdx << ", ";
2788      dbgs() << "\n";
2789      dbgs() << "UserTreeIndices: ";
2790      for (const auto &EInfo : UserTreeIndices)
2791        dbgs() << EInfo << ", ";
2792      dbgs() << "\n";
2793    }
2794#endif
2795  };
2796
2797#ifndef NDEBUG
2798  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2799                     InstructionCost VecCost, InstructionCost ScalarCost,
2800                     StringRef Banner) const {
2801    dbgs() << "SLP: " << Banner << ":\n";
2802    E->dump();
2803    dbgs() << "SLP: Costs:\n";
2804    dbgs() << "SLP:     ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2805    dbgs() << "SLP:     VectorCost = " << VecCost << "\n";
2806    dbgs() << "SLP:     ScalarCost = " << ScalarCost << "\n";
2807    dbgs() << "SLP:     ReuseShuffleCost + VecCost - ScalarCost = "
2808           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2809  }
2810#endif
2811
2812  /// Create a new VectorizableTree entry.
2813  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2814                          std::optional<ScheduleData *> Bundle,
2815                          const InstructionsState &S,
2816                          const EdgeInfo &UserTreeIdx,
2817                          ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2818                          ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2819    TreeEntry::EntryState EntryState =
2820        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2821    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2822                        ReuseShuffleIndices, ReorderIndices);
2823  }
2824
2825  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2826                          TreeEntry::EntryState EntryState,
2827                          std::optional<ScheduleData *> Bundle,
2828                          const InstructionsState &S,
2829                          const EdgeInfo &UserTreeIdx,
2830                          ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2831                          ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2832    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2833            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2834           "Need to vectorize gather entry?");
2835    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2836    TreeEntry *Last = VectorizableTree.back().get();
2837    Last->Idx = VectorizableTree.size() - 1;
2838    Last->State = EntryState;
2839    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2840                                     ReuseShuffleIndices.end());
2841    if (ReorderIndices.empty()) {
2842      Last->Scalars.assign(VL.begin(), VL.end());
2843      Last->setOperations(S);
2844    } else {
2845      // Reorder scalars and build final mask.
2846      Last->Scalars.assign(VL.size(), nullptr);
2847      transform(ReorderIndices, Last->Scalars.begin(),
2848                [VL](unsigned Idx) -> Value * {
2849                  if (Idx >= VL.size())
2850                    return UndefValue::get(VL.front()->getType());
2851                  return VL[Idx];
2852                });
2853      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
2854      Last->setOperations(S);
2855      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2856    }
2857    if (Last->State != TreeEntry::NeedToGather) {
2858      for (Value *V : VL) {
2859        const TreeEntry *TE = getTreeEntry(V);
2860        assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2861               "Scalar already in tree!");
2862        if (TE) {
2863          if (TE != Last)
2864            MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
2865          continue;
2866        }
2867        ScalarToTreeEntry[V] = Last;
2868      }
2869      // Update the scheduler bundle to point to this TreeEntry.
2870      ScheduleData *BundleMember = *Bundle;
2871      assert((BundleMember || isa<PHINode>(S.MainOp) ||
2872              isVectorLikeInstWithConstOps(S.MainOp) ||
2873              doesNotNeedToSchedule(VL)) &&
2874             "Bundle and VL out of sync");
2875      if (BundleMember) {
2876        for (Value *V : VL) {
2877          if (doesNotNeedToBeScheduled(V))
2878            continue;
2879          if (!BundleMember)
2880            continue;
2881          BundleMember->TE = Last;
2882          BundleMember = BundleMember->NextInBundle;
2883        }
2884      }
2885      assert(!BundleMember && "Bundle and VL out of sync");
2886    } else {
2887      MustGather.insert(VL.begin(), VL.end());
2888      // Build a map for gathered scalars to the nodes where they are used.
2889      for (Value *V : VL)
2890        if (!isConstant(V))
2891          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
2892    }
2893
2894    if (UserTreeIdx.UserTE)
2895      Last->UserTreeIndices.push_back(UserTreeIdx);
2896
2897    return Last;
2898  }
2899
2900  /// -- Vectorization State --
2901  /// Holds all of the tree entries.
2902  TreeEntry::VecTreeTy VectorizableTree;
2903
2904#ifndef NDEBUG
2905  /// Debug printer.
2906  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
2907    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
2908      VectorizableTree[Id]->dump();
2909      dbgs() << "\n";
2910    }
2911  }
2912#endif
2913
2914  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
2915
2916  const TreeEntry *getTreeEntry(Value *V) const {
2917    return ScalarToTreeEntry.lookup(V);
2918  }
2919
2920  /// Checks if the specified list of the instructions/values can be vectorized
2921  /// and fills required data before actual scheduling of the instructions.
2922  TreeEntry::EntryState getScalarsVectorizationState(
2923      InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
2924      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
2925
2926  /// Maps a specific scalar to its tree entry.
2927  SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
2928
  /// Maps scalars that are used in several vectorize nodes to the list of
  /// those nodes.
2931  SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
2932
2933  /// Maps a value to the proposed vectorizable size.
2934  SmallDenseMap<Value *, unsigned> InstrElementSize;
2935
2936  /// A list of scalars that we found that we need to keep as scalars.
2937  ValueSet MustGather;
2938
  /// A map between the vectorized entries and the last instructions in the
  /// bundles. The bundles are built in use order, not in the def order of the
  /// instructions. So, we cannot rely directly on the last instruction in the
  /// bundle being the last instruction in program order during the
  /// vectorization process, since the basic blocks are modified; the last
  /// instructions need to be pre-gathered beforehand.
2945  DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
2946
  /// List of gather nodes that depend on other gather/vector nodes and should
  /// be emitted after the vector instruction emission process, to correctly
  /// handle the order of the vector instructions and shuffles.
2950  SetVector<const TreeEntry *> PostponedGathers;
2951
2952  using ValueToGatherNodesMap =
2953      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
2954  ValueToGatherNodesMap ValueToGatherNodes;
2955
2956  /// This POD struct describes one external user in the vectorized tree.
2957  struct ExternalUser {
2958    ExternalUser(Value *S, llvm::User *U, int L)
2959        : Scalar(S), User(U), Lane(L) {}
2960
2961    // Which scalar in our function.
2962    Value *Scalar;
2963
    // Which user uses the scalar.
2965    llvm::User *User;
2966
2967    // Which lane does the scalar belong to.
2968    int Lane;
2969  };
2970  using UserList = SmallVector<ExternalUser, 16>;
2971
2972  /// Checks if two instructions may access the same memory.
2973  ///
2974  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
2975  /// is invariant in the calling loop.
2976  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
2977                 Instruction *Inst2) {
2978    if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
2979      return true;
2980    // First check if the result is already in the cache.
2981    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
2982    auto It = AliasCache.find(Key);
2983    if (It != AliasCache.end())
2984      return It->second;
2985    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
2986    // Store the result in the cache.
2987    AliasCache.try_emplace(Key, Aliased);
2988    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
2989    return Aliased;
2990  }
2991
2992  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
2993
2994  /// Cache for alias results.
2995  /// TODO: consider moving this to the AliasAnalysis itself.
2996  DenseMap<AliasCacheKey, bool> AliasCache;
2997
2998  // Cache for pointerMayBeCaptured calls inside AA.  This is preserved
2999  // globally through SLP because we don't perform any action which
3000  // invalidates capture results.
3001  BatchAAResults BatchAA;
3002
3003  /// Temporary store for deleted instructions. Instructions will be deleted
3004  /// eventually when the BoUpSLP is destructed.  The deferral is required to
3005  /// ensure that there are no incorrect collisions in the AliasCache, which
3006  /// can happen if a new instruction is allocated at the same address as a
3007  /// previously deleted instruction.
3008  DenseSet<Instruction *> DeletedInstructions;
3009
  /// Set of the instructions already analyzed as reduction roots.
3011  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3012
  /// Set of hashes for the lists of reduction values already analyzed.
3014  DenseSet<size_t> AnalyzedReductionVals;
3015
  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External User
  /// can be nullptr, which means that this Internal Scalar will be used later,
  /// after vectorization.
3020  UserList ExternalUses;
3021
3022  /// Values used only by @llvm.assume calls.
3023  SmallPtrSet<const Value *, 32> EphValues;
3024
3025  /// Holds all of the instructions that we gathered, shuffle instructions and
3026  /// extractelements.
3027  SetVector<Instruction *> GatherShuffleExtractSeq;
3028
3029  /// A list of blocks that we are going to CSE.
3030  DenseSet<BasicBlock *> CSEBlocks;
3031
3032  /// Contains all scheduling relevant data for an instruction.
3033  /// A ScheduleData either represents a single instruction or a member of an
3034  /// instruction bundle (= a group of instructions which is combined into a
3035  /// vector instruction).
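  /// For illustration only: in a bundle of three instructions I0, I1 and I2,
  /// every member has FirstInBundle pointing at I0's ScheduleData, the
  /// NextInBundle chain is I0 -> I1 -> I2 -> null, and only I0's ScheduleData
  /// is the scheduling entity.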
3036  struct ScheduleData {
3037    // The initial value for the dependency counters. It means that the
3038    // dependencies are not calculated yet.
3039    enum { InvalidDeps = -1 };
3040
3041    ScheduleData() = default;
3042
3043    void init(int BlockSchedulingRegionID, Value *OpVal) {
3044      FirstInBundle = this;
3045      NextInBundle = nullptr;
3046      NextLoadStore = nullptr;
3047      IsScheduled = false;
3048      SchedulingRegionID = BlockSchedulingRegionID;
3049      clearDependencies();
3050      OpValue = OpVal;
3051      TE = nullptr;
3052    }
3053
    /// Verify basic self-consistency properties.
3055    void verify() {
3056      if (hasValidDependencies()) {
3057        assert(UnscheduledDeps <= Dependencies && "invariant");
3058      } else {
3059        assert(UnscheduledDeps == Dependencies && "invariant");
3060      }
3061
3062      if (IsScheduled) {
3063        assert(isSchedulingEntity() &&
3064                "unexpected scheduled state");
3065        for (const ScheduleData *BundleMember = this; BundleMember;
3066             BundleMember = BundleMember->NextInBundle) {
3067          assert(BundleMember->hasValidDependencies() &&
3068                 BundleMember->UnscheduledDeps == 0 &&
3069                 "unexpected scheduled state");
3070          assert((BundleMember == this || !BundleMember->IsScheduled) &&
3071                 "only bundle is marked scheduled");
3072        }
3073      }
3074
3075      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3076             "all bundle members must be in same basic block");
3077    }
3078
3079    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
3081    /// a single bundle.
3082    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3083
3084    /// Returns true for single instructions and for bundle representatives
3085    /// (= the head of a bundle).
3086    bool isSchedulingEntity() const { return FirstInBundle == this; }
3087
3088    /// Returns true if it represents an instruction bundle and not only a
3089    /// single instruction.
3090    bool isPartOfBundle() const {
3091      return NextInBundle != nullptr || FirstInBundle != this || TE;
3092    }
3093
3094    /// Returns true if it is ready for scheduling, i.e. it has no more
3095    /// unscheduled depending instructions/bundles.
3096    bool isReady() const {
3097      assert(isSchedulingEntity() &&
3098             "can't consider non-scheduling entity for ready list");
3099      return unscheduledDepsInBundle() == 0 && !IsScheduled;
3100    }
3101
3102    /// Modifies the number of unscheduled dependencies for this instruction,
3103    /// and returns the number of remaining dependencies for the containing
3104    /// bundle.
3105    int incrementUnscheduledDeps(int Incr) {
3106      assert(hasValidDependencies() &&
3107             "increment of unscheduled deps would be meaningless");
3108      UnscheduledDeps += Incr;
3109      return FirstInBundle->unscheduledDepsInBundle();
3110    }
3111
3112    /// Sets the number of unscheduled dependencies to the number of
3113    /// dependencies.
3114    void resetUnscheduledDeps() {
3115      UnscheduledDeps = Dependencies;
3116    }
3117
3118    /// Clears all dependency information.
3119    void clearDependencies() {
3120      Dependencies = InvalidDeps;
3121      resetUnscheduledDeps();
3122      MemoryDependencies.clear();
3123      ControlDependencies.clear();
3124    }
3125
3126    int unscheduledDepsInBundle() const {
3127      assert(isSchedulingEntity() && "only meaningful on the bundle");
3128      int Sum = 0;
3129      for (const ScheduleData *BundleMember = this; BundleMember;
3130           BundleMember = BundleMember->NextInBundle) {
3131        if (BundleMember->UnscheduledDeps == InvalidDeps)
3132          return InvalidDeps;
3133        Sum += BundleMember->UnscheduledDeps;
3134      }
3135      return Sum;
3136    }
3137
3138    void dump(raw_ostream &os) const {
3139      if (!isSchedulingEntity()) {
3140        os << "/ " << *Inst;
3141      } else if (NextInBundle) {
3142        os << '[' << *Inst;
3143        ScheduleData *SD = NextInBundle;
3144        while (SD) {
3145          os << ';' << *SD->Inst;
3146          SD = SD->NextInBundle;
3147        }
3148        os << ']';
3149      } else {
3150        os << *Inst;
3151      }
3152    }
3153
3154    Instruction *Inst = nullptr;
3155
3156    /// Opcode of the current instruction in the schedule data.
3157    Value *OpValue = nullptr;
3158
3159    /// The TreeEntry that this instruction corresponds to.
3160    TreeEntry *TE = nullptr;
3161
3162    /// Points to the head in an instruction bundle (and always to this for
3163    /// single instructions).
3164    ScheduleData *FirstInBundle = nullptr;
3165
3166    /// Single linked list of all instructions in a bundle. Null if it is a
3167    /// single instruction.
3168    ScheduleData *NextInBundle = nullptr;
3169
3170    /// Single linked list of all memory instructions (e.g. load, store, call)
3171    /// in the block - until the end of the scheduling region.
3172    ScheduleData *NextLoadStore = nullptr;
3173
3174    /// The dependent memory instructions.
3175    /// This list is derived on demand in calculateDependencies().
3176    SmallVector<ScheduleData *, 4> MemoryDependencies;
3177
3178    /// List of instructions which this instruction could be control dependent
3179    /// on.  Allowing such nodes to be scheduled below this one could introduce
3180    /// a runtime fault which didn't exist in the original program.
    /// E.g., this is a load or udiv following a readonly call which infinitely
    /// loops.
3182    SmallVector<ScheduleData *, 4> ControlDependencies;
3183
3184    /// This ScheduleData is in the current scheduling region if this matches
3185    /// the current SchedulingRegionID of BlockScheduling.
3186    int SchedulingRegionID = 0;
3187
3188    /// Used for getting a "good" final ordering of instructions.
3189    int SchedulingPriority = 0;
3190
    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
3193    /// This value is calculated on demand.
3194    /// If InvalidDeps, the number of dependencies is not calculated yet.
3195    int Dependencies = InvalidDeps;
3196
3197    /// The number of dependencies minus the number of dependencies of scheduled
3198    /// instructions. As soon as this is zero, the instruction/bundle gets ready
3199    /// for scheduling.
3200    /// Note that this is negative as long as Dependencies is not calculated.
3201    int UnscheduledDeps = InvalidDeps;
3202
3203    /// True if this instruction is scheduled (or considered as scheduled in the
3204    /// dry-run).
3205    bool IsScheduled = false;
3206  };
3207
3208#ifndef NDEBUG
3209  friend inline raw_ostream &operator<<(raw_ostream &os,
3210                                        const BoUpSLP::ScheduleData &SD) {
3211    SD.dump(os);
3212    return os;
3213  }
3214#endif
3215
3216  friend struct GraphTraits<BoUpSLP *>;
3217  friend struct DOTGraphTraits<BoUpSLP *>;
3218
  /// Contains all scheduling data for a basic block.
  /// It does not schedule instructions that are not memory read/write
  /// instructions and whose operands are either constants, arguments, phis, or
  /// instructions from other blocks, or whose users are phis or are in other
  /// blocks. The resulting vector instructions can be placed at the beginning
  /// of the basic block without scheduling (if the operands do not need to be
  /// scheduled) or at the end of the block (if the users are outside of the
  /// block). This saves some compile time and memory used by the compiler.
  /// ScheduleData is assigned to each instruction between the boundaries of
  /// the tree entry, even to those which are not part of the graph; this is
  /// required to correctly follow the dependencies between the instructions
  /// and schedule them correctly. ScheduleData is not allocated for
  /// instructions which do not require scheduling, like phis, nodes consisting
  /// only of extractelements/insertelements, or nodes whose instructions have
  /// uses/operands outside of the block.
3235  struct BlockScheduling {
3236    BlockScheduling(BasicBlock *BB)
3237        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3238
3239    void clear() {
3240      ReadyInsts.clear();
3241      ScheduleStart = nullptr;
3242      ScheduleEnd = nullptr;
3243      FirstLoadStoreInRegion = nullptr;
3244      LastLoadStoreInRegion = nullptr;
3245      RegionHasStackSave = false;
3246
3247      // Reduce the maximum schedule region size by the size of the
3248      // previous scheduling run.
3249      ScheduleRegionSizeLimit -= ScheduleRegionSize;
3250      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3251        ScheduleRegionSizeLimit = MinScheduleRegionSize;
3252      ScheduleRegionSize = 0;
3253
3254      // Make a new scheduling region, i.e. all existing ScheduleData is not
3255      // in the new region yet.
3256      ++SchedulingRegionID;
3257    }
3258
3259    ScheduleData *getScheduleData(Instruction *I) {
3260      if (BB != I->getParent())
3261        // Avoid lookup if can't possibly be in map.
3262        return nullptr;
3263      ScheduleData *SD = ScheduleDataMap.lookup(I);
3264      if (SD && isInSchedulingRegion(SD))
3265        return SD;
3266      return nullptr;
3267    }
3268
3269    ScheduleData *getScheduleData(Value *V) {
3270      if (auto *I = dyn_cast<Instruction>(V))
3271        return getScheduleData(I);
3272      return nullptr;
3273    }
3274
3275    ScheduleData *getScheduleData(Value *V, Value *Key) {
3276      if (V == Key)
3277        return getScheduleData(V);
3278      auto I = ExtraScheduleDataMap.find(V);
3279      if (I != ExtraScheduleDataMap.end()) {
3280        ScheduleData *SD = I->second.lookup(Key);
3281        if (SD && isInSchedulingRegion(SD))
3282          return SD;
3283      }
3284      return nullptr;
3285    }
3286
3287    bool isInSchedulingRegion(ScheduleData *SD) const {
3288      return SD->SchedulingRegionID == SchedulingRegionID;
3289    }
3290
3291    /// Marks an instruction as scheduled and puts all dependent ready
3292    /// instructions into the ready-list.
3293    template <typename ReadyListType>
3294    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3295      SD->IsScheduled = true;
3296      LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
3297
3298      for (ScheduleData *BundleMember = SD; BundleMember;
3299           BundleMember = BundleMember->NextInBundle) {
3300        if (BundleMember->Inst != BundleMember->OpValue)
3301          continue;
3302
3303        // Handle the def-use chain dependencies.
3304
3305        // Decrement the unscheduled counter and insert to ready list if ready.
3306        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3307          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3308            if (OpDef && OpDef->hasValidDependencies() &&
3309                OpDef->incrementUnscheduledDeps(-1) == 0) {
3310              // There are no more unscheduled dependencies after
3311              // decrementing, so we can put the dependent instruction
3312              // into the ready list.
3313              ScheduleData *DepBundle = OpDef->FirstInBundle;
3314              assert(!DepBundle->IsScheduled &&
3315                     "already scheduled bundle gets ready");
3316              ReadyList.insert(DepBundle);
3317              LLVM_DEBUG(dbgs()
3318                         << "SLP:    gets ready (def): " << *DepBundle << "\n");
3319            }
3320          });
3321        };
3322
3323        // If BundleMember is a vector bundle, its operands may have been
3324        // reordered during buildTree(). We therefore need to get its operands
3325        // through the TreeEntry.
3326        if (TreeEntry *TE = BundleMember->TE) {
3327          // Need to search for the lane since the tree entry can be reordered.
3328          int Lane = std::distance(TE->Scalars.begin(),
3329                                   find(TE->Scalars, BundleMember->Inst));
3330          assert(Lane >= 0 && "Lane not set");
3331
          // Since the vectorization tree is built recursively, this assertion
          // ensures that the tree entry has all operands set before reaching
          // this code. A couple of exceptions known at the moment are extracts
          // where the second (immediate) operand is not added. Since
          // immediates do not affect scheduler behavior, this is considered
          // okay.
3338          auto *In = BundleMember->Inst;
3339          assert(In &&
3340                 (isa<ExtractValueInst, ExtractElementInst>(In) ||
3341                  In->getNumOperands() == TE->getNumOperands()) &&
3342                 "Missed TreeEntry operands?");
3343          (void)In; // fake use to avoid build failure when assertions disabled
3344
3345          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3346               OpIdx != NumOperands; ++OpIdx)
3347            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3348              DecrUnsched(I);
3349        } else {
3350          // If BundleMember is a stand-alone instruction, no operand reordering
3351          // has taken place, so we directly access its operands.
3352          for (Use &U : BundleMember->Inst->operands())
3353            if (auto *I = dyn_cast<Instruction>(U.get()))
3354              DecrUnsched(I);
3355        }
3356        // Handle the memory dependencies.
3357        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3358          if (MemoryDepSD->hasValidDependencies() &&
3359              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3360            // There are no more unscheduled dependencies after decrementing,
3361            // so we can put the dependent instruction into the ready list.
3362            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3363            assert(!DepBundle->IsScheduled &&
3364                   "already scheduled bundle gets ready");
3365            ReadyList.insert(DepBundle);
3366            LLVM_DEBUG(dbgs()
3367                       << "SLP:    gets ready (mem): " << *DepBundle << "\n");
3368          }
3369        }
3370        // Handle the control dependencies.
3371        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3372          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3373            // There are no more unscheduled dependencies after decrementing,
3374            // so we can put the dependent instruction into the ready list.
3375            ScheduleData *DepBundle = DepSD->FirstInBundle;
3376            assert(!DepBundle->IsScheduled &&
3377                   "already scheduled bundle gets ready");
3378            ReadyList.insert(DepBundle);
3379            LLVM_DEBUG(dbgs()
3380                       << "SLP:    gets ready (ctl): " << *DepBundle << "\n");
3381          }
3382        }
3383      }
3384    }
3385
3386    /// Verify basic self consistency properties of the data structure.
3387    void verify() {
3388      if (!ScheduleStart)
3389        return;
3390
3391      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3392             ScheduleStart->comesBefore(ScheduleEnd) &&
3393             "Not a valid scheduling region?");
3394
3395      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3396        auto *SD = getScheduleData(I);
3397        if (!SD)
3398          continue;
3399        assert(isInSchedulingRegion(SD) &&
3400               "primary schedule data not in window?");
3401        assert(isInSchedulingRegion(SD->FirstInBundle) &&
3402               "entire bundle in window!");
3403        (void)SD;
3404        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3405      }
3406
3407      for (auto *SD : ReadyInsts) {
3408        assert(SD->isSchedulingEntity() && SD->isReady() &&
3409               "item in ready list not ready?");
3410        (void)SD;
3411      }
3412    }
3413
3414    void doForAllOpcodes(Value *V,
3415                         function_ref<void(ScheduleData *SD)> Action) {
3416      if (ScheduleData *SD = getScheduleData(V))
3417        Action(SD);
3418      auto I = ExtraScheduleDataMap.find(V);
3419      if (I != ExtraScheduleDataMap.end())
3420        for (auto &P : I->second)
3421          if (isInSchedulingRegion(P.second))
3422            Action(P.second);
3423    }
3424
3425    /// Put all instructions into the ReadyList which are ready for scheduling.
3426    template <typename ReadyListType>
3427    void initialFillReadyList(ReadyListType &ReadyList) {
3428      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3429        doForAllOpcodes(I, [&](ScheduleData *SD) {
3430          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3431              SD->isReady()) {
3432            ReadyList.insert(SD);
3433            LLVM_DEBUG(dbgs()
3434                       << "SLP:    initially in ready list: " << *SD << "\n");
3435          }
3436        });
3437      }
3438    }
3439
3440    /// Build a bundle from the ScheduleData nodes corresponding to the
3441    /// scalar instruction for each lane.
3442    ScheduleData *buildBundle(ArrayRef<Value *> VL);
3443
3444    /// Checks if a bundle of instructions can be scheduled, i.e. has no
3445    /// cyclic dependencies. This is only a dry-run, no instructions are
3446    /// actually moved at this stage.
3447    /// \returns the scheduling bundle. The returned Optional value is not
3448    /// std::nullopt if \p VL is allowed to be scheduled.
3449    std::optional<ScheduleData *>
3450    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3451                      const InstructionsState &S);
3452
3453    /// Un-bundles a group of instructions.
3454    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3455
3456    /// Allocates schedule data chunk.
3457    ScheduleData *allocateScheduleDataChunks();
3458
3459    /// Extends the scheduling region so that V is inside the region.
3460    /// \returns true if the region size is within the limit.
3461    bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3462
3463    /// Initialize the ScheduleData structures for new instructions in the
3464    /// scheduling region.
3465    void initScheduleData(Instruction *FromI, Instruction *ToI,
3466                          ScheduleData *PrevLoadStore,
3467                          ScheduleData *NextLoadStore);
3468
3469    /// Updates the dependency information of a bundle and of all instructions/
3470    /// bundles which depend on the original bundle.
3471    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3472                               BoUpSLP *SLP);
3473
    /// Sets all instructions in the scheduling region to un-scheduled.
3475    void resetSchedule();
3476
3477    BasicBlock *BB;
3478
3479    /// Simple memory allocation for ScheduleData.
3480    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3481
3482    /// The size of a ScheduleData array in ScheduleDataChunks.
3483    int ChunkSize;
3484
3485    /// The allocator position in the current chunk, which is the last entry
3486    /// of ScheduleDataChunks.
3487    int ChunkPos;
3488
3489    /// Attaches ScheduleData to Instruction.
3490    /// Note that the mapping survives during all vectorization iterations, i.e.
3491    /// ScheduleData structures are recycled.
3492    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3493
3494    /// Attaches ScheduleData to Instruction with the leading key.
3495    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3496        ExtraScheduleDataMap;
3497
3498    /// The ready-list for scheduling (only used for the dry-run).
3499    SetVector<ScheduleData *> ReadyInsts;
3500
3501    /// The first instruction of the scheduling region.
3502    Instruction *ScheduleStart = nullptr;
3503
3504    /// The first instruction _after_ the scheduling region.
3505    Instruction *ScheduleEnd = nullptr;
3506
3507    /// The first memory accessing instruction in the scheduling region
3508    /// (can be null).
3509    ScheduleData *FirstLoadStoreInRegion = nullptr;
3510
3511    /// The last memory accessing instruction in the scheduling region
3512    /// (can be null).
3513    ScheduleData *LastLoadStoreInRegion = nullptr;
3514
3515    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3516    /// region?  Used to optimize the dependence calculation for the
3517    /// common case where there isn't.
3518    bool RegionHasStackSave = false;
3519
3520    /// The current size of the scheduling region.
3521    int ScheduleRegionSize = 0;
3522
3523    /// The maximum size allowed for the scheduling region.
3524    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3525
3526    /// The ID of the scheduling region. For a new vectorization iteration this
3527    /// is incremented which "removes" all ScheduleData from the region.
3528    /// Make sure that the initial SchedulingRegionID is greater than the
3529    /// initial SchedulingRegionID in ScheduleData (which is 0).
3530    int SchedulingRegionID = 1;
3531  };
3532
3533  /// Attaches the BlockScheduling structures to basic blocks.
3534  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3535
3536  /// Performs the "real" scheduling. Done before vectorization is actually
3537  /// performed in a basic block.
3538  void scheduleBlock(BlockScheduling *BS);
3539
3540  /// List of users to ignore during scheduling and that don't need extracting.
3541  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3542
3543  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3544  /// sorted SmallVectors of unsigned.
3545  struct OrdersTypeDenseMapInfo {
3546    static OrdersType getEmptyKey() {
3547      OrdersType V;
3548      V.push_back(~1U);
3549      return V;
3550    }
3551
3552    static OrdersType getTombstoneKey() {
3553      OrdersType V;
3554      V.push_back(~2U);
3555      return V;
3556    }
3557
3558    static unsigned getHashValue(const OrdersType &V) {
3559      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3560    }
3561
3562    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3563      return LHS == RHS;
3564    }
3565  };
3566
3567  // Analysis and block reference.
3568  Function *F;
3569  ScalarEvolution *SE;
3570  TargetTransformInfo *TTI;
3571  TargetLibraryInfo *TLI;
3572  LoopInfo *LI;
3573  DominatorTree *DT;
3574  AssumptionCache *AC;
3575  DemandedBits *DB;
3576  const DataLayout *DL;
3577  OptimizationRemarkEmitter *ORE;
3578
3579  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3580  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3581
3582  /// Instruction builder to construct the vectorized tree.
3583  IRBuilder<> Builder;
3584
  /// A map of tree entries of scalar integer values to the smallest bit width
  /// with which they can legally be represented. The entries map to
  /// (width, signed) pairs, where "width" indicates the minimum bit width and
  /// "signed" is True if the values must be sign-extended, rather than
  /// zero-extended, back to their original width.
3590  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
3591};
3592
3593} // end namespace slpvectorizer
3594
3595template <> struct GraphTraits<BoUpSLP *> {
3596  using TreeEntry = BoUpSLP::TreeEntry;
3597
3598  /// NodeRef has to be a pointer per the GraphWriter.
3599  using NodeRef = TreeEntry *;
3600
3601  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3602
3603  /// Add the VectorizableTree to the index iterator to be able to return
3604  /// TreeEntry pointers.
3605  struct ChildIteratorType
3606      : public iterator_adaptor_base<
3607            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3608    ContainerTy &VectorizableTree;
3609
3610    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3611                      ContainerTy &VT)
3612        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3613
3614    NodeRef operator*() { return I->UserTE; }
3615  };
3616
3617  static NodeRef getEntryNode(BoUpSLP &R) {
3618    return R.VectorizableTree[0].get();
3619  }
3620
3621  static ChildIteratorType child_begin(NodeRef N) {
3622    return {N->UserTreeIndices.begin(), N->Container};
3623  }
3624
3625  static ChildIteratorType child_end(NodeRef N) {
3626    return {N->UserTreeIndices.end(), N->Container};
3627  }
3628
3629  /// For the node iterator we just need to turn the TreeEntry iterator into a
3630  /// TreeEntry* iterator so that it dereferences to NodeRef.
3631  class nodes_iterator {
3632    using ItTy = ContainerTy::iterator;
3633    ItTy It;
3634
3635  public:
3636    nodes_iterator(const ItTy &It2) : It(It2) {}
3637    NodeRef operator*() { return It->get(); }
3638    nodes_iterator operator++() {
3639      ++It;
3640      return *this;
3641    }
3642    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3643  };
3644
3645  static nodes_iterator nodes_begin(BoUpSLP *R) {
3646    return nodes_iterator(R->VectorizableTree.begin());
3647  }
3648
3649  static nodes_iterator nodes_end(BoUpSLP *R) {
3650    return nodes_iterator(R->VectorizableTree.end());
3651  }
3652
3653  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3654};
3655
3656template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3657  using TreeEntry = BoUpSLP::TreeEntry;
3658
3659  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3660
3661  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3662    std::string Str;
3663    raw_string_ostream OS(Str);
3664    OS << Entry->Idx << ".\n";
3665    if (isSplat(Entry->Scalars))
3666      OS << "<splat> ";
3667    for (auto *V : Entry->Scalars) {
3668      OS << *V;
3669      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3670            return EU.Scalar == V;
3671          }))
3672        OS << " <extract>";
3673      OS << "\n";
3674    }
3675    return Str;
3676  }
3677
3678  static std::string getNodeAttributes(const TreeEntry *Entry,
3679                                       const BoUpSLP *) {
3680    if (Entry->State == TreeEntry::NeedToGather)
3681      return "color=red";
3682    if (Entry->State == TreeEntry::ScatterVectorize ||
3683        Entry->State == TreeEntry::PossibleStridedVectorize)
3684      return "color=blue";
3685    return "";
3686  }
3687};
3688
3689} // end namespace llvm
3690
3691BoUpSLP::~BoUpSLP() {
3692  SmallVector<WeakTrackingVH> DeadInsts;
3693  for (auto *I : DeletedInstructions) {
3694    for (Use &U : I->operands()) {
3695      auto *Op = dyn_cast<Instruction>(U.get());
3696      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3697          wouldInstructionBeTriviallyDead(Op, TLI))
3698        DeadInsts.emplace_back(Op);
3699    }
3700    I->dropAllReferences();
3701  }
3702  for (auto *I : DeletedInstructions) {
3703    assert(I->use_empty() &&
3704           "trying to erase instruction with users.");
3705    I->eraseFromParent();
3706  }
3707
3708  // Cleanup any dead scalar code feeding the vectorized instructions
3709  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3710
3711#ifdef EXPENSIVE_CHECKS
3712  // If we could guarantee that this call is not extremely slow, we could
3713  // remove the ifdef limitation (see PR47712).
3714  assert(!verifyFunction(*F, &dbgs()));
3715#endif
3716}
3717
/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node. The
/// procedure transforms this mask in accordance with the given \p Mask.
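/// For illustration only (values chosen arbitrarily): with
/// Reuses = {0, 0, 1, 2} and Mask = {2, 3, 0, 1}, the element at position I is
/// moved to position Mask[I], giving the reordered mask {1, 2, 0, 0}.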
3721static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3722  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3723         "Expected non-empty mask.");
3724  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3725  Prev.swap(Reuses);
3726  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3727    if (Mask[I] != PoisonMaskElem)
3728      Reuses[Mask[I]] = Prev[I];
3729}
3730
/// Reorders the given \p Order according to the given \p Mask. \p Order is
/// the original order of the scalars. The procedure transforms the provided
/// order in accordance with the given \p Mask. If the resulting \p Order is
/// just an identity order, \p Order is cleared.
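/// For illustration only (values chosen arbitrarily): starting from an empty
/// (identity) \p Order and Mask = {2, 0, 1, 3}, the resulting \p Order is
/// {2, 0, 1, 3}; an identity \p Mask leaves an empty \p Order unchanged.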
3735static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
3736  assert(!Mask.empty() && "Expected non-empty mask.");
3737  SmallVector<int> MaskOrder;
3738  if (Order.empty()) {
3739    MaskOrder.resize(Mask.size());
3740    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3741  } else {
3742    inversePermutation(Order, MaskOrder);
3743  }
3744  reorderReuses(MaskOrder, Mask);
3745  if (ShuffleVectorInst::isIdentityMask(MaskOrder, MaskOrder.size())) {
3746    Order.clear();
3747    return;
3748  }
3749  Order.assign(Mask.size(), Mask.size());
3750  for (unsigned I = 0, E = Mask.size(); I < E; ++I)
3751    if (MaskOrder[I] != PoisonMaskElem)
3752      Order[MaskOrder[I]] = I;
3753  fixupOrderingIndices(Order);
3754}
3755
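// For illustration only (hypothetical scalars): if a gather node holds {a, b}
// and both values are also vectorized in a single tree entry whose scalars
// are {b, a}, the returned order is {1, 0} rather than the identity order.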
3756std::optional<BoUpSLP::OrdersType>
3757BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3758  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3759  unsigned NumScalars = TE.Scalars.size();
3760  OrdersType CurrentOrder(NumScalars, NumScalars);
3761  SmallVector<int> Positions;
3762  SmallBitVector UsedPositions(NumScalars);
3763  const TreeEntry *STE = nullptr;
  // Try to find all gathered scalars that get vectorized in another
  // vectorize node. Here we can have only a single tree vector node to
  // correctly identify the order of the gathered scalars.
3767  for (unsigned I = 0; I < NumScalars; ++I) {
3768    Value *V = TE.Scalars[I];
3769    if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
3770      continue;
3771    if (const auto *LocalSTE = getTreeEntry(V)) {
3772      if (!STE)
3773        STE = LocalSTE;
3774      else if (STE != LocalSTE)
3775        // Take the order only from the single vector node.
3776        return std::nullopt;
3777      unsigned Lane =
3778          std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
3779      if (Lane >= NumScalars)
3780        return std::nullopt;
3781      if (CurrentOrder[Lane] != NumScalars) {
3782        if (Lane != I)
3783          continue;
3784        UsedPositions.reset(CurrentOrder[Lane]);
3785      }
3786      // The partial identity (where only some elements of the gather node are
3787      // in the identity order) is good.
3788      CurrentOrder[Lane] = I;
3789      UsedPositions.set(I);
3790    }
3791  }
3792  // Need to keep the order if we have a vector entry and at least 2 scalars or
3793  // the vectorized entry has just 2 scalars.
3794  if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
3795    auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
3796      for (unsigned I = 0; I < NumScalars; ++I)
3797        if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
3798          return false;
3799      return true;
3800    };
3801    if (IsIdentityOrder(CurrentOrder))
3802      return OrdersType();
3803    auto *It = CurrentOrder.begin();
3804    for (unsigned I = 0; I < NumScalars;) {
3805      if (UsedPositions.test(I)) {
3806        ++I;
3807        continue;
3808      }
3809      if (*It == NumScalars) {
3810        *It = I;
3811        ++I;
3812      }
3813      ++It;
3814    }
3815    return std::move(CurrentOrder);
3816  }
3817  return std::nullopt;
3818}
3819
3820namespace {
/// Tracks the state in which we can represent the loads in the given sequence.
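/// Roughly (see the uses below for the precise conditions): Gather means the
/// loads are left as scalars and gathered, Vectorize means a single wide
/// consecutive load, ScatterVectorize means a gather from non-consecutive
/// pointers, and PossibleStridedVectorize means the pointer differences
/// suggest a strided access may be possible.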
3822enum class LoadsState {
3823  Gather,
3824  Vectorize,
3825  ScatterVectorize,
3826  PossibleStridedVectorize
3827};
3828} // anonymous namespace
3829
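/// Checks whether two pointer operands are compatible: they must point into
/// the same underlying object and both be 2-operand GEPs; in addition, unless
/// \p CompareOpcodes is false, their index operands must either both be
/// constants or share the same opcode.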
3830static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
3831                                  const TargetLibraryInfo &TLI,
3832                                  bool CompareOpcodes = true) {
3833  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
3834    return false;
3835  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
3836  if (!GEP1)
3837    return false;
3838  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
3839  if (!GEP2)
3840    return false;
3841  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
3842         ((isConstant(GEP1->getOperand(1)) &&
3843           isConstant(GEP2->getOperand(1))) ||
3844          !CompareOpcodes ||
3845          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
3846              .getOpcode());
3847}
3848
/// Checks if the given array of loads can be represented as a vectorized
/// load, a scatter-vectorized load or just a simple gather.
3851static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
3852                                    const TargetTransformInfo &TTI,
3853                                    const DataLayout &DL, ScalarEvolution &SE,
3854                                    LoopInfo &LI, const TargetLibraryInfo &TLI,
3855                                    SmallVectorImpl<unsigned> &Order,
3856                                    SmallVectorImpl<Value *> &PointerOps) {
3857  // Check that a vectorized load would load the same memory as a scalar
3858  // load. For example, we don't want to vectorize loads that are smaller
3859  // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
3860  // treats loading/storing it as an i8 struct. If we vectorize loads/stores
3861  // from such a struct, we read/write packed bits disagreeing with the
3862  // unvectorized version.
3863  Type *ScalarTy = VL0->getType();
3864
3865  if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
3866    return LoadsState::Gather;
3867
3868  // Make sure all loads in the bundle are simple - we can't vectorize
3869  // atomic or volatile loads.
3870  PointerOps.clear();
3871  PointerOps.resize(VL.size());
3872  auto *POIter = PointerOps.begin();
3873  for (Value *V : VL) {
3874    auto *L = cast<LoadInst>(V);
3875    if (!L->isSimple())
3876      return LoadsState::Gather;
3877    *POIter = L->getPointerOperand();
3878    ++POIter;
3879  }
3880
3881  Order.clear();
3882  // Check the order of pointer operands or that all pointers are the same.
3883  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
3884  if (IsSorted || all_of(PointerOps, [&](Value *P) {
3885        return arePointersCompatible(P, PointerOps.front(), TLI);
3886      })) {
3887    bool IsPossibleStrided = false;
3888    if (IsSorted) {
3889      Value *Ptr0;
3890      Value *PtrN;
3891      if (Order.empty()) {
3892        Ptr0 = PointerOps.front();
3893        PtrN = PointerOps.back();
3894      } else {
3895        Ptr0 = PointerOps[Order.front()];
3896        PtrN = PointerOps[Order.back()];
3897      }
3898      std::optional<int> Diff =
3899          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
3900      // Check that the sorted loads are consecutive.
3901      if (static_cast<unsigned>(*Diff) == VL.size() - 1)
3902        return LoadsState::Vectorize;
      // Simple check whether the loads may still form a strided access: the
      // distance between the first and the last pointer must be a multiple of
      // the number of gaps between the loads.
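      // For example (distances measured in elements of ScalarTy): four loads
      // at offsets {0, 2, 4, 6} give *Diff == 6 and VL.size() - 1 == 3, and
      // 6 % 3 == 0, so a stride-2 access may still be possible.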
3904      IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
3905    }
    // TODO: need to improve analysis of the pointers; if not all of them are
    // GEPs or they have more than 2 operands, we end up with a gather node,
    // which just increases the cost.
3909    Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
3910    bool ProfitableGatherPointers =
3911        static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
3912          return L && L->isLoopInvariant(V);
3913        })) <= VL.size() / 2 && VL.size() > 2;
3914    if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
3915          auto *GEP = dyn_cast<GetElementPtrInst>(P);
3916          return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
3917                 (GEP && GEP->getNumOperands() == 2);
3918        })) {
3919      Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
3920      for (Value *V : VL)
3921        CommonAlignment =
3922            std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
3923      auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
3924      if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
3925          !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
3926        return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
3927                                 : LoadsState::ScatterVectorize;
3928    }
3929  }
3930
3931  return LoadsState::Gather;
3932}
3933
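/// Groups the pointers in \p VL by base: a pointer joins an existing base if
/// its constant distance to that base is known, otherwise it starts a new
/// base. Each group is then sorted by offset and, if at least one group turns
/// out to be fully consecutive, \p SortedIndices is filled with the original
/// indices rearranged so that pointers sharing a base appear next to each
/// other. Returns false if no useful clustering is found.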
3934static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
3935                                   const DataLayout &DL, ScalarEvolution &SE,
3936                                   SmallVectorImpl<unsigned> &SortedIndices) {
3937  assert(llvm::all_of(
3938             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
3939         "Expected list of pointer operands.");
  // Map each base to a vector of (Ptr, Offset, OrigIdx) tuples. Every pointer
  // is inserted into the vector of its base; the vectors are sorted by offset
  // and the original indices are returned in SortedIndices so that pointers
  // sharing a base end up next to one another.
3943  MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
3944  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
3945
3946  unsigned Cnt = 1;
3947  for (Value *Ptr : VL.drop_front()) {
3948    bool Found = any_of(Bases, [&](auto &Base) {
3949      std::optional<int> Diff =
3950          getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
3951                          /*StrictCheck=*/true);
3952      if (!Diff)
3953        return false;
3954
3955      Base.second.emplace_back(Ptr, *Diff, Cnt++);
3956      return true;
3957    });
3958
3959    if (!Found) {
3960      // If we haven't found enough to usefully cluster, return early.
3961      if (Bases.size() > VL.size() / 2 - 1)
3962        return false;
3963
3964      // Not found already - add a new Base
3965      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
3966    }
3967  }
3968
  // For each of the bases, sort the pointers by offset and check whether the
  // pointers of any base become consecutive.
3971  bool AnyConsecutive = false;
3972  for (auto &Base : Bases) {
3973    auto &Vec = Base.second;
3974    if (Vec.size() > 1) {
3975      llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
3976                                const std::tuple<Value *, int, unsigned> &Y) {
3977        return std::get<1>(X) < std::get<1>(Y);
3978      });
3979      int InitialOffset = std::get<1>(Vec[0]);
3980      AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
3981        return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
3982      });
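      // E.g., sorted offsets {4, 5, 6, 7}: InitialOffset == 4 and each offset
      // equals its index plus 4, so this base is consecutive.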
3983    }
3984  }
3985
  // Fill the SortedIndices array only if it looks worthwhile to sort the
  // pointers.
3987  SortedIndices.clear();
3988  if (!AnyConsecutive)
3989    return false;
3990
3991  for (auto &Base : Bases) {
3992    for (auto &T : Base.second)
3993      SortedIndices.push_back(std::get<2>(T));
3994  }
3995
3996  assert(SortedIndices.size() == VL.size() &&
3997         "Expected SortedIndices to be the size of VL");
3998  return true;
3999}
4000
4001std::optional<BoUpSLP::OrdersType>
4002BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4003  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4004  Type *ScalarTy = TE.Scalars[0]->getType();
4005
4006  SmallVector<Value *> Ptrs;
4007  Ptrs.reserve(TE.Scalars.size());
4008  for (Value *V : TE.Scalars) {
4009    auto *L = dyn_cast<LoadInst>(V);
4010    if (!L || !L->isSimple())
4011      return std::nullopt;
4012    Ptrs.push_back(L->getPointerOperand());
4013  }
4014
4015  BoUpSLP::OrdersType Order;
4016  if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4017    return std::move(Order);
4018  return std::nullopt;
4019}
4020
4021/// Check if two insertelement instructions are from the same buildvector.
4022static bool areTwoInsertFromSameBuildVector(
4023    InsertElementInst *VU, InsertElementInst *V,
4024    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic block.
4026  if (VU->getParent() != V->getParent())
4027    return false;
4028  // Checks if 2 insertelements are from the same buildvector.
4029  if (VU->getType() != V->getType())
4030    return false;
  // Inserts with multiple uses are separate nodes.
4032  if (!VU->hasOneUse() && !V->hasOneUse())
4033    return false;
4034  auto *IE1 = VU;
4035  auto *IE2 = V;
4036  std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4037  std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4038  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4039    return false;
4040  // Go through the vector operand of insertelement instructions trying to find
4041  // either VU as the original vector for IE2 or V as the original vector for
4042  // IE1.
4043  SmallBitVector ReusedIdx(
4044      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4045  bool IsReusedIdx = false;
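  // ReusedIdx records the insert positions already seen while walking the two
  // chains; hitting the same position twice means the inserts overwrite each
  // other and therefore cannot belong to a single buildvector sequence.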
4046  do {
4047    if (IE2 == VU && !IE1)
4048      return VU->hasOneUse();
4049    if (IE1 == V && !IE2)
4050      return V->hasOneUse();
4051    if (IE1 && IE1 != V) {
4052      unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4053      IsReusedIdx |= ReusedIdx.test(Idx1);
4054      ReusedIdx.set(Idx1);
4055      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4056        IE1 = nullptr;
4057      else
4058        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4059    }
4060    if (IE2 && IE2 != VU) {
4061      unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4062      IsReusedIdx |= ReusedIdx.test(Idx2);
4063      ReusedIdx.set(Idx2);
4064      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4065        IE2 = nullptr;
4066      else
4067        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4068    }
4069  } while (!IsReusedIdx && (IE1 || IE2));
4070  return false;
4071}
4072
4073std::optional<BoUpSLP::OrdersType>
4074BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  // No need to reorder if we need to shuffle reuses; the node still has to be
  // shuffled anyway.
4077  if (!TE.ReuseShuffleIndices.empty()) {
4078    // Check if reuse shuffle indices can be improved by reordering.
    // For this, check that the reuse mask is "clustered", i.e. each scalar
    // value is used once in each submask of size <number_of_scalars>.
4081    // Example: 4 scalar values.
4082    // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4083    //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4084    //                           element 3 is used twice in the second submask.
4085    unsigned Sz = TE.Scalars.size();
4086    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4087                                                     Sz))
4088      return std::nullopt;
4089    unsigned VF = TE.getVectorFactor();
    // Try to build the correct order for extractelement instructions.
4091    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4092                                TE.ReuseShuffleIndices.end());
4093    if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4094        all_of(TE.Scalars, [Sz](Value *V) {
4095          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4096          return Idx && *Idx < Sz;
4097        })) {
4098      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4099      if (TE.ReorderIndices.empty())
4100        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4101      else
4102        inversePermutation(TE.ReorderIndices, ReorderMask);
4103      for (unsigned I = 0; I < VF; ++I) {
4104        int &Idx = ReusedMask[I];
4105        if (Idx == PoisonMaskElem)
4106          continue;
4107        Value *V = TE.Scalars[ReorderMask[Idx]];
4108        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4109        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4110      }
4111    }
    // Build an order of VF size; the reuse shuffles need to be reordered and
    // they are always of VF size.
4114    OrdersType ResOrder(VF);
4115    std::iota(ResOrder.begin(), ResOrder.end(), 0);
4116    auto *It = ResOrder.begin();
4117    for (unsigned K = 0; K < VF; K += Sz) {
4118      OrdersType CurrentOrder(TE.ReorderIndices);
4119      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4120      if (SubMask.front() == PoisonMaskElem)
4121        std::iota(SubMask.begin(), SubMask.end(), 0);
4122      reorderOrder(CurrentOrder, SubMask);
4123      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4124      std::advance(It, Sz);
4125    }
4126    if (all_of(enumerate(ResOrder),
4127               [](const auto &Data) { return Data.index() == Data.value(); }))
4128      return std::nullopt; // No need to reorder.
4129    return std::move(ResOrder);
4130  }
4131  if ((TE.State == TreeEntry::Vectorize ||
4132       TE.State == TreeEntry::PossibleStridedVectorize) &&
4133      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4134       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4135      !TE.isAltShuffle())
4136    return TE.ReorderIndices;
4137  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
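    // Try to order the PHIs according to the position their single
    // insertelement or extractelement user occupies, so that the vectorized
    // PHI lanes line up with a common buildvector or a common extract source.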
4138    auto PHICompare = [&](unsigned I1, unsigned I2) {
4139      Value *V1 = TE.Scalars[I1];
4140      Value *V2 = TE.Scalars[I2];
4141      if (V1 == V2)
4142        return false;
4143      if (!V1->hasOneUse() || !V2->hasOneUse())
4144        return false;
4145      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4146      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4147      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4148        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4149          if (!areTwoInsertFromSameBuildVector(
4150                  IE1, IE2,
4151                  [](InsertElementInst *II) { return II->getOperand(0); }))
4152            return false;
4153          std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4154          std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4155          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4156            return false;
4157          return *Idx1 < *Idx2;
4158        }
4159      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4160        if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4161          if (EE1->getOperand(0) != EE2->getOperand(0))
4162            return false;
4163          std::optional<unsigned> Idx1 = getExtractIndex(EE1);
4164          std::optional<unsigned> Idx2 = getExtractIndex(EE2);
4165          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4166            return false;
4167          return *Idx1 < *Idx2;
4168        }
4169      return false;
4170    };
4171    auto IsIdentityOrder = [](const OrdersType &Order) {
4172      for (unsigned Idx : seq<unsigned>(0, Order.size()))
4173        if (Idx != Order[Idx])
4174          return false;
4175      return true;
4176    };
4177    if (!TE.ReorderIndices.empty())
4178      return TE.ReorderIndices;
4179    DenseMap<unsigned, unsigned> PhiToId;
4180    SmallVector<unsigned> Phis(TE.Scalars.size());
4181    std::iota(Phis.begin(), Phis.end(), 0);
4182    OrdersType ResOrder(TE.Scalars.size());
4183    for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4184      PhiToId[Id] = Id;
4185    stable_sort(Phis, PHICompare);
4186    for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4187      ResOrder[Id] = PhiToId[Phis[Id]];
4188    if (IsIdentityOrder(ResOrder))
4189      return std::nullopt; // No need to reorder.
4190    return std::move(ResOrder);
4191  }
4192  if (TE.State == TreeEntry::NeedToGather) {
4193    // TODO: add analysis of other gather nodes with extractelement
4194    // instructions and other values/instructions, not only undefs.
4195    if (((TE.getOpcode() == Instruction::ExtractElement &&
4196          !TE.isAltShuffle()) ||
4197         (all_of(TE.Scalars,
4198                 [](Value *V) {
4199                   return isa<UndefValue, ExtractElementInst>(V);
4200                 }) &&
4201          any_of(TE.Scalars,
4202                 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
4203        all_of(TE.Scalars,
4204               [](Value *V) {
4205                 auto *EE = dyn_cast<ExtractElementInst>(V);
4206                 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4207               }) &&
4208        allSameType(TE.Scalars)) {
4209      // Check that gather of extractelements can be represented as
4210      // just a shuffle of a single vector.
4211      OrdersType CurrentOrder;
4212      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4213                                   /*ResizeAllowed=*/true);
4214      if (Reuse || !CurrentOrder.empty()) {
4215        if (!CurrentOrder.empty())
4216          fixupOrderingIndices(CurrentOrder);
4217        return std::move(CurrentOrder);
4218      }
4219    }
    // If the gather node is <undef, v, .., poison> and
    // insertelement poison, v, 0 [+ permute]
    // is cheaper than
    // insertelement poison, v, n - try to reorder.
    // If the whole graph is being rotated (TopToBottom), exclude the permute
    // cost, since the whole graph might be transformed anyway.
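    // E.g., for a 4-wide gather <undef, v, undef, undef> the single
    // non-constant value sits at index 1; the order built below moves it to
    // lane 0, and the cost of "insert at lane 0 + permute" is compared
    // against the cost of "insert at lane 1".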
4226    int Sz = TE.Scalars.size();
4227    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4228        count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
4229      const auto *It =
4230          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4231      if (It == TE.Scalars.begin())
4232        return OrdersType();
4233      auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4234      if (It != TE.Scalars.end()) {
4235        OrdersType Order(Sz, Sz);
4236        unsigned Idx = std::distance(TE.Scalars.begin(), It);
4237        Order[Idx] = 0;
4238        fixupOrderingIndices(Order);
4239        SmallVector<int> Mask;
4240        inversePermutation(Order, Mask);
4241        InstructionCost PermuteCost =
4242            TopToBottom
4243                ? 0
4244                : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
4245        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4246            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4247            PoisonValue::get(Ty), *It);
4248        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4249            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4250            PoisonValue::get(Ty), *It);
4251        if (InsertFirstCost + PermuteCost < InsertIdxCost)
4252          return std::move(Order);
4253      }
4254    }
4255    if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4256      return CurrentOrder;
4257    if (TE.Scalars.size() >= 4)
4258      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4259        return Order;
4260  }
4261  return std::nullopt;
4262}
4263
4264/// Checks if the given mask is a "clustered" mask with the same clusters of
4265/// size \p Sz, which are not identity submasks.
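/// E.g., with \p Sz == 4 the mask {1, 0, 3, 2, 1, 0, 3, 2} is a repeated
/// non-identity cluster, while {0, 1, 2, 3, 0, 1, 2, 3} is not, because its
/// first cluster is the identity mask.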
4266static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4267                                               unsigned Sz) {
4268  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4269  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4270    return false;
4271  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4272    ArrayRef<int> Cluster = Mask.slice(I, Sz);
4273    if (Cluster != FirstCluster)
4274      return false;
4275  }
4276  return true;
4277}
4278
4279void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4280  // Reorder reuses mask.
4281  reorderReuses(TE.ReuseShuffleIndices, Mask);
4282  const unsigned Sz = TE.Scalars.size();
  // For vectorized nodes and non-clustered reuses there is no need to do
  // anything else.
4284  if (TE.State != TreeEntry::NeedToGather ||
4285      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4286                                                   Sz) ||
4287      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4288    return;
4289  SmallVector<int> NewMask;
4290  inversePermutation(TE.ReorderIndices, NewMask);
4291  addMask(NewMask, TE.ReuseShuffleIndices);
4292  // Clear reorder since it is going to be applied to the new mask.
4293  TE.ReorderIndices.clear();
4294  // Try to improve gathered nodes with clustered reuses, if possible.
4295  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4296  SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4297  inversePermutation(NewOrder, NewMask);
4298  reorderScalars(TE.Scalars, NewMask);
4299  // Fill the reuses mask with the identity submasks.
4300  for (auto *It = TE.ReuseShuffleIndices.begin(),
4301            *End = TE.ReuseShuffleIndices.end();
4302       It != End; std::advance(It, Sz))
4303    std::iota(It, std::next(It, Sz), 0);
4304}
4305
4306void BoUpSLP::reorderTopToBottom() {
4307  // Maps VF to the graph nodes.
4308  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
4309  // ExtractElement gather nodes which can be vectorized and need to handle
4310  // their ordering.
4311  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4312
  // Phi nodes can have a preferred ordering based on their result users.
4314  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4315
4316  // AltShuffles can also have a preferred ordering that leads to fewer
4317  // instructions, e.g., the addsub instruction in x86.
4318  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4319
4320  // Maps a TreeEntry to the reorder indices of external users.
4321  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
4322      ExternalUserReorderMap;
4323  // FIXME: Workaround for syntax error reported by MSVC buildbots.
4324  TargetTransformInfo &TTIRef = *TTI;
4325  // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering
  // of extracts.
4328  for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
4329                              &GathersToOrders, &ExternalUserReorderMap,
4330                              &AltShufflesToOrders, &PhisToOrders](
4331                                 const std::unique_ptr<TreeEntry> &TE) {
4332    // Look for external users that will probably be vectorized.
4333    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4334        findExternalStoreUsersReorderIndices(TE.get());
4335    if (!ExternalUserReorderIndices.empty()) {
4336      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4337      ExternalUserReorderMap.try_emplace(TE.get(),
4338                                         std::move(ExternalUserReorderIndices));
4339    }
4340
4341    // Patterns like [fadd,fsub] can be combined into a single instruction in
4342    // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4343    // to take into account their order when looking for the most used order.
4344    if (TE->isAltShuffle()) {
4345      VectorType *VecTy =
4346          FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4347      unsigned Opcode0 = TE->getOpcode();
4348      unsigned Opcode1 = TE->getAltOpcode();
4349      // The opcode mask selects between the two opcodes.
4350      SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4351      for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4352        if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4353          OpcodeMask.set(Lane);
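      // E.g., for the scalars [fadd, fsub, fadd, fsub] with Opcode1 == fsub,
      // lanes 1 and 3 are set in the mask.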
4354      // If this pattern is supported by the target then we consider the order.
4355      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4356        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4357        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4358      }
4359      // TODO: Check the reverse order too.
4360    }
4361
4362    if (std::optional<OrdersType> CurrentOrder =
4363            getReorderingData(*TE, /*TopToBottom=*/true)) {
      // Do not include ordering for nodes used in the alt opcode
      // vectorization; it is better to reorder them during the bottom-to-top
      // stage. If we follow the order here, it causes reordering of the whole
      // graph, though actually it is profitable just to reorder the subgraph
      // that starts from the alternate opcode vectorization node. Such nodes
      // already end up with a shuffle instruction, and it is enough to change
      // this shuffle rather than rotate the scalars of the whole graph.
4371      unsigned Cnt = 0;
4372      const TreeEntry *UserTE = TE.get();
4373      while (UserTE && Cnt < RecursionMaxDepth) {
4374        if (UserTE->UserTreeIndices.size() != 1)
4375          break;
4376        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
4377              return EI.UserTE->State == TreeEntry::Vectorize &&
4378                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4379            }))
4380          return;
4381        UserTE = UserTE->UserTreeIndices.back().UserTE;
4382        ++Cnt;
4383      }
4384      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4385      if (!(TE->State == TreeEntry::Vectorize ||
4386            TE->State == TreeEntry::PossibleStridedVectorize) ||
4387          !TE->ReuseShuffleIndices.empty())
4388        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4389      if (TE->State == TreeEntry::Vectorize &&
4390          TE->getOpcode() == Instruction::PHI)
4391        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
4392    }
4393  });
4394
4395  // Reorder the graph nodes according to their vectorization factor.
4396  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
4397       VF /= 2) {
4398    auto It = VFToOrderedEntries.find(VF);
4399    if (It == VFToOrderedEntries.end())
4400      continue;
    // Try to find the most profitable order. We are just looking for the most
    // used order and reorder the scalar elements in the nodes according to
    // this most used order.
4404    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
4405    // All operands are reordered and used only in this node - propagate the
4406    // most used order to the user node.
4407    MapVector<OrdersType, unsigned,
4408              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
4409        OrdersUses;
    // Last-chance orders - scatter vectorize. Try to use their orders if
    // there are no other orders or if the order is counted already.
4412    SmallVector<OrdersType> StridedVectorizeOrders;
4413    SmallPtrSet<const TreeEntry *, 4> VisitedOps;
4414    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; we still need to extend and use a
      // shuffle, just merge the reordering shuffle and the reuse shuffle.
4417      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4418        continue;
      // Count the number of uses of each order.
4420      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
4421                           &PhisToOrders]() -> const OrdersType & {
4422        if (OpTE->State == TreeEntry::NeedToGather ||
4423            !OpTE->ReuseShuffleIndices.empty()) {
4424          auto It = GathersToOrders.find(OpTE);
4425          if (It != GathersToOrders.end())
4426            return It->second;
4427        }
4428        if (OpTE->isAltShuffle()) {
4429          auto It = AltShufflesToOrders.find(OpTE);
4430          if (It != AltShufflesToOrders.end())
4431            return It->second;
4432        }
4433        if (OpTE->State == TreeEntry::Vectorize &&
4434            OpTE->getOpcode() == Instruction::PHI) {
4435          auto It = PhisToOrders.find(OpTE);
4436          if (It != PhisToOrders.end())
4437            return It->second;
4438        }
4439        return OpTE->ReorderIndices;
4440      }();
4441      // First consider the order of the external scalar users.
4442      auto It = ExternalUserReorderMap.find(OpTE);
4443      if (It != ExternalUserReorderMap.end()) {
4444        const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars - use the natural
        // order; this is an attempt to reorder a node with reused scalars but
        // with external uses.
4448        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
4449          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
4450              ExternalUserReorderIndices.size();
4451        } else {
4452          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
4453            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
4454        }
4455        // No other useful reorder data in this entry.
4456        if (Order.empty())
4457          continue;
4458      }
4459      // Postpone scatter orders.
4460      if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
4461        StridedVectorizeOrders.push_back(Order);
4462        continue;
4463      }
      // Stores actually store the mask, not the order; need to invert it.
4465      if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4466          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4467        SmallVector<int> Mask;
4468        inversePermutation(Order, Mask);
4469        unsigned E = Order.size();
4470        OrdersType CurrentOrder(E, E);
4471        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4472          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
4473        });
4474        fixupOrderingIndices(CurrentOrder);
4475        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
4476      } else {
4477        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4478      }
4479    }
4480    // Set order of the user node.
4481    if (OrdersUses.empty()) {
4482      if (StridedVectorizeOrders.empty())
4483        continue;
4484      // Add (potentially!) strided vectorize orders.
4485      for (OrdersType &Order : StridedVectorizeOrders)
4486        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4487    } else {
    // Account (potentially!) strided vectorize orders only if they were used
    // already.
4490      for (OrdersType &Order : StridedVectorizeOrders) {
4491        auto *It = OrdersUses.find(Order);
4492        if (It != OrdersUses.end())
4493          ++It->second;
4494      }
4495    }
4496    // Choose the most used order.
4497    ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
4498    unsigned Cnt = OrdersUses.front().second;
4499    for (const auto &Pair : drop_begin(OrdersUses)) {
4500      if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
4501        BestOrder = Pair.first;
4502        Cnt = Pair.second;
4503      }
4504    }
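    // Note that on a tie the empty (natural) order wins, which avoids
    // reordering the graph for no benefit.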
4505    // Set order of the user node.
4506    if (BestOrder.empty())
4507      continue;
4508    SmallVector<int> Mask;
4509    inversePermutation(BestOrder, Mask);
4510    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
4511    unsigned E = BestOrder.size();
4512    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
4513      return I < E ? static_cast<int>(I) : PoisonMaskElem;
4514    });
4515    // Do an actual reordering, if profitable.
4516    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
4517      // Just do the reordering for the nodes with the given VF.
4518      if (TE->Scalars.size() != VF) {
4519        if (TE->ReuseShuffleIndices.size() == VF) {
4520          // Need to reorder the reuses masks of the operands with smaller VF to
4521          // be able to find the match between the graph nodes and scalar
4522          // operands of the given node during vectorization/cost estimation.
4523          assert(all_of(TE->UserTreeIndices,
4524                        [VF, &TE](const EdgeInfo &EI) {
4525                          return EI.UserTE->Scalars.size() == VF ||
4526                                 EI.UserTE->Scalars.size() ==
4527                                     TE->Scalars.size();
4528                        }) &&
4529                 "All users must be of VF size.");
4530          // Update ordering of the operands with the smaller VF than the given
4531          // one.
4532          reorderNodeWithReuses(*TE, Mask);
4533        }
4534        continue;
4535      }
4536      if ((TE->State == TreeEntry::Vectorize ||
4537           TE->State == TreeEntry::PossibleStridedVectorize) &&
4538          isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
4539              InsertElementInst>(TE->getMainOp()) &&
4540          !TE->isAltShuffle()) {
4541        // Build correct orders for extract{element,value}, loads and
4542        // stores.
4543        reorderOrder(TE->ReorderIndices, Mask);
4544        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
4545          TE->reorderOperands(Mask);
4546      } else {
4547        // Reorder the node and its operands.
4548        TE->reorderOperands(Mask);
4549        assert(TE->ReorderIndices.empty() &&
4550               "Expected empty reorder sequence.");
4551        reorderScalars(TE->Scalars, Mask);
4552      }
4553      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply the reversed order to keep the original ordering of the
        // reused elements and avoid extra shuffling of the reorder indices.
4556        OrdersType CurrentOrder;
4557        reorderOrder(CurrentOrder, MaskOrder);
4558        SmallVector<int> NewReuses;
4559        inversePermutation(CurrentOrder, NewReuses);
4560        addMask(NewReuses, TE->ReuseShuffleIndices);
4561        TE->ReuseShuffleIndices.swap(NewReuses);
4562      }
4563    }
4564  }
4565}
4566
4567bool BoUpSLP::canReorderOperands(
4568    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
4569    ArrayRef<TreeEntry *> ReorderableGathers,
4570    SmallVectorImpl<TreeEntry *> &GatherOps) {
4571  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
4572    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
4573          return OpData.first == I &&
4574                 OpData.second->State == TreeEntry::Vectorize;
4575        }))
4576      continue;
4577    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // FIXME: Do not reorder (possibly!) strided vectorized nodes; they
      // require reordering of the operands, which is not implemented yet.
4580      if (TE->State == TreeEntry::PossibleStridedVectorize)
4581        return false;
4582      // Do not reorder if operand node is used by many user nodes.
4583      if (any_of(TE->UserTreeIndices,
4584                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
4585        return false;
4586      // Add the node to the list of the ordered nodes with the identity
4587      // order.
4588      Edges.emplace_back(I, TE);
4589      // Add ScatterVectorize nodes to the list of operands, where just
4590      // reordering of the scalars is required. Similar to the gathers, so
4591      // simply add to the list of gathered ops.
      // If there are reused scalars, process this node as a regular vectorize
      // node and just reorder the reuses mask.
4594      if (TE->State != TreeEntry::Vectorize &&
4595          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
4596        GatherOps.push_back(TE);
4597      continue;
4598    }
4599    TreeEntry *Gather = nullptr;
4600    if (count_if(ReorderableGathers,
4601                 [&Gather, UserTE, I](TreeEntry *TE) {
4602                   assert(TE->State != TreeEntry::Vectorize &&
4603                          "Only non-vectorized nodes are expected.");
4604                   if (any_of(TE->UserTreeIndices,
4605                              [UserTE, I](const EdgeInfo &EI) {
4606                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
4607                              })) {
4608                     assert(TE->isSame(UserTE->getOperand(I)) &&
4609                            "Operand entry does not match operands.");
4610                     Gather = TE;
4611                     return true;
4612                   }
4613                   return false;
4614                 }) > 1 &&
4615        !allConstant(UserTE->getOperand(I)))
4616      return false;
4617    if (Gather)
4618      GatherOps.push_back(Gather);
4619  }
4620  return true;
4621}
4622
4623void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
4624  SetVector<TreeEntry *> OrderedEntries;
4625  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4626  // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads, extracts without alternate operands
  // + some gathering of extracts.
4629  SmallVector<TreeEntry *> NonVectorized;
4630  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
4631    if (TE->State != TreeEntry::Vectorize &&
4632        TE->State != TreeEntry::PossibleStridedVectorize)
4633      NonVectorized.push_back(TE.get());
4634    if (std::optional<OrdersType> CurrentOrder =
4635            getReorderingData(*TE, /*TopToBottom=*/false)) {
4636      OrderedEntries.insert(TE.get());
4637      if (!(TE->State == TreeEntry::Vectorize ||
4638            TE->State == TreeEntry::PossibleStridedVectorize) ||
4639          !TE->ReuseShuffleIndices.empty())
4640        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4641    }
4642  }
4643
  // 1. Propagate the order to the graph nodes that use only reordered nodes.
  // I.e., if the node has operands that are reordered, try to put at least
  // one operand in the natural order and reorder the others + reorder the
  // user node itself.
4648  SmallPtrSet<const TreeEntry *, 4> Visited;
4649  while (!OrderedEntries.empty()) {
4650    // 1. Filter out only reordered nodes.
4651    // 2. If the entry has multiple uses - skip it and jump to the next node.
4652    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
4653    SmallVector<TreeEntry *> Filtered;
4654    for (TreeEntry *TE : OrderedEntries) {
4655      if (!(TE->State == TreeEntry::Vectorize ||
4656            TE->State == TreeEntry::PossibleStridedVectorize ||
4657            (TE->State == TreeEntry::NeedToGather &&
4658             GathersToOrders.count(TE))) ||
4659          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4660          !all_of(drop_begin(TE->UserTreeIndices),
4661                  [TE](const EdgeInfo &EI) {
4662                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
4663                  }) ||
4664          !Visited.insert(TE).second) {
4665        Filtered.push_back(TE);
4666        continue;
4667      }
      // Build a map between the user nodes and the order of their operands to
      // speed up the search. The graph currently does not provide this
      // dependency directly.
4670      for (EdgeInfo &EI : TE->UserTreeIndices) {
4671        TreeEntry *UserTE = EI.UserTE;
4672        auto It = Users.find(UserTE);
4673        if (It == Users.end())
4674          It = Users.insert({UserTE, {}}).first;
4675        It->second.emplace_back(EI.EdgeIdx, TE);
4676      }
4677    }
4678    // Erase filtered entries.
4679    for (TreeEntry *TE : Filtered)
4680      OrderedEntries.remove(TE);
4681    SmallVector<
4682        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
4683        UsersVec(Users.begin(), Users.end());
4684    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
4685      return Data1.first->Idx > Data2.first->Idx;
4686    });
4687    for (auto &Data : UsersVec) {
4688      // Check that operands are used only in the User node.
4689      SmallVector<TreeEntry *> GatherOps;
4690      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
4691                              GatherOps)) {
4692        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
4693          OrderedEntries.remove(Op.second);
4694        continue;
4695      }
4696      // All operands are reordered and used only in this node - propagate the
4697      // most used order to the user node.
4698      MapVector<OrdersType, unsigned,
4699                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
4700          OrdersUses;
      // Last-chance orders - scatter vectorize. Try to use their orders if
      // there are no other orders or if the order is counted already.
4703      SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though that might not
      // be profitable.
4707      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
4708      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
4709      for (const auto &Op : Data.second) {
4710        TreeEntry *OpTE = Op.second;
4711        if (!VisitedOps.insert(OpTE).second)
4712          continue;
4713        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4714          continue;
4715        const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
4716          if (OpTE->State == TreeEntry::NeedToGather ||
4717              !OpTE->ReuseShuffleIndices.empty())
4718            return GathersToOrders.find(OpTE)->second;
4719          return OpTE->ReorderIndices;
4720        }();
4721        unsigned NumOps = count_if(
4722            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
4723              return P.second == OpTE;
4724            });
4725        // Postpone scatter orders.
4726        if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
4727          StridedVectorizeOrders.emplace_back(Order, NumOps);
4728          continue;
4729        }
        // Stores actually store the mask, not the order; need to invert it.
4731        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4732            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4733          SmallVector<int> Mask;
4734          inversePermutation(Order, Mask);
4735          unsigned E = Order.size();
4736          OrdersType CurrentOrder(E, E);
4737          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4738            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
4739          });
4740          fixupOrderingIndices(CurrentOrder);
4741          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
4742              NumOps;
4743        } else {
4744          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
4745        }
4746        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
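        // Res points at the counter of the empty (natural) order; it is
        // incremented below for every user that does not itself allow
        // reordering, biasing the final choice towards keeping the natural
        // order in that case.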
4747        const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
4748                                            const TreeEntry *TE) {
4749          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4750              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
4751              (IgnoreReorder && TE->Idx == 0))
4752            return true;
4753          if (TE->State == TreeEntry::NeedToGather) {
4754            auto It = GathersToOrders.find(TE);
4755            if (It != GathersToOrders.end())
4756              return !It->second.empty();
4757            return true;
4758          }
4759          return false;
4760        };
4761        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
4762          TreeEntry *UserTE = EI.UserTE;
4763          if (!VisitedUsers.insert(UserTE).second)
4764            continue;
          // The user node may be reordered if it requires reordering itself,
          // has reused scalars, is an alternate opcode vectorize node or its
          // operand nodes require reordering.
4768          if (AllowsReordering(UserTE))
4769            continue;
          // Check if the users allow reordering.
          // Currently we look up just 1 level of operands to avoid an
          // increase in compile time.
          // Reordering is profitable if definitely more operands allow
          // reordering than operands with the natural order.
4775          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
4776          if (static_cast<unsigned>(count_if(
4777                  Ops, [UserTE, &AllowsReordering](
4778                           const std::pair<unsigned, TreeEntry *> &Op) {
4779                    return AllowsReordering(Op.second) &&
4780                           all_of(Op.second->UserTreeIndices,
4781                                  [UserTE](const EdgeInfo &EI) {
4782                                    return EI.UserTE == UserTE;
4783                                  });
4784                  })) <= Ops.size() / 2)
4785            ++Res.first->second;
4786        }
4787      }
      // If there are no orders - skip the current nodes and jump to the next
      // ones, if any.
4789      if (OrdersUses.empty()) {
4790        if (StridedVectorizeOrders.empty() ||
4791            (Data.first->ReorderIndices.empty() &&
4792             Data.first->ReuseShuffleIndices.empty() &&
4793             !(IgnoreReorder &&
4794               Data.first == VectorizableTree.front().get()))) {
4795          for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
4796            OrderedEntries.remove(Op.second);
4797          continue;
4798        }
4799        // Add (potentially!) strided vectorize orders.
4800        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
4801          OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
4802              Pair.second;
4803      } else {
        // Account (potentially!) strided vectorize orders only if they were
        // used already.
4806        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
4807          auto *It = OrdersUses.find(Pair.first);
4808          if (It != OrdersUses.end())
4809            It->second += Pair.second;
4810        }
4811      }
4812      // Choose the best order.
4813      ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
4814      unsigned Cnt = OrdersUses.front().second;
4815      for (const auto &Pair : drop_begin(OrdersUses)) {
4816        if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
4817          BestOrder = Pair.first;
4818          Cnt = Pair.second;
4819        }
4820      }
4821      // Set order of the user node (reordering of operands and user nodes).
4822      if (BestOrder.empty()) {
4823        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
4824          OrderedEntries.remove(Op.second);
4825        continue;
4826      }
4827      // Erase operands from OrderedEntries list and adjust their orders.
4828      VisitedOps.clear();
4829      SmallVector<int> Mask;
4830      inversePermutation(BestOrder, Mask);
4831      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
4832      unsigned E = BestOrder.size();
4833      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
4834        return I < E ? static_cast<int>(I) : PoisonMaskElem;
4835      });
4836      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
4837        TreeEntry *TE = Op.second;
4838        OrderedEntries.remove(TE);
4839        if (!VisitedOps.insert(TE).second)
4840          continue;
4841        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
4842          reorderNodeWithReuses(*TE, Mask);
4843          continue;
4844        }
4845        // Gathers are processed separately.
4846        if (TE->State != TreeEntry::Vectorize &&
4847            TE->State != TreeEntry::PossibleStridedVectorize &&
4848            (TE->State != TreeEntry::ScatterVectorize ||
4849             TE->ReorderIndices.empty()))
4850          continue;
4851        assert((BestOrder.size() == TE->ReorderIndices.size() ||
4852                TE->ReorderIndices.empty()) &&
4853               "Non-matching sizes of user/operand entries.");
4854        reorderOrder(TE->ReorderIndices, Mask);
4855        if (IgnoreReorder && TE == VectorizableTree.front().get())
4856          IgnoreReorder = false;
4857      }
      // For gathers we just need to reorder their scalars.
4859      for (TreeEntry *Gather : GatherOps) {
4860        assert(Gather->ReorderIndices.empty() &&
4861               "Unexpected reordering of gathers.");
4862        if (!Gather->ReuseShuffleIndices.empty()) {
4863          // Just reorder reuses indices.
4864          reorderReuses(Gather->ReuseShuffleIndices, Mask);
4865          continue;
4866        }
4867        reorderScalars(Gather->Scalars, Mask);
4868        OrderedEntries.remove(Gather);
4869      }
4870      // Reorder operands of the user node and set the ordering for the user
4871      // node itself.
4872      if (Data.first->State != TreeEntry::Vectorize ||
4873          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
4874              Data.first->getMainOp()) ||
4875          Data.first->isAltShuffle())
4876        Data.first->reorderOperands(Mask);
4877      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
4878          Data.first->isAltShuffle() ||
4879          Data.first->State == TreeEntry::PossibleStridedVectorize) {
4880        reorderScalars(Data.first->Scalars, Mask);
4881        reorderOrder(Data.first->ReorderIndices, MaskOrder);
4882        if (Data.first->ReuseShuffleIndices.empty() &&
4883            !Data.first->ReorderIndices.empty() &&
4884            !Data.first->isAltShuffle()) {
4885          // Insert user node to the list to try to sink reordering deeper in
4886          // the graph.
4887          OrderedEntries.insert(Data.first);
4888        }
4889      } else {
4890        reorderOrder(Data.first->ReorderIndices, Mask);
4891      }
4892    }
4893  }
4894  // If the reordering is unnecessary, just remove the reorder.
4895  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
4896      VectorizableTree.front()->ReuseShuffleIndices.empty())
4897    VectorizableTree.front()->ReorderIndices.clear();
4898}
4899
4900void BoUpSLP::buildExternalUses(
4901    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
4902  // Collect the values that we need to extract from the tree.
4903  for (auto &TEPtr : VectorizableTree) {
4904    TreeEntry *Entry = TEPtr.get();
4905
4906    // No need to handle users of gathered values.
4907    if (Entry->State == TreeEntry::NeedToGather)
4908      continue;
4909
4910    // For each lane:
4911    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
4912      Value *Scalar = Entry->Scalars[Lane];
4913      if (!isa<Instruction>(Scalar))
4914        continue;
4915      int FoundLane = Entry->findLaneForValue(Scalar);
4916
4917      // Check if the scalar is externally used as an extra arg.
4918      const auto *ExtI = ExternallyUsedValues.find(Scalar);
4919      if (ExtI != ExternallyUsedValues.end()) {
4920        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
4921                          << Lane << " from " << *Scalar << ".\n");
4922        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
4923      }
4924      for (User *U : Scalar->users()) {
4925        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
4926
4927        Instruction *UserInst = dyn_cast<Instruction>(U);
4928        if (!UserInst || isDeleted(UserInst))
4929          continue;
4930
4931        // Ignore users in the user ignore list.
4932        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
4933          continue;
4934
4935        // Skip in-tree scalars that become vectors
4936        if (TreeEntry *UseEntry = getTreeEntry(U)) {
4937          // Some in-tree scalars will remain as scalar in vectorized
4938          // instructions. If that is the case, the one in FoundLane will
4939          // be used.
4940          if (UseEntry->State == TreeEntry::ScatterVectorize ||
4941              UseEntry->State == TreeEntry::PossibleStridedVectorize ||
4942              !doesInTreeUserNeedToExtract(
4943                  Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
4944            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
4945                              << ".\n");
4946            assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
4947            continue;
4948          }
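          // The user is part of the tree but will still use the original
          // scalar; record the use with a null user so the scalar is
          // extracted from the vectorized value anyway.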
4949          U = nullptr;
4950        }
4951
4952        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
4953                          << " from lane " << Lane << " from " << *Scalar
4954                          << ".\n");
4955        ExternalUses.emplace_back(Scalar, U, FoundLane);
4956      }
4957    }
4958  }
4959}
4960
4961DenseMap<Value *, SmallVector<StoreInst *>>
4962BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
4963  DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
4964  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
4965    Value *V = TE->Scalars[Lane];
    // To save compilation time we don't visit values with too many users.
4967    static constexpr unsigned UsersLimit = 4;
4968    if (V->hasNUsesOrMore(UsersLimit))
4969      break;
4970
4971    // Collect stores per pointer object.
4972    for (User *U : V->users()) {
4973      auto *SI = dyn_cast<StoreInst>(U);
4974      if (SI == nullptr || !SI->isSimple() ||
4975          !isValidElementType(SI->getValueOperand()->getType()))
4976        continue;
      // Skip the store if it is already part of the tree.
4978      if (getTreeEntry(U))
4979        continue;
4980
4981      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
4982      auto &StoresVec = PtrToStoresMap[Ptr];
4983      // For now just keep one store per pointer object per lane.
4984      // TODO: Extend this to support multiple stores per pointer per lane
4985      if (StoresVec.size() > Lane)
4986        continue;
4987      // Skip if in different BBs.
4988      if (!StoresVec.empty() &&
4989          SI->getParent() != StoresVec.back()->getParent())
4990        continue;
4991      // Make sure that the stores are of the same type.
4992      if (!StoresVec.empty() &&
4993          SI->getValueOperand()->getType() !=
4994              StoresVec.back()->getValueOperand()->getType())
4995        continue;
4996      StoresVec.push_back(SI);
4997    }
4998  }
4999  return PtrToStoresMap;
5000}
5001
5002bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5003                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them and checking whether they are consecutive.
5006
5007  // To avoid calling getPointersDiff() while sorting we create a vector of
5008  // pairs {store, offset from first} and sort this instead.
5009  SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5010  StoreInst *S0 = StoresVec[0];
5011  StoreOffsetVec[0] = {S0, 0};
5012  Type *S0Ty = S0->getValueOperand()->getType();
5013  Value *S0Ptr = S0->getPointerOperand();
5014  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5015    StoreInst *SI = StoresVec[Idx];
5016    std::optional<int> Diff =
5017        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5018                        SI->getPointerOperand(), *DL, *SE,
5019                        /*StrictCheck=*/true);
5020    // We failed to compare the pointers so just abandon this StoresVec.
5021    if (!Diff)
5022      return false;
5023    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5024  }
5025
  // Sort the vector based on the pointer offsets. We sort a separate copy
  // because we may need the original order later for calculating the reorder
  // (shuffle) indices.
5028  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5029                                 const std::pair<StoreInst *, int> &Pair2) {
5030    int Offset1 = Pair1.second;
5031    int Offset2 = Pair2.second;
5032    return Offset1 < Offset2;
5033  });
5034
5035  // Check if the stores are consecutive by checking if their difference is 1.
5036  for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5037    if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5038      return false;
5039
5040  // Calculate the shuffle indices according to their offset against the sorted
5041  // StoreOffsetVec.
5042  ReorderIndices.reserve(StoresVec.size());
5043  for (StoreInst *SI : StoresVec) {
5044    unsigned Idx = find_if(StoreOffsetVec,
5045                           [SI](const std::pair<StoreInst *, int> &Pair) {
5046                             return Pair.first == SI;
5047                           }) -
5048                   StoreOffsetVec.begin();
5049    ReorderIndices.push_back(Idx);
5050  }
5051  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5052  // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5053  // same convention here.
5054  auto IsIdentityOrder = [](const OrdersType &Order) {
5055    for (unsigned Idx : seq<unsigned>(0, Order.size()))
5056      if (Idx != Order[Idx])
5057        return false;
5058    return true;
5059  };
5060  if (IsIdentityOrder(ReorderIndices))
5061    ReorderIndices.clear();
5062
5063  return true;
5064}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif

SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
      collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and push
  // it into `ExternalReorderIndices`.
  for (const auto &Pair : PtrToStoresMap) {
    auto &StoresVec = Pair.second;
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;

    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;

    // We now know that the scalars in StoresVec can form a vector instruction,
    // so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

/// \return true if the specified list of values has only one instruction that
/// requires scheduling, false otherwise.
#ifndef NDEBUG
static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return false;
  }
  return NeedsScheduling;
}
#endif

/// Generates a key/subkey pair for the given value to provide effective
/// sorting of the values and better detection of vectorizable value sequences.
/// Keys are used to sort the values themselves; subkeys are used to sort
/// values within the resulting key subgroups.
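/// For example (illustrative only), simple loads share a key based on the
/// Load opcode and the loaded type, while the caller-provided
/// \p LoadsSubkeyGenerator refines the subkey (e.g., by the distance between
/// the load pointers); non-simple (volatile/atomic) loads hash to themselves
/// and therefore never group with other values.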
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
    if (LI->isSimple())
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      Key = SubKey = hash_value(LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(V))
      Key = hash_value(Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes, except for CmpInst, for
    // which we also sort by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      if (AllowAlternate)
        Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
      else
        Key = hash_combine(hash_value(I->getOpcode()), Key);
      SubKey = hash_combine(
          hash_value(I->getOpcode()), hash_value(I->getType()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
                            hash_value(SwapPred),
                            hash_value(CI->getOperand(0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
      if (isTriviallyVectorizable(ID)) {
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
      } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
        SubKey = hash_combine(hash_value(I->getOpcode()),
                              hash_value(Call->getCalledFunction()));
      } else {
        Key = hash_combine(hash_value(Call), Key);
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
      }
      for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
        SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
                              hash_value(Op.Tag), SubKey);
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
               !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey = hash_value(I->getOpcode());
    }
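    // Mix the parent block into the key so that instructions from different
    // basic blocks never end up in the same candidate group.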
    Key = hash_combine(hash_value(I->getParent()), Key);
  }
  return std::make_pair(Key, SubKey);
}

/// Checks if the specified instruction \p I is an alternate operation for
/// the given \p MainOp and \p AltOp instructions.
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);

BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
  assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto *VL0 = cast<Instruction>(S.OpValue);
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL)
      for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }

    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getInsertIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }

    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>},
    // LLVM treats loading/storing it as an i8 struct. If we vectorize
    // loads/stores from such a struct, we read/write packed bits disagreeing
    // with the unvectorized version.
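    // canVectorizeLoads() classifies the bundle: consecutive loads become a
    // regular vector load, non-consecutive but addressable loads become a
    // masked gather (ScatterVectorize), loads that might form a strided
    // access are marked PossibleStridedVectorize, and anything else has to be
    // gathered.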
    switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI, CurrentOrder,
                              PointerOps)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::ScatterVectorize:
      return TreeEntry::ScatterVectorize;
    case LoadsState::PossibleStridedVectorize:
      return TreeEntry::PossibleStridedVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL,
                      [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      CmpInst *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We don't combine GEPs with non-constant indexes.
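    // Non-constant indices are only tolerated when the user node is a
    // scatter/strided vectorize node, and even then only if they all have the
    // same type as the index in VL0. Constant indices of a different type are
    // still acceptable as long as they are not wider than the target's pointer
    // index type (they are cast to a common type later).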
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }

    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
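      // Dist is the distance, in elements, between the first and the last
      // pointer in sorted order, so a value of VL.size() - 1 means the stores
      // cover a dense, consecutive range with no gaps.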
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        CI->getFunctionType(),
        ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments, and those arguments must be
      // the same across the whole bundle for it to be vectorizable.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    // If this is not an alternate sequence of opcodes like add-sub,
    // then do not vectorize this instruction.
    if (!S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}

void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndicies;
  SmallVector<Value *> UniqueValues;
  SmallVector<Value *> NonUniqueValueVL;
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // Check that every instruction appears once in this bundle.
    DenseMap<Value *, unsigned> UniquePositions(VL.size());
    for (Value *V : VL) {
      if (isConstant(V)) {
        ReuseShuffleIndicies.emplace_back(
            isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
        UniqueValues.emplace_back(V);
        continue;
      }
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndicies.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    if (NumUniqueScalarValues == VL.size()) {
      ReuseShuffleIndicies.clear();
    } else {
      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
      if (NumUniqueScalarValues <= 1 ||
          (UniquePositions.size() == 1 && all_of(UniqueValues,
                                                 [](Value *V) {
                                                   return isa<UndefValue>(V) ||
                                                          !isConstant(V);
                                                 })) ||
          !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
            all_of(UniqueValues, [=](Value *V) {
              return isa<ExtractElementInst>(V) ||
                     areAllUsersVectorized(cast<Instruction>(V),
                                           UserIgnoreList);
            })) {
          unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
          if (PWSz == VL.size()) {
            ReuseShuffleIndicies.clear();
          } else {
            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
            NonUniqueValueVL.append(PWSz - UniqueValues.size(),
                                    UniqueValues.back());
            VL = NonUniqueValueVL;
          }
          return true;
        }
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      VL = UniqueValues;
    }
    return true;
  };
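  // Note: when TryToFindDuplicates() succeeds, VL may have been replaced by
  // the list of unique scalars, and ReuseShuffleIndicies maps each original
  // lane to its position in that list. For example (illustrative only), for
  // VL = {a, b, a, b} we get UniqueValues = {a, b} and
  // ReuseShuffleIndicies = {0, 1, 0, 1}.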

  InstructionsState S = getSameOpcode(VL, *TLI);

  // Don't vectorize ephemeral values.
  if (!EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return;
      }
    }
  }

  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext
  // of a load), in which case peek through to include it in the tree, without
  // ballooning over-budget.
  if (Depth >= RecursionMaxDepth &&
      !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
        VL.size() >= 4 &&
        (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() ==
                      cast<Instruction>(S.MainOp)->getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndicies);
    return;
  }

  // Don't handle scalable vectors.
  if (S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndicies);
    return;
  }

  // Don't handle vectors.
  if (S.OpValue->getType()->isVectorTy() &&
      !isa<InsertElementInst>(S.OpValue)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
      return;
    }

  // If all of the operands are identical or constant we have a simple
  // solution.
  // If we deal with insert/extract instructions, they all must have constant
  // indices, otherwise we should gather them, not try to vectorize.
  // If this is an alternate-opcode node with only 2 elements and its operands
  // would have to be gathered, do not vectorize (see the heuristic below).
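  // Roughly, the heuristic below rejects 2-element alternate-opcode bundles
  // (once the tree has reached MinTreeSize) whose operands neither look like
  // instructions that could form further bundles nor score as splat-like
  // pairs, since such tiny alternate nodes are unlikely to pay for the extra
  // shuffle they require.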
  auto &&NotProfitableForVectorization = [&S, this,
                                          Depth](ArrayRef<Value *> VL) {
    if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, are part of a vector node or can
    // build a regular vectorizable node.
    SmallVector<unsigned, 2> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.MainOp->getNumOperands() / 2)
      return false;
    if (S.MainOp->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(
              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
              }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
       UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize);
  bool AreAllSameInsts =
      (S.getOpcode() && allSameBlock(VL)) ||
      (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
                       SortedIndices));
  if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
      (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.OpValue) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndicies);
    return;
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Check if this is a duplicate of another entry.
  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
    if (!E->isSame(VL)) {
      auto It = MultiNodeScalars.find(S.OpValue);
      if (It != MultiNodeScalars.end()) {
        auto *TEIt = find_if(It->getSecond(),
                             [&](TreeEntry *ME) { return ME->isSame(VL); });
        if (TEIt != It->getSecond().end())
          E = *TEIt;
        else
          E = nullptr;
      } else {
        E = nullptr;
      }
    }
    if (!E) {
      if (!doesNotNeedToBeScheduled(S.OpValue)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndicies);
        return;
      }
    } else {
      // Record the reuse of the tree node. FIXME: currently this is only used
      // to properly draw the graph rather than for the actual vectorization.
      E->UserTreeIndices.push_back(UserTreeIdx);
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
                        << ".\n");
      return;
    }
  }

  // Check that none of the instructions in the bundle are already in the tree.
  for (Value *V : VL) {
    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
        doesNotNeedToBeScheduled(V))
      continue;
    if (getTreeEntry(V)) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                        << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndicies);
      return;
    }
  }

  // The reduction nodes (stored in UserIgnoreList) should also stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList && UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndicies);
        return;
      }
    }
  }

  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (AreAllSameInsts && UserTreeIdx.UserTE &&
      (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
       UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) &&
      !(S.getOpcode() && allSameBlock(VL))) {
    assert(S.OpValue->getType()->isPointerTy() &&
           count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
               2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It =
        find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  auto *VL0 = cast<Instruction>(S.OpValue);
  BB = VL0->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch.  There is no
  // place to insert a shuffle if we need to, so just avoid that issue.
  if (isa<CatchSwitchInst>(BB->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
    return;

  // Perform specific checks for each particular instruction kind.
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndicies);
    return;
  }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleData *> Bundle =
      BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!Bundle) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndicies);
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp = S.isAltShuffle() ?
                (unsigned) Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(VL0);

      TreeEntry *TE =
          newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

      // Keeps the reordered operands to avoid code duplication.
      SmallVector<ValueList, 2> OperandsVec;
      for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
        if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
          ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
          TE->setOperand(I, Operands);
          OperandsVec.push_back(Operands);
          continue;
        }
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *V : VL)
          Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
              PH->getIncomingBlock(I)));
        TE->setOperand(I, Operands);
        OperandsVec.push_back(Operands);
      }
      for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
        buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement: {
      if (CurrentOrder.empty()) {
        LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
        newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndicies);
        // This is a special case, as it does not gather, but at the same time
        // we are not extending buildTree_rec() towards the operands.
        ValueList Op0;
        Op0.assign(VL.size(), VL0->getOperand(0));
        VectorizableTree.back()->setOperand(0, Op0);
        return;
      }
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
      fixupOrderingIndices(CurrentOrder);
      // Create the vectorized entry; CurrentOrder records how the extracts
      // must be reordered.
      newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndicies, CurrentOrder);
      // This is a special case, as it does not gather, but at the same time
      // we are not extending buildTree_rec() towards the operands.
      ValueList Op0;
      Op0.assign(VL.size(), VL0->getOperand(0));
      VectorizableTree.back()->setOperand(0, Op0);
      return;
    }
    case Instruction::InsertElement: {
      assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");

      auto OrdCompare = [](const std::pair<int, int> &P1,
                           const std::pair<int, int> &P2) {
        return P1.first > P2.first;
      };
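      // With this comparator the priority queue behaves as a min-heap on the
      // insert index, so inserts are visited in increasing index order below.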
      PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                    decltype(OrdCompare)>
          Indices(OrdCompare);
      for (int I = 0, E = VL.size(); I < E; ++I) {
        unsigned Idx = *getInsertIndex(VL[I]);
        Indices.emplace(Idx, I);
      }
      OrdersType CurrentOrder(VL.size(), VL.size());
      bool IsIdentity = true;
      for (int I = 0, E = VL.size(); I < E; ++I) {
        CurrentOrder[Indices.top().second] = I;
        IsIdentity &= Indices.top().second == I;
        Indices.pop();
      }
      if (IsIdentity)
        CurrentOrder.clear();
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   std::nullopt, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");

      constexpr int NumOps = 2;
      ValueList VectorOperands[NumOps];
      for (int I = 0; I < NumOps; ++I) {
        for (Value *V : VL)
          VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));

        TE->setOperand(I, VectorOperands[I]);
      }
      buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
      return;
    }
    case Instruction::Load: {
      // Check that a vectorized load would load the same memory as a scalar
      // load. For example, we don't want to vectorize loads that are smaller
      // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>},
      // LLVM treats loading/storing it as an i8 struct. If we vectorize
      // loads/stores from such a struct, we read/write packed bits disagreeing
      // with the unvectorized version.
      TreeEntry *TE = nullptr;
      fixupOrderingIndices(CurrentOrder);
      switch (State) {
      case TreeEntry::Vectorize:
        if (CurrentOrder.empty()) {
          // Original loads are consecutive and do not require reordering.
          TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                            ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
        } else {
          // Need to reorder.
          TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                            ReuseShuffleIndicies, CurrentOrder);
          LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
        }
        TE->setOperandsInOrder();
        break;
      case TreeEntry::PossibleStridedVectorize:
        // Vectorizing non-consecutive loads, possibly as strided loads or
        // with `llvm.masked.gather`.
        if (CurrentOrder.empty()) {
          TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
                            UserTreeIdx, ReuseShuffleIndicies);
        } else {
          TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
                            UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
        }
        TE->setOperandsInOrder();
        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
        LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
        break;
      case TreeEntry::ScatterVectorize:
        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
        TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndicies);
        TE->setOperandsInOrder();
        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
        LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
        break;
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected loads state.");
      }
      return;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");

      TE->setOperandsInOrder();
      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *V : VL)
          Operands.push_back(cast<Instruction>(V)->getOperand(I));

        buildTree_rec(Operands, Depth + 1, {TE, I});
      }
      return;
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Check that all of the compares have the same predicate.
      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");

      ValueList Left, Right;
      if (cast<CmpInst>(VL0)->isCommutative()) {
        // Commutative predicate - collect + sort operands of the instructions
        // so that each side is more likely to have the same opcode.
        assert(P0 == CmpInst::getSwappedPredicate(P0) &&
               "Commutative Predicate mismatch");
        reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this);
      } else {
        // Collect operands - commute if it uses the swapped predicate.
        for (Value *V : VL) {
          auto *Cmp = cast<CmpInst>(V);
          Value *LHS = Cmp->getOperand(0);
          Value *RHS = Cmp->getOperand(1);
          if (Cmp->getPredicate() != P0)
            std::swap(LHS, RHS);
          Left.push_back(LHS);
          Right.push_back(RHS);
        }
      }
      TE->setOperand(0, Left);
      TE->setOperand(1, Right);
      buildTree_rec(Left, Depth + 1, {TE, 0});
      buildTree_rec(Right, Depth + 1, {TE, 1});
      return;
    }
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");

      // Sort operands of the instructions so that each side is more likely to
      // have the same opcode.
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this);
        TE->setOperand(0, Left);
        TE->setOperand(1, Right);
        buildTree_rec(Left, Depth + 1, {TE, 0});
        buildTree_rec(Right, Depth + 1, {TE, 1});
        return;
      }

      TE->setOperandsInOrder();
      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *V : VL)
          Operands.push_back(cast<Instruction>(V)->getOperand(I));

        buildTree_rec(Operands, Depth + 1, {TE, I});
      }
      return;
    }
    case Instruction::GetElementPtr: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
      SmallVector<ValueList, 2> Operands(2);
      // Prepare the operand vector for pointer operands.
      for (Value *V : VL) {
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        if (!GEP) {
          Operands.front().push_back(V);
          continue;
        }
        Operands.front().push_back(GEP->getPointerOperand());
      }
      TE->setOperand(0, Operands.front());
      // Need to cast all indices to the same type before vectorization to
      // avoid a crash.
      // Required to be able to find correct matches between different gather
      // nodes and reuse the vectorized values rather than trying to gather
      // them again.
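      // For example (illustrative only), if one GEP in the bundle indexes
      // with an i32 constant and another with an i64 constant, the constants
      // are folded to the common index type chosen below.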
      int IndexIdx = 1;
      Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
      Type *Ty = all_of(VL,
                        [VL0Ty, IndexIdx](Value *V) {
                          auto *GEP = dyn_cast<GetElementPtrInst>(V);
                          if (!GEP)
                            return true;
                          return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                        })
                     ? VL0Ty
                     : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                                            ->getPointerOperandType()
                                            ->getScalarType());
      // Prepare the operand vector.
      for (Value *V : VL) {
        auto *I = dyn_cast<GetElementPtrInst>(V);
        if (!I) {
          Operands.back().push_back(
              ConstantInt::get(Ty, 0, /*isSigned=*/false));
          continue;
        }
        auto *Op = I->getOperand(IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Op);
        if (!CI)
          Operands.back().push_back(Op);
        else
          Operands.back().push_back(ConstantFoldIntegerCast(
              CI, Ty, CI->getValue().isSignBitSet(), *DL));
      }
      TE->setOperand(IndexIdx, Operands.back());

      for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
        buildTree_rec(Operands[I], Depth + 1, {TE, I});
      return;
    }
    case Instruction::Store: {
      // Check if the stores are consecutive or if we need to swizzle them.
      ValueList Operands(VL.size());
      auto *OIter = Operands.begin();
      for (Value *V : VL) {
        auto *SI = cast<StoreInst>(V);
        *OIter = SI->getValueOperand();
        ++OIter;
      }
      // The pointer operands were already verified to be consecutive (possibly
      // after reordering) in getScalarsVectorizationState(); CurrentOrder
      // holds the required reordering, if any.
      if (CurrentOrder.empty()) {
        // Original stores are consecutive and do not require reordering.
        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                     ReuseShuffleIndicies);
        TE->setOperandsInOrder();
        buildTree_rec(Operands, Depth + 1, {TE, 0});
        LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
      } else {
        fixupOrderingIndices(CurrentOrder);
        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                     ReuseShuffleIndicies, CurrentOrder);
        TE->setOperandsInOrder();
        buildTree_rec(Operands, Depth + 1, {TE, 0});
        LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
      }
      return;
    }
    case Instruction::Call: {
      // Check if the calls are all to the same vectorizable intrinsic or
      // library function.
      CallInst *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndicies);
      TE->setOperandsInOrder();
      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
        // For scalar arguments there is no need to create an operand entry
        // since they are not vectorized.
        if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
          continue;
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *V : VL) {
          auto *CI2 = cast<CallInst>(V);
          Operands.push_back(CI2->getArgOperand(I));
        }
        buildTree_rec(Operands, Depth + 1, {TE, I});
      }
      return;
    }
    case Instruction::ShuffleVector: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

      // Reorder operands if reordering would enable vectorization.
      auto *CI = dyn_cast<CmpInst>(VL0);
      if (isa<BinaryOperator>(VL0) || CI) {
        ValueList Left, Right;
        if (!CI || all_of(VL, [](Value *V) {
              return cast<CmpInst>(V)->isCommutative();
            })) {
          reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE,
                                         *this);
        } else {
          auto *MainCI = cast<CmpInst>(S.MainOp);
          auto *AltCI = cast<CmpInst>(S.AltOp);
          CmpInst::Predicate MainP = MainCI->getPredicate();
          CmpInst::Predicate AltP = AltCI->getPredicate();
          assert(MainP != AltP &&
                 "Expected different main/alternate predicates.");
          // Collect operands - commute if it uses the swapped predicate or
          // alternate operation.
          for (Value *V : VL) {
            auto *Cmp = cast<CmpInst>(V);
            Value *LHS = Cmp->getOperand(0);
            Value *RHS = Cmp->getOperand(1);

            if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
              if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
                std::swap(LHS, RHS);
            } else {
              if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
                std::swap(LHS, RHS);
            }
            Left.push_back(LHS);
            Right.push_back(RHS);
          }
        }
        TE->setOperand(0, Left);
        TE->setOperand(1, Right);
        buildTree_rec(Left, Depth + 1, {TE, 0});
        buildTree_rec(Right, Depth + 1, {TE, 1});
        return;
      }

      TE->setOperandsInOrder();
      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *V : VL)
          Operands.push_back(cast<Instruction>(V)->getOperand(I));

        buildTree_rec(Operands, Depth + 1, {TE, I});
      }
      return;
    }
    default:
      break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}

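// Note: canMapToVector() returns the number of scalar elements the aggregate
// type T flattens to when it can be treated as a vector, or 0 otherwise. For
// example (illustrative only), a homogeneous struct of two [2 x float] arrays
// maps to 4 float elements, provided the resulting vector matches the store
// size of T and fits within the vector register size limits.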
6319unsigned BoUpSLP::canMapToVector(Type *T) const {
6320  unsigned N = 1;
6321  Type *EltTy = T;
6322
6323  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
6324    if (auto *ST = dyn_cast<StructType>(EltTy)) {
6325      // Check that struct is homogeneous.
6326      for (const auto *Ty : ST->elements())
6327        if (Ty != *ST->element_begin())
6328          return 0;
6329      N *= ST->getNumElements();
6330      EltTy = *ST->element_begin();
6331    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
6332      N *= AT->getNumElements();
6333      EltTy = AT->getElementType();
6334    } else {
6335      auto *VT = cast<FixedVectorType>(EltTy);
6336      N *= VT->getNumElements();
6337      EltTy = VT->getElementType();
6338    }
6339  }
6340
6341  if (!isValidElementType(EltTy))
6342    return 0;
6343  uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
6344  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
6345      VTSize != DL->getTypeStoreSizeInBits(T))
6346    return 0;
6347  return N;
6348}
6349
6350bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
6351                              SmallVectorImpl<unsigned> &CurrentOrder,
6352                              bool ResizeAllowed) const {
6353  const auto *It = find_if(VL, [](Value *V) {
6354    return isa<ExtractElementInst, ExtractValueInst>(V);
6355  });
6356  assert(It != VL.end() && "Expected at least one extract instruction.");
6357  auto *E0 = cast<Instruction>(*It);
6358  assert(all_of(VL,
6359                [](Value *V) {
6360                  return isa<UndefValue, ExtractElementInst, ExtractValueInst>(
6361                      V);
6362                }) &&
6363         "Invalid opcode");
6364  // Check if all of the extracts come from the same vector and from the
6365  // correct offset.
6366  Value *Vec = E0->getOperand(0);
6367
6368  CurrentOrder.clear();
6369
  // We have to extract from a vector/aggregate with the same number of
  // elements.
6371  unsigned NElts;
6372  if (E0->getOpcode() == Instruction::ExtractValue) {
6373    NElts = canMapToVector(Vec->getType());
6374    if (!NElts)
6375      return false;
6376    // Check if load can be rewritten as load of vector.
6377    LoadInst *LI = dyn_cast<LoadInst>(Vec);
6378    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
6379      return false;
6380  } else {
6381    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
6382  }
6383
6384  unsigned E = VL.size();
6385  if (!ResizeAllowed && NElts != E)
6386    return false;
6387  SmallVector<int> Indices(E, PoisonMaskElem);
6388  unsigned MinIdx = NElts, MaxIdx = 0;
6389  for (auto [I, V] : enumerate(VL)) {
6390    auto *Inst = dyn_cast<Instruction>(V);
6391    if (!Inst)
6392      continue;
6393    if (Inst->getOperand(0) != Vec)
6394      return false;
6395    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
6396      if (isa<UndefValue>(EE->getIndexOperand()))
6397        continue;
6398    std::optional<unsigned> Idx = getExtractIndex(Inst);
6399    if (!Idx)
6400      return false;
6401    const unsigned ExtIdx = *Idx;
6402    if (ExtIdx >= NElts)
6403      continue;
6404    Indices[I] = ExtIdx;
6405    if (MinIdx > ExtIdx)
6406      MinIdx = ExtIdx;
6407    if (MaxIdx < ExtIdx)
6408      MaxIdx = ExtIdx;
6409  }
6410  if (MaxIdx - MinIdx + 1 > E)
6411    return false;
6412  if (MaxIdx + 1 <= E)
6413    MinIdx = 0;
6414
6415  // Check that all of the indices extract from the correct offset.
6416  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E.
6422  CurrentOrder.assign(E, E);
6423  for (unsigned I = 0; I < E; ++I) {
6424    if (Indices[I] == PoisonMaskElem)
6425      continue;
6426    const unsigned ExtIdx = Indices[I] - MinIdx;
6427    if (CurrentOrder[ExtIdx] != E) {
6428      CurrentOrder.clear();
6429      return false;
6430    }
6431    ShouldKeepOrder &= ExtIdx == I;
6432    CurrentOrder[ExtIdx] = I;
6433  }
6434  if (ShouldKeepOrder)
6435    CurrentOrder.clear();
6436
6437  return ShouldKeepOrder;
6438}
6439
6440bool BoUpSLP::areAllUsersVectorized(
6441    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
6442  return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
6443         all_of(I->users(), [this](User *U) {
6444           return ScalarToTreeEntry.contains(U) ||
6445                  isVectorLikeInstWithConstOps(U) ||
6446                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
6447         });
6448}
6449
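/// For example (assuming the target maps the scalar calls to the fabs
/// intrinsic and the bundle has 4 lanes): the first cost is that of the
/// <4 x float> fabs intrinsic, and the second one is the cost of a matching
/// vector library call if VFDatabase provides one; otherwise it equals the
/// intrinsic cost.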
6450static std::pair<InstructionCost, InstructionCost>
6451getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
6452                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
6453  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6454
6455  // Calculate the cost of the scalar and vector calls.
6456  SmallVector<Type *, 4> VecTys;
6457  for (Use &Arg : CI->args())
6458    VecTys.push_back(
6459        FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
6460  FastMathFlags FMF;
6461  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
6462    FMF = FPCI->getFastMathFlags();
6463  SmallVector<const Value *> Arguments(CI->args());
6464  IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
6465                                    dyn_cast<IntrinsicInst>(CI));
6466  auto IntrinsicCost =
6467    TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
6468
6469  auto Shape = VFShape::get(CI->getFunctionType(),
6470                            ElementCount::getFixed(VecTy->getNumElements()),
6471                            false /*HasGlobalPred*/);
6472  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6473  auto LibCost = IntrinsicCost;
6474  if (!CI->isNoBuiltin() && VecFunc) {
6475    // Calculate the cost of the vector library call.
6476    // If the corresponding vector call is cheaper, return its cost.
6477    LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
6478                                    TTI::TCK_RecipThroughput);
6479  }
6480  return {IntrinsicCost, LibCost};
6481}
6482
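/// Illustrative example (hypothetical bundle): for Scalars = {add0, sub1,
/// add2, sub3} with \p IsAltOp matching the subs and no reordering or reused
/// indices, the resulting Mask is <0, Sz + 1, 2, Sz + 3>, i.e. <0, 5, 2, 7>
/// for Sz == 4: main-op lanes come from the first vector and alternate-op
/// lanes from the second one.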
6483void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
6484    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
6485    SmallVectorImpl<Value *> *OpScalars,
6486    SmallVectorImpl<Value *> *AltScalars) const {
6487  unsigned Sz = Scalars.size();
6488  Mask.assign(Sz, PoisonMaskElem);
6489  SmallVector<int> OrderMask;
6490  if (!ReorderIndices.empty())
6491    inversePermutation(ReorderIndices, OrderMask);
6492  for (unsigned I = 0; I < Sz; ++I) {
6493    unsigned Idx = I;
6494    if (!ReorderIndices.empty())
6495      Idx = OrderMask[I];
6496    auto *OpInst = cast<Instruction>(Scalars[Idx]);
6497    if (IsAltOp(OpInst)) {
6498      Mask[I] = Sz + Idx;
6499      if (AltScalars)
6500        AltScalars->push_back(OpInst);
6501    } else {
6502      Mask[I] = Idx;
6503      if (OpScalars)
6504        OpScalars->push_back(OpInst);
6505    }
6506  }
6507  if (!ReuseShuffleIndices.empty()) {
6508    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
6509    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
6510      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
6511    });
6512    Mask.swap(NewMask);
6513  }
6514}
6515
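/// Informal example: with MainOp an fadd and AltOp an fsub, any fsub in the
/// bundle is the alternate operation (plain opcode comparison). For compares,
/// an instruction matching MainOp (same or swapped predicate and operands) is
/// main, one matching AltOp is alternate; failing both checks, it is treated
/// as alternate only if its predicate is neither the main predicate nor its
/// swapped form.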
6516static bool isAlternateInstruction(const Instruction *I,
6517                                   const Instruction *MainOp,
6518                                   const Instruction *AltOp,
6519                                   const TargetLibraryInfo &TLI) {
6520  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
6521    auto *AltCI = cast<CmpInst>(AltOp);
6522    CmpInst::Predicate MainP = MainCI->getPredicate();
6523    CmpInst::Predicate AltP = AltCI->getPredicate();
6524    assert(MainP != AltP && "Expected different main/alternate predicates.");
6525    auto *CI = cast<CmpInst>(I);
6526    if (isCmpSameOrSwapped(MainCI, CI, TLI))
6527      return false;
6528    if (isCmpSameOrSwapped(AltCI, CI, TLI))
6529      return true;
6530    CmpInst::Predicate P = CI->getPredicate();
6531    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
6532
6533    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
6534           "CmpInst expected to match either main or alternate predicate or "
6535           "their swap.");
6536    (void)AltP;
6537    return MainP != P && MainP != SwappedP;
6538  }
6539  return I->getOpcode() == AltOp->getOpcode();
6540}
6541
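/// For example (illustrative operand bundles): {4, 4, 4, 4} yields
/// {OK_UniformConstantValue, OP_PowerOf2}, while {1, 2, 3, 4} yields
/// {OK_NonUniformConstantValue, OP_None} since 3 is not a power of 2.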
6542TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
6543  assert(!Ops.empty());
6544  const auto *Op0 = Ops.front();
6545
6546  const bool IsConstant = all_of(Ops, [](Value *V) {
6547    // TODO: We should allow undef elements here
6548    return isConstant(V) && !isa<UndefValue>(V);
6549  });
6550  const bool IsUniform = all_of(Ops, [=](Value *V) {
6551    // TODO: We should allow undef elements here
6552    return V == Op0;
6553  });
6554  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
6555    // TODO: We should allow undef elements here
6556    if (auto *CI = dyn_cast<ConstantInt>(V))
6557      return CI->getValue().isPowerOf2();
6558    return false;
6559  });
6560  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
6561    // TODO: We should allow undef elements here
6562    if (auto *CI = dyn_cast<ConstantInt>(V))
6563      return CI->getValue().isNegatedPowerOf2();
6564    return false;
6565  });
6566
6567  TTI::OperandValueKind VK = TTI::OK_AnyValue;
6568  if (IsConstant && IsUniform)
6569    VK = TTI::OK_UniformConstantValue;
6570  else if (IsConstant)
6571    VK = TTI::OK_NonUniformConstantValue;
6572  else if (IsUniform)
6573    VK = TTI::OK_UniformValue;
6574
6575  TTI::OperandValueProperties VP = TTI::OP_None;
6576  VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
6577  VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
6578
6579  return {VK, VP};
6580}
6581
6582namespace {
6583/// The base class for shuffle instruction emission and shuffle cost estimation.
6584class BaseShuffleAnalysis {
6585protected:
6586  /// Checks if the mask is an identity mask.
  /// \param IsStrict if true, the function returns false if the mask size
  /// does not match the vector size.
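  /// For example, for a <4 x i32> source, mask <0, 1, 2, 3> is an identity in
  /// both modes, while mask <0, 1> (an extract of the low subvector) is
  /// accepted only when \p IsStrict is false.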
6589  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
6590                             bool IsStrict) {
6591    int Limit = Mask.size();
6592    int VF = VecTy->getNumElements();
6593    int Index = -1;
6594    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
6595      return true;
6596    if (!IsStrict) {
6597      // Consider extract subvector starting from index 0.
6598      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
6599          Index == 0)
6600        return true;
6601      // All VF-size submasks are identity (e.g.
6602      // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
6603      if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
6604            ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
6605            return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
6606                   ShuffleVectorInst::isIdentityMask(Slice, VF);
6607          }))
6608        return true;
6609    }
6610    return false;
6611  }
6612
  /// Tries to combine 2 different masks into a single one.
6614  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
6615  /// change the size of the vector, \p LocalVF is the original size of the
6616  /// shuffled vector.
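  /// For example, with \p LocalVF == 4, Mask = <1, 0, 3, 2> and
  /// ExtMask = <2, poison, 0, 1>, the combined mask is <3, poison, 1, 0>.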
6617  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
6618                           ArrayRef<int> ExtMask) {
6619    unsigned VF = Mask.size();
6620    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
6621    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
6622      if (ExtMask[I] == PoisonMaskElem)
6623        continue;
6624      int MaskedIdx = Mask[ExtMask[I] % VF];
6625      NewMask[I] =
6626          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
6627    }
6628    Mask.swap(NewMask);
6629  }
6630
  /// Looks through shuffles trying to reduce the final number of shuffles in
  /// the code. The function looks through the previously emitted shuffle
  /// instructions and properly marks indices in the mask as undef.
6634  /// For example, given the code
6635  /// \code
6636  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
6637  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
6638  /// \endcode
  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>,
  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
6642  /// If 2 operands are of different size, the smallest one will be resized and
6643  /// the mask recalculated properly.
6644  /// For example, given the code
6645  /// \code
6646  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
6647  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
6648  /// \endcode
  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// So, it tries to transform permutations into a simple vector merge, if
  /// possible.
6654  /// \param V The input vector which must be shuffled using the given \p Mask.
6655  /// If the better candidate is found, \p V is set to this best candidate
6656  /// vector.
6657  /// \param Mask The input mask for the shuffle. If the best candidate is found
6658  /// during looking-through-shuffles attempt, it is updated accordingly.
6659  /// \param SinglePermute true if the shuffle operation is originally a
6660  /// single-value-permutation. In this case the look-through-shuffles procedure
6661  /// may look for resizing shuffles as the best candidates.
  /// \return true if the shuffle results in the non-resizing identity shuffle
  /// (and thus can be ignored), false otherwise.
6664  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
6665                                  bool SinglePermute) {
6666    Value *Op = V;
6667    ShuffleVectorInst *IdentityOp = nullptr;
6668    SmallVector<int> IdentityMask;
6669    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if this is not a fixed vector type or a size-changing shuffle.
6671      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
6672      if (!SVTy)
6673        break;
6674      // Remember the identity or broadcast mask, if it is not a resizing
6675      // shuffle. If no better candidates are found, this Op and Mask will be
6676      // used in the final shuffle.
6677      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
6678        if (!IdentityOp || !SinglePermute ||
6679            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
6680             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
6681                                                    IdentityMask.size()))) {
6682          IdentityOp = SV;
          // Store the current mask in IdentityMask so that we do not lose
          // this info later if IdentityOp is selected as the best candidate
          // for the permutation.
6686          IdentityMask.assign(Mask);
6687        }
6688      }
6689      // Remember the broadcast mask. If no better candidates are found, this Op
6690      // and Mask will be used in the final shuffle.
      // A zero splat can be used as an identity too, since it might be used
      // with mask <0, 1, 2, ...>, i.e. an identity mask without extra
      // reshuffling.
      // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>,
      // which is expensive, and the analysis finds out that the source vector
      // is just a broadcast, this original mask can be transformed to the
      // identity mask <0, 1, 2, 3>.
6697      // \code
6698      // %0 = shuffle %v, poison, zeroinitalizer
6699      // %res = shuffle %0, poison, <3, 1, 2, 0>
6700      // \endcode
6701      // may be transformed to
6702      // \code
6703      // %0 = shuffle %v, poison, zeroinitalizer
6704      // %res = shuffle %0, poison, <0, 1, 2, 3>
6705      // \endcode
6706      if (SV->isZeroEltSplat()) {
6707        IdentityOp = SV;
6708        IdentityMask.assign(Mask);
6709      }
6710      int LocalVF = Mask.size();
6711      if (auto *SVOpTy =
6712              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
6713        LocalVF = SVOpTy->getNumElements();
6714      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
6715      for (auto [Idx, I] : enumerate(Mask)) {
6716        if (I == PoisonMaskElem ||
6717            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
6718          continue;
6719        ExtMask[Idx] = SV->getMaskValue(I);
6720      }
6721      bool IsOp1Undef =
6722          isUndefVector(SV->getOperand(0),
6723                        buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
6724              .all();
6725      bool IsOp2Undef =
6726          isUndefVector(SV->getOperand(1),
6727                        buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
6728              .all();
6729      if (!IsOp1Undef && !IsOp2Undef) {
6730        // Update mask and mark undef elems.
6731        for (int &I : Mask) {
6732          if (I == PoisonMaskElem)
6733            continue;
6734          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
6735              PoisonMaskElem)
6736            I = PoisonMaskElem;
6737        }
6738        break;
6739      }
6740      SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
6741                                   SV->getShuffleMask().end());
6742      combineMasks(LocalVF, ShuffleMask, Mask);
6743      Mask.swap(ShuffleMask);
6744      if (IsOp2Undef)
6745        Op = SV->getOperand(0);
6746      else
6747        Op = SV->getOperand(1);
6748    }
6749    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
6750        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
6751        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
6752      if (IdentityOp) {
6753        V = IdentityOp;
6754        assert(Mask.size() == IdentityMask.size() &&
6755               "Expected masks of same sizes.");
6756        // Clear known poison elements.
6757        for (auto [I, Idx] : enumerate(Mask))
6758          if (Idx == PoisonMaskElem)
6759            IdentityMask[I] = PoisonMaskElem;
6760        Mask.swap(IdentityMask);
6761        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
6762        return SinglePermute &&
6763               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
6764                               /*IsStrict=*/true) ||
6765                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
6766                 Shuffle->isZeroEltSplat() &&
6767                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
6768      }
6769      V = Op;
6770      return false;
6771    }
6772    V = Op;
6773    return true;
6774  }
6775
  /// Smart shuffle instruction emission, walks through shuffle trees and
6777  /// tries to find the best matching vector for the actual shuffle
6778  /// instruction.
6779  template <typename T, typename ShuffleBuilderTy>
6780  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
6781                         ShuffleBuilderTy &Builder) {
6782    assert(V1 && "Expected at least one vector value.");
6783    if (V2)
6784      Builder.resizeToMatch(V1, V2);
6785    int VF = Mask.size();
6786    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
6787      VF = FTy->getNumElements();
6788    if (V2 &&
6789        !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
6790      // Peek through shuffles.
6791      Value *Op1 = V1;
6792      Value *Op2 = V2;
6793      int VF =
6794          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
6795      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
6796      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
6797      for (int I = 0, E = Mask.size(); I < E; ++I) {
6798        if (Mask[I] < VF)
6799          CombinedMask1[I] = Mask[I];
6800        else
6801          CombinedMask2[I] = Mask[I] - VF;
6802      }
6803      Value *PrevOp1;
6804      Value *PrevOp2;
6805      do {
6806        PrevOp1 = Op1;
6807        PrevOp2 = Op2;
6808        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
6809        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
6810        // Check if we have 2 resizing shuffles - need to peek through operands
6811        // again.
6812        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
6813          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
6814            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
6815            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
6819            }
6820            SmallBitVector UseMask1 = buildUseMask(
6821                cast<FixedVectorType>(SV1->getOperand(1)->getType())
6822                    ->getNumElements(),
6823                ExtMask1, UseMask::SecondArg);
6824            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
6825            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
6829            }
6830            SmallBitVector UseMask2 = buildUseMask(
6831                cast<FixedVectorType>(SV2->getOperand(1)->getType())
6832                    ->getNumElements(),
6833                ExtMask2, UseMask::SecondArg);
6834            if (SV1->getOperand(0)->getType() ==
6835                    SV2->getOperand(0)->getType() &&
6836                SV1->getOperand(0)->getType() != SV1->getType() &&
6837                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
6838                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
6839              Op1 = SV1->getOperand(0);
6840              Op2 = SV2->getOperand(0);
6841              SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
6842                                            SV1->getShuffleMask().end());
6843              int LocalVF = ShuffleMask1.size();
6844              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
6845                LocalVF = FTy->getNumElements();
6846              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
6847              CombinedMask1.swap(ShuffleMask1);
6848              SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
6849                                            SV2->getShuffleMask().end());
6850              LocalVF = ShuffleMask2.size();
6851              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
6852                LocalVF = FTy->getNumElements();
6853              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
6854              CombinedMask2.swap(ShuffleMask2);
6855            }
6856          }
6857      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
6858      Builder.resizeToMatch(Op1, Op2);
6859      VF = std::max(cast<VectorType>(Op1->getType())
6860                        ->getElementCount()
6861                        .getKnownMinValue(),
6862                    cast<VectorType>(Op2->getType())
6863                        ->getElementCount()
6864                        .getKnownMinValue());
6865      for (int I = 0, E = Mask.size(); I < E; ++I) {
6866        if (CombinedMask2[I] != PoisonMaskElem) {
6867          assert(CombinedMask1[I] == PoisonMaskElem &&
6868                 "Expected undefined mask element");
6869          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
6870        }
6871      }
6872      if (Op1 == Op2 &&
6873          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
6874           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
6875            isa<ShuffleVectorInst>(Op1) &&
6876            cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
6877                ArrayRef(CombinedMask1))))
6878        return Builder.createIdentity(Op1);
6879      return Builder.createShuffleVector(
6880          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
6881          CombinedMask1);
6882    }
6883    if (isa<PoisonValue>(V1))
6884      return Builder.createPoison(
6885          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
6886    SmallVector<int> NewMask(Mask.begin(), Mask.end());
6887    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
6888    assert(V1 && "Expected non-null value after looking through shuffles.");
6889
6890    if (!IsIdentity)
6891      return Builder.createShuffleVector(V1, NewMask);
6892    return Builder.createIdentity(V1);
6893  }
6894};
6895} // namespace
6896
/// Returns the cost of the shuffle instructions with the given \p Kind, vector
/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
/// insert subvector pattern.
6900static InstructionCost
6901getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6902               VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
6903               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6904               int Index = 0, VectorType *SubTp = nullptr,
6905               ArrayRef<const Value *> Args = std::nullopt) {
6906  if (Kind != TTI::SK_PermuteTwoSrc)
6907    return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
6908  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6909  int NumSubElts;
6910  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
6911                             Mask, NumSrcElts, NumSubElts, Index)) {
6912    if (Index + NumSubElts > NumSrcElts &&
6913        Index + NumSrcElts <= static_cast<int>(Mask.size()))
6914      return TTI.getShuffleCost(
6915          TTI::SK_InsertSubvector,
6916          FixedVectorType::get(Tp->getElementType(), Mask.size()), std::nullopt,
6917          TTI::TCK_RecipThroughput, Index, Tp);
6918  }
6919  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
6920}
6921
/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// It supports shuffling of 2 input vectors. It implements lazy shuffle
/// emission: the actual shuffle instruction is generated only if it is really
/// required. Otherwise, the shuffle instruction emission is delayed until the
/// end of the process, to reduce the number of emitted instructions and
/// further analysis/transformations.
6928class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
6929  bool IsFinalized = false;
6930  SmallVector<int> CommonMask;
6931  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
6932  const TargetTransformInfo &TTI;
6933  InstructionCost Cost = 0;
6934  SmallDenseSet<Value *> VectorizedVals;
6935  BoUpSLP &R;
6936  SmallPtrSetImpl<Value *> &CheckedExtracts;
6937  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, we are still trying to estimate the cost for the same nodes
  /// and can delay the actual cost estimation (virtual shuffle instruction
  /// emission). This may help to better estimate the cost if the same nodes
  /// must be permuted and allows moving most of the long-shuffle cost
  /// estimation to TTI.
6942  bool SameNodesEstimated = true;
6943
6944  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
6945    if (Ty->getScalarType()->isPointerTy()) {
6946      Constant *Res = ConstantExpr::getIntToPtr(
6947          ConstantInt::getAllOnesValue(
6948              IntegerType::get(Ty->getContext(),
6949                               DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
6950          Ty->getScalarType());
6951      if (auto *VTy = dyn_cast<VectorType>(Ty))
6952        Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
6953      return Res;
6954    }
6955    return Constant::getAllOnesValue(Ty);
6956  }
6957
6958  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
6959    if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
6960      return TTI::TCC_Free;
6961    auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
6962    InstructionCost GatherCost = 0;
6963    SmallVector<Value *> Gathers(VL.begin(), VL.end());
6964    // Improve gather cost for gather of loads, if we can group some of the
6965    // loads into vector loads.
6966    InstructionsState S = getSameOpcode(VL, *R.TLI);
6967    const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
6968    unsigned MinVF = R.getMinVF(2 * Sz);
6969    if (VL.size() > 2 &&
6970        ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
6971         (InVectors.empty() &&
6972          any_of(seq<unsigned>(0, VL.size() / MinVF),
6973                 [&](unsigned Idx) {
6974                   ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
6975                   InstructionsState S = getSameOpcode(SubVL, *R.TLI);
6976                   return S.getOpcode() == Instruction::Load &&
6977                          !S.isAltShuffle();
6978                 }))) &&
6979        !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
6980        !isSplat(Gathers)) {
6981      SetVector<Value *> VectorizedLoads;
6982      SmallVector<LoadInst *> VectorizedStarts;
6983      SmallVector<std::pair<unsigned, unsigned>> ScatterVectorized;
6984      unsigned StartIdx = 0;
6985      unsigned VF = VL.size() / 2;
6986      for (; VF >= MinVF; VF /= 2) {
6987        for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
6988             Cnt += VF) {
6989          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
6990          if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
6991            InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
6992            if (SliceS.getOpcode() != Instruction::Load ||
6993                SliceS.isAltShuffle())
6994              continue;
6995          }
6996          if (!VectorizedLoads.count(Slice.front()) &&
6997              !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
6998            SmallVector<Value *> PointerOps;
6999            OrdersType CurrentOrder;
7000            LoadsState LS =
7001                canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE,
7002                                  *R.LI, *R.TLI, CurrentOrder, PointerOps);
7003            switch (LS) {
7004            case LoadsState::Vectorize:
7005            case LoadsState::ScatterVectorize:
7006            case LoadsState::PossibleStridedVectorize:
7007              // Mark the vectorized loads so that we don't vectorize them
7008              // again.
7009              // TODO: better handling of loads with reorders.
7010              if (LS == LoadsState::Vectorize && CurrentOrder.empty())
7011                VectorizedStarts.push_back(cast<LoadInst>(Slice.front()));
7012              else
7013                ScatterVectorized.emplace_back(Cnt, VF);
7014              VectorizedLoads.insert(Slice.begin(), Slice.end());
7015              // If we vectorized initial block, no need to try to vectorize
7016              // it again.
7017              if (Cnt == StartIdx)
7018                StartIdx += VF;
7019              break;
7020            case LoadsState::Gather:
7021              break;
7022            }
7023          }
7024        }
7025        // Check if the whole array was vectorized already - exit.
7026        if (StartIdx >= VL.size())
7027          break;
7028        // Found vectorizable parts - exit.
7029        if (!VectorizedLoads.empty())
7030          break;
7031      }
7032      if (!VectorizedLoads.empty()) {
7033        unsigned NumParts = TTI.getNumberOfParts(VecTy);
7034        bool NeedInsertSubvectorAnalysis =
7035            !NumParts || (VL.size() / VF) > NumParts;
7036        // Get the cost for gathered loads.
7037        for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
7038          if (VectorizedLoads.contains(VL[I]))
7039            continue;
7040          GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
7041        }
7042        // Exclude potentially vectorized loads from list of gathered
7043        // scalars.
7044        Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
7045        // The cost for vectorized loads.
7046        InstructionCost ScalarsCost = 0;
7047        for (Value *V : VectorizedLoads) {
7048          auto *LI = cast<LoadInst>(V);
7049          ScalarsCost +=
7050              TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
7051                                  LI->getAlign(), LI->getPointerAddressSpace(),
7052                                  CostKind, TTI::OperandValueInfo(), LI);
7053        }
7054        auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
7055        for (LoadInst *LI : VectorizedStarts) {
7056          Align Alignment = LI->getAlign();
7057          GatherCost +=
7058              TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
7059                                  LI->getPointerAddressSpace(), CostKind,
7060                                  TTI::OperandValueInfo(), LI);
7061        }
7062        for (std::pair<unsigned, unsigned> P : ScatterVectorized) {
7063          auto *LI0 = cast<LoadInst>(VL[P.first]);
7064          Align CommonAlignment = LI0->getAlign();
7065          for (Value *V : VL.slice(P.first + 1, VF - 1))
7066            CommonAlignment =
7067                std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
7068          GatherCost += TTI.getGatherScatterOpCost(
7069              Instruction::Load, LoadTy, LI0->getPointerOperand(),
7070              /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
7071        }
7072        if (NeedInsertSubvectorAnalysis) {
7073          // Add the cost for the subvectors insert.
7074          for (int I = VF, E = VL.size(); I < E; I += VF)
7075            GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
7076                                             std::nullopt, CostKind, I, LoadTy);
7077        }
7078        GatherCost -= ScalarsCost;
7079      }
7080    } else if (!Root && isSplat(VL)) {
      // Found a broadcast of a single scalar; calculate the cost as a
      // broadcast.
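      // For example (illustrative): VL = {%x, %x, %x, %x} costs a single
      // insertelement plus an SK_Broadcast shuffle, while
      // VL = {%x, undef, undef, undef} costs just the single insertelement.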
7083      const auto *It =
7084          find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
7085      assert(It != VL.end() && "Expected at least one non-undef value.");
7086      // Add broadcast for non-identity shuffle only.
7087      bool NeedShuffle =
7088          count(VL, *It) > 1 &&
7089          (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
7090      InstructionCost InsertCost = TTI.getVectorInstrCost(
7091          Instruction::InsertElement, VecTy, CostKind,
7092          NeedShuffle ? 0 : std::distance(VL.begin(), It),
7093          PoisonValue::get(VecTy), *It);
7094      return InsertCost +
7095             (NeedShuffle ? TTI.getShuffleCost(
7096                                TargetTransformInfo::SK_Broadcast, VecTy,
7097                                /*Mask=*/std::nullopt, CostKind, /*Index=*/0,
7098                                /*SubTp=*/nullptr, /*Args=*/*It)
7099                          : TTI::TCC_Free);
7100    }
7101    return GatherCost +
7102           (all_of(Gathers, UndefValue::classof)
7103                ? TTI::TCC_Free
7104                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
7105  };
7106
7107  /// Compute the cost of creating a vector containing the extracted values from
7108  /// \p VL.
7109  InstructionCost
7110  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
7111                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
7112                     unsigned NumParts) {
7113    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
7114    unsigned NumElts =
7115        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
7116          auto *EE = dyn_cast<ExtractElementInst>(V);
7117          if (!EE)
7118            return Sz;
7119          auto *VecTy = cast<FixedVectorType>(EE->getVectorOperandType());
7120          return std::max(Sz, VecTy->getNumElements());
7121        });
7122    unsigned NumSrcRegs = TTI.getNumberOfParts(
7123        FixedVectorType::get(VL.front()->getType(), NumElts));
7124    if (NumSrcRegs == 0)
7125      NumSrcRegs = 1;
7126    // FIXME: this must be moved to TTI for better estimation.
7127    unsigned EltsPerVector = PowerOf2Ceil(std::max(
7128        divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
7129    auto CheckPerRegistersShuffle =
7130        [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
7131      DenseSet<int> RegIndices;
      // Check if we are trying to permute the same single vector or 2 input
      // vectors.
7133      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
7134      int FirstRegId = -1;
7135      for (int &I : Mask) {
7136        if (I == PoisonMaskElem)
7137          continue;
7138        int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
7139        if (FirstRegId < 0)
7140          FirstRegId = RegId;
7141        RegIndices.insert(RegId);
7142        if (RegIndices.size() > 2)
7143          return std::nullopt;
7144        if (RegIndices.size() == 2)
7145          ShuffleKind = TTI::SK_PermuteTwoSrc;
7146        I = (I % NumElts) % EltsPerVector +
7147            (RegId == FirstRegId ? 0 : EltsPerVector);
7148      }
7149      return ShuffleKind;
7150    };
7151    InstructionCost Cost = 0;
7152
7153    // Process extracts in blocks of EltsPerVector to check if the source vector
7154    // operand can be re-used directly. If not, add the cost of creating a
7155    // shuffle to extract the values into a vector register.
7156    for (unsigned Part = 0; Part < NumParts; ++Part) {
7157      if (!ShuffleKinds[Part])
7158        continue;
7159      ArrayRef<int> MaskSlice =
7160          Mask.slice(Part * EltsPerVector,
7161                     (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
7162                         ? Mask.size() % EltsPerVector
7163                         : EltsPerVector);
7164      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
7165      copy(MaskSlice, SubMask.begin());
7166      std::optional<TTI::ShuffleKind> RegShuffleKind =
7167          CheckPerRegistersShuffle(SubMask);
7168      if (!RegShuffleKind) {
7169        Cost += ::getShuffleCost(
7170            TTI, *ShuffleKinds[Part],
7171            FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
7172        continue;
7173      }
7174      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
7175          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
7176        Cost += ::getShuffleCost(
7177            TTI, *RegShuffleKind,
7178            FixedVectorType::get(VL.front()->getType(), EltsPerVector),
7179            SubMask);
7180      }
7181    }
7182    return Cost;
7183  }
  /// Transforms mask \p CommonMask per the given \p Mask to make it a proper
  /// set after the shuffle emission.
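  /// For example, after emitting a shuffle with Mask <3, poison, 1, poison>,
  /// a common mask of <3, poison, 1, poison> becomes <0, poison, 2, poison>:
  /// the already shuffled lanes now refer to their own positions in the
  /// just emitted vector.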
7186  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
7187                                        ArrayRef<int> Mask) {
7188    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
7189      if (Mask[Idx] != PoisonMaskElem)
7190        CommonMask[Idx] = Idx;
7191  }
  /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the
  /// given mask \p Mask and register number \p Part, which includes
  /// \p SliceSize elements.
7195  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
7196                                ArrayRef<int> Mask, unsigned Part,
7197                                unsigned SliceSize) {
7198    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are being reshuffled.
      // If we already requested the cost of reshuffling E1 and E2 before,
      // there is no need to estimate another cost with the sub-Mask; instead,
      // include this sub-Mask into the CommonMask to estimate it later and
      // avoid double cost estimation.
7204      if ((InVectors.size() == 2 &&
7205           InVectors.front().get<const TreeEntry *>() == &E1 &&
7206           InVectors.back().get<const TreeEntry *>() == E2) ||
7207          (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
7208        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
7209                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
7210               "Expected all poisoned elements.");
7211        ArrayRef<int> SubMask =
7212            ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
7213        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
7214        return;
7215      }
      // Found non-matching nodes - need to estimate the cost for the already
      // matched nodes and transform the mask.
7218      Cost += createShuffle(InVectors.front(),
7219                            InVectors.size() == 1 ? nullptr : InVectors.back(),
7220                            CommonMask);
7221      transformMaskAfterShuffle(CommonMask, CommonMask);
7222    }
7223    SameNodesEstimated = false;
7224    Cost += createShuffle(&E1, E2, Mask);
7225    transformMaskAfterShuffle(CommonMask, Mask);
7226  }
7227
7228  class ShuffleCostBuilder {
7229    const TargetTransformInfo &TTI;
7230
7231    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
7232      int Index = -1;
7233      return Mask.empty() ||
7234             (VF == Mask.size() &&
7235              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
7236             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7237              Index == 0);
7238    }
7239
7240  public:
7241    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
7242    ~ShuffleCostBuilder() = default;
7243    InstructionCost createShuffleVector(Value *V1, Value *,
7244                                        ArrayRef<int> Mask) const {
      // An empty mask or an identity mask is free.
7246      unsigned VF =
7247          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7248      if (isEmptyOrIdentity(Mask, VF))
7249        return TTI::TCC_Free;
7250      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
7251                              cast<VectorType>(V1->getType()), Mask);
7252    }
7253    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // An empty mask or an identity mask is free.
7255      unsigned VF =
7256          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7257      if (isEmptyOrIdentity(Mask, VF))
7258        return TTI::TCC_Free;
7259      return TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
7260                                cast<VectorType>(V1->getType()), Mask);
7261    }
7262    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
7263    InstructionCost createPoison(Type *Ty, unsigned VF) const {
7264      return TTI::TCC_Free;
7265    }
7266    void resizeToMatch(Value *&, Value *&) const {}
7267  };
7268
  /// Smart shuffle instruction emission, walks through shuffle trees and
7270  /// tries to find the best matching vector for the actual shuffle
7271  /// instruction.
7272  InstructionCost
7273  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
7274                const PointerUnion<Value *, const TreeEntry *> &P2,
7275                ArrayRef<int> Mask) {
7276    ShuffleCostBuilder Builder(TTI);
7277    SmallVector<int> CommonMask(Mask.begin(), Mask.end());
7278    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
7279    unsigned CommonVF = Mask.size();
7280    if (!V1 && !V2 && !P2.isNull()) {
7281      // Shuffle 2 entry nodes.
7282      const TreeEntry *E = P1.get<const TreeEntry *>();
7283      unsigned VF = E->getVectorFactor();
7284      const TreeEntry *E2 = P2.get<const TreeEntry *>();
7285      CommonVF = std::max(VF, E2->getVectorFactor());
7286      assert(all_of(Mask,
7287                    [=](int Idx) {
7288                      return Idx < 2 * static_cast<int>(CommonVF);
7289                    }) &&
7290             "All elements in mask must be less than 2 * CommonVF.");
7291      if (E->Scalars.size() == E2->Scalars.size()) {
7292        SmallVector<int> EMask = E->getCommonMask();
7293        SmallVector<int> E2Mask = E2->getCommonMask();
7294        if (!EMask.empty() || !E2Mask.empty()) {
7295          for (int &Idx : CommonMask) {
7296            if (Idx == PoisonMaskElem)
7297              continue;
7298            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
7299              Idx = EMask[Idx];
7300            else if (Idx >= static_cast<int>(CommonVF))
7301              Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
7302                    E->Scalars.size();
7303          }
7304        }
7305        CommonVF = E->Scalars.size();
7306      }
7307      V1 = Constant::getNullValue(
7308          FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
7309      V2 = getAllOnesValue(
7310          *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
7311    } else if (!V1 && P2.isNull()) {
7312      // Shuffle single entry node.
7313      const TreeEntry *E = P1.get<const TreeEntry *>();
7314      unsigned VF = E->getVectorFactor();
7315      CommonVF = VF;
7316      assert(
7317          all_of(Mask,
7318                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
7319          "All elements in mask must be less than CommonVF.");
7320      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
7321        SmallVector<int> EMask = E->getCommonMask();
7322        assert(!EMask.empty() && "Expected non-empty common mask.");
7323        for (int &Idx : CommonMask) {
7324          if (Idx != PoisonMaskElem)
7325            Idx = EMask[Idx];
7326        }
7327        CommonVF = E->Scalars.size();
7328      }
7329      V1 = Constant::getNullValue(
7330          FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
7331    } else if (V1 && P2.isNull()) {
7332      // Shuffle single vector.
7333      CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
7334      assert(
7335          all_of(Mask,
7336                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
7337          "All elements in mask must be less than CommonVF.");
7338    } else if (V1 && !V2) {
7339      // Shuffle vector and tree node.
7340      unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
7341      const TreeEntry *E2 = P2.get<const TreeEntry *>();
7342      CommonVF = std::max(VF, E2->getVectorFactor());
7343      assert(all_of(Mask,
7344                    [=](int Idx) {
7345                      return Idx < 2 * static_cast<int>(CommonVF);
7346                    }) &&
7347             "All elements in mask must be less than 2 * CommonVF.");
7348      if (E2->Scalars.size() == VF && VF != CommonVF) {
7349        SmallVector<int> E2Mask = E2->getCommonMask();
7350        assert(!E2Mask.empty() && "Expected non-empty common mask.");
7351        for (int &Idx : CommonMask) {
7352          if (Idx == PoisonMaskElem)
7353            continue;
7354          if (Idx >= static_cast<int>(CommonVF))
7355            Idx = E2Mask[Idx - CommonVF] + VF;
7356        }
7357        CommonVF = VF;
7358      }
7359      V1 = Constant::getNullValue(
7360          FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
7361      V2 = getAllOnesValue(
7362          *R.DL,
7363          FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
7364    } else if (!V1 && V2) {
7365      // Shuffle vector and tree node.
7366      unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
7367      const TreeEntry *E1 = P1.get<const TreeEntry *>();
7368      CommonVF = std::max(VF, E1->getVectorFactor());
7369      assert(all_of(Mask,
7370                    [=](int Idx) {
7371                      return Idx < 2 * static_cast<int>(CommonVF);
7372                    }) &&
7373             "All elements in mask must be less than 2 * CommonVF.");
7374      if (E1->Scalars.size() == VF && VF != CommonVF) {
7375        SmallVector<int> E1Mask = E1->getCommonMask();
7376        assert(!E1Mask.empty() && "Expected non-empty common mask.");
7377        for (int &Idx : CommonMask) {
7378          if (Idx == PoisonMaskElem)
7379            continue;
7380          if (Idx >= static_cast<int>(CommonVF))
7381            Idx = E1Mask[Idx - CommonVF] + VF;
7382          else
7383            Idx = E1Mask[Idx];
7384        }
7385        CommonVF = VF;
7386      }
7387      V1 = Constant::getNullValue(
7388          FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
7389      V2 = getAllOnesValue(
7390          *R.DL,
7391          FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
7392    } else {
7393      assert(V1 && V2 && "Expected both vectors.");
7394      unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
7395      CommonVF =
7396          std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
7397      assert(all_of(Mask,
7398                    [=](int Idx) {
7399                      return Idx < 2 * static_cast<int>(CommonVF);
7400                    }) &&
7401             "All elements in mask must be less than 2 * CommonVF.");
7402      if (V1->getType() != V2->getType()) {
7403        V1 = Constant::getNullValue(FixedVectorType::get(
7404            cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
7405        V2 = getAllOnesValue(
7406            *R.DL, FixedVectorType::get(
7407                       cast<FixedVectorType>(V1->getType())->getElementType(),
7408                       CommonVF));
7409      }
7410    }
7411    InVectors.front() = Constant::getNullValue(FixedVectorType::get(
7412        cast<FixedVectorType>(V1->getType())->getElementType(),
7413        CommonMask.size()));
7414    if (InVectors.size() == 2)
7415      InVectors.pop_back();
7416    return BaseShuffleAnalysis::createShuffle<InstructionCost>(
7417        V1, V2, CommonMask, Builder);
7418  }
7419
7420public:
7421  ShuffleCostEstimator(TargetTransformInfo &TTI,
7422                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
7423                       SmallPtrSetImpl<Value *> &CheckedExtracts)
7424      : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
7425        R(R), CheckedExtracts(CheckedExtracts) {}
7426  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
7427                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
7428                        unsigned NumParts, bool &UseVecBaseAsInput) {
7429    UseVecBaseAsInput = false;
7430    if (Mask.empty())
7431      return nullptr;
7432    Value *VecBase = nullptr;
7433    ArrayRef<Value *> VL = E->Scalars;
7434    // If the resulting type is scalarized, do not adjust the cost.
7435    if (NumParts == VL.size())
7436      return nullptr;
    // Check if the extracts can be considered as reused if the same
    // extractelements were vectorized already.
7439    bool PrevNodeFound = any_of(
7440        ArrayRef(R.VectorizableTree).take_front(E->Idx),
7441        [&](const std::unique_ptr<TreeEntry> &TE) {
7442          return ((!TE->isAltShuffle() &&
7443                   TE->getOpcode() == Instruction::ExtractElement) ||
7444                  TE->State == TreeEntry::NeedToGather) &&
7445                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
7446                   return VL.size() > Data.index() &&
7447                          (Mask[Data.index()] == PoisonMaskElem ||
7448                           isa<UndefValue>(VL[Data.index()]) ||
7449                           Data.value() == VL[Data.index()]);
7450                 });
7451        });
7452    SmallPtrSet<Value *, 4> UniqueBases;
7453    unsigned SliceSize = VL.size() / NumParts;
7454    for (unsigned Part = 0; Part < NumParts; ++Part) {
7455      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
7456      for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
7457        // Ignore non-extractelement scalars.
7458        if (isa<UndefValue>(V) ||
7459            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
7460          continue;
7461        // If all users of instruction are going to be vectorized and this
7462        // instruction itself is not going to be vectorized, consider this
7463        // instruction as dead and remove its cost from the final cost of the
7464        // vectorized tree.
7465        // Also, avoid adjusting the cost for extractelements with multiple uses
7466        // in different graph entries.
7467        auto *EE = cast<ExtractElementInst>(V);
7468        VecBase = EE->getVectorOperand();
7469        UniqueBases.insert(VecBase);
7470        const TreeEntry *VE = R.getTreeEntry(V);
7471        if (!CheckedExtracts.insert(V).second ||
7472            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
7473            (VE && VE != E))
7474          continue;
7475        std::optional<unsigned> EEIdx = getExtractIndex(EE);
7476        if (!EEIdx)
7477          continue;
7478        unsigned Idx = *EEIdx;
7479        // Take credit for instruction that will become dead.
7480        if (EE->hasOneUse() || !PrevNodeFound) {
7481          Instruction *Ext = EE->user_back();
7482          if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
7483                return isa<GetElementPtrInst>(U);
7484              })) {
7485            // Use getExtractWithExtendCost() to calculate the cost of
7486            // extractelement/ext pair.
7487            Cost -=
7488                TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
7489                                             EE->getVectorOperandType(), Idx);
7490            // Add back the cost of s|zext which is subtracted separately.
7491            Cost += TTI.getCastInstrCost(
7492                Ext->getOpcode(), Ext->getType(), EE->getType(),
7493                TTI::getCastContextHint(Ext), CostKind, Ext);
7494            continue;
7495          }
7496        }
7497        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
7498                                       CostKind, Idx);
7499      }
7500    }
    // Check that the gather of extractelements can be represented as just a
    // shuffle of a single vector or of 2 vectors from which the scalars are
    // extracted.
    // Found the bunch of extractelement instructions that must be gathered
    // into a vector and can be represented as a permutation of elements in a
    // single input vector or in 2 input vectors.
    // The cost is not added if the same extractelements were vectorized
    // already in a previous node (reused).
7507    if (!PrevNodeFound)
7508      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
7509    InVectors.assign(1, E);
7510    CommonMask.assign(Mask.begin(), Mask.end());
7511    transformMaskAfterShuffle(CommonMask, CommonMask);
7512    SameNodesEstimated = false;
7513    if (NumParts != 1 && UniqueBases.size() != 1) {
7514      UseVecBaseAsInput = true;
7515      VecBase = Constant::getNullValue(
7516          FixedVectorType::get(VL.front()->getType(), CommonMask.size()));
7517    }
7518    return VecBase;
7519  }
7520  /// Checks if the specified entry \p E needs to be delayed because of its
7521  /// dependency nodes.
7522  std::optional<InstructionCost>
7523  needToDelay(const TreeEntry *,
7524              ArrayRef<SmallVector<const TreeEntry *>>) const {
7525    // No need to delay the cost estimation during analysis.
7526    return std::nullopt;
7527  }
7528  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
7529    if (&E1 == &E2) {
7530      assert(all_of(Mask,
7531                    [&](int Idx) {
7532                      return Idx < static_cast<int>(E1.getVectorFactor());
7533                    }) &&
7534             "Expected single vector shuffle mask.");
7535      add(E1, Mask);
7536      return;
7537    }
7538    if (InVectors.empty()) {
7539      CommonMask.assign(Mask.begin(), Mask.end());
7540      InVectors.assign({&E1, &E2});
7541      return;
7542    }
7543    assert(!CommonMask.empty() && "Expected non-empty common mask.");
7544    auto *MaskVecTy =
7545        FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
7546    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
7547    if (NumParts == 0 || NumParts >= Mask.size())
7548      NumParts = 1;
7549    unsigned SliceSize = Mask.size() / NumParts;
7550    const auto *It =
7551        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
7552    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
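    // E.g., an 8-element mask with NumParts = 2 has SliceSize = 4; if its
    // first defined element is at index 5, this selects Part = 1.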
7553    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
7554  }
7555  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
7556    if (InVectors.empty()) {
7557      CommonMask.assign(Mask.begin(), Mask.end());
7558      InVectors.assign(1, &E1);
7559      return;
7560    }
7561    assert(!CommonMask.empty() && "Expected non-empty common mask.");
7562    auto *MaskVecTy =
7563        FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
7564    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
7565    if (NumParts == 0 || NumParts >= Mask.size())
7566      NumParts = 1;
7567    unsigned SliceSize = Mask.size() / NumParts;
7568    const auto *It =
7569        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
7570    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
7571    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
7572    if (!SameNodesEstimated && InVectors.size() == 1)
7573      InVectors.emplace_back(&E1);
7574  }
7575  /// Adds 2 input vectors and the mask for their shuffling.
7576  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // This may be reached only when shuffling 2 vectors of extractelements,
    // which were already handled in adjustExtracts.
7579    assert(InVectors.size() == 1 &&
7580           all_of(enumerate(CommonMask),
7581                  [&](auto P) {
7582                    if (P.value() == PoisonMaskElem)
7583                      return Mask[P.index()] == PoisonMaskElem;
7584                    auto *EI =
7585                        cast<ExtractElementInst>(InVectors.front()
7586                                                     .get<const TreeEntry *>()
7587                                                     ->Scalars[P.index()]);
7588                    return EI->getVectorOperand() == V1 ||
7589                           EI->getVectorOperand() == V2;
7590                  }) &&
7591           "Expected extractelement vectors.");
7592  }
  /// Adds another input vector and the mask for the shuffling.
7594  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
7595    if (InVectors.empty()) {
7596      assert(CommonMask.empty() && !ForExtracts &&
7597             "Expected empty input mask/vectors.");
7598      CommonMask.assign(Mask.begin(), Mask.end());
7599      InVectors.assign(1, V1);
7600      return;
7601    }
7602    if (ForExtracts) {
      // No need to add the vectors here; they were already handled in
      // adjustExtracts.
7604      assert(InVectors.size() == 1 &&
7605             InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
7606             all_of(enumerate(CommonMask),
7607                    [&](auto P) {
7608                      Value *Scalar = InVectors.front()
7609                                          .get<const TreeEntry *>()
7610                                          ->Scalars[P.index()];
7611                      if (P.value() == PoisonMaskElem)
7612                        return P.value() == Mask[P.index()] ||
7613                               isa<UndefValue>(Scalar);
7614                      if (isa<Constant>(V1))
7615                        return true;
7616                      auto *EI = cast<ExtractElementInst>(Scalar);
7617                      return EI->getVectorOperand() == V1;
7618                    }) &&
7619             "Expected only tree entry for extractelement vectors.");
7620      return;
7621    }
7622    assert(!InVectors.empty() && !CommonMask.empty() &&
7623           "Expected only tree entries from extracts/reused buildvectors.");
7624    unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
7625    if (InVectors.size() == 2) {
7626      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
7627      transformMaskAfterShuffle(CommonMask, CommonMask);
7628      VF = std::max<unsigned>(VF, CommonMask.size());
7629    } else if (const auto *InTE =
7630                   InVectors.front().dyn_cast<const TreeEntry *>()) {
7631      VF = std::max(VF, InTE->getVectorFactor());
7632    } else {
7633      VF = std::max(
7634          VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
7635                  ->getNumElements());
7636    }
7637    InVectors.push_back(V1);
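    // Elements coming from the newly added vector are referenced in the
    // combined mask with their indices offset by VF, i.e. as the second
    // shuffle source.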
7638    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
7639      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
7640        CommonMask[Idx] = Mask[Idx] + VF;
7641  }
7642  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
7643                Value *Root = nullptr) {
7644    Cost += getBuildVectorCost(VL, Root);
7645    if (!Root) {
7646      // FIXME: Need to find a way to avoid use of getNullValue here.
7647      SmallVector<Constant *> Vals;
7648      unsigned VF = VL.size();
7649      if (MaskVF != 0)
7650        VF = std::min(VF, MaskVF);
7651      for (Value *V : VL.take_front(VF)) {
7652        if (isa<UndefValue>(V)) {
7653          Vals.push_back(cast<Constant>(V));
7654          continue;
7655        }
7656        Vals.push_back(Constant::getNullValue(V->getType()));
7657      }
7658      return ConstantVector::get(Vals);
7659    }
7660    return ConstantVector::getSplat(
7661        ElementCount::getFixed(
7662            cast<FixedVectorType>(Root->getType())->getNumElements()),
7663        getAllOnesValue(*R.DL, VL.front()->getType()));
7664  }
7665  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
7666  /// Finalize emission of the shuffles.
7667  InstructionCost
7668  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
7669           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
7670    IsFinalized = true;
7671    if (Action) {
7672      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
7673      if (InVectors.size() == 2)
7674        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
7675      else
7676        Cost += createShuffle(Vec, nullptr, CommonMask);
7677      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
7678        if (CommonMask[Idx] != PoisonMaskElem)
7679          CommonMask[Idx] = Idx;
7680      assert(VF > 0 &&
7681             "Expected vector length for the final value before action.");
7682      Value *V = Vec.get<Value *>();
7683      Action(V, CommonMask);
7684      InVectors.front() = V;
7685    }
7686    ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
7687    if (CommonMask.empty()) {
7688      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
7689      return Cost;
7690    }
7691    return Cost +
7692           createShuffle(InVectors.front(),
7693                         InVectors.size() == 2 ? InVectors.back() : nullptr,
7694                         CommonMask);
7695  }
7696
7697  ~ShuffleCostEstimator() {
7698    assert((IsFinalized || CommonMask.empty()) &&
7699           "Shuffle construction must be finalized.");
7700  }
7701};
7702
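/// Returns the tree entry that defines operand \p Idx of node \p E: either
/// the vectorized entry registered for the operand scalar (including its
/// multi-node copies) or the gather entry that is used by \p E at edge
/// \p Idx.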
7703const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
7704                                                   unsigned Idx) const {
7705  Value *Op = E->getOperand(Idx).front();
7706  if (const TreeEntry *TE = getTreeEntry(Op)) {
    if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
7708          return EI.EdgeIdx == Idx && EI.UserTE == E;
7709        }) != TE->UserTreeIndices.end())
7710      return TE;
7711    auto MIt = MultiNodeScalars.find(Op);
7712    if (MIt != MultiNodeScalars.end()) {
7713      for (const TreeEntry *TE : MIt->second) {
7714        if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
7715              return EI.EdgeIdx == Idx && EI.UserTE == E;
7716            }) != TE->UserTreeIndices.end())
7717          return TE;
7718      }
7719    }
7720  }
7721  const auto *It =
7722      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7723        return TE->State == TreeEntry::NeedToGather &&
7724               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
7725                 return EI.EdgeIdx == Idx && EI.UserTE == E;
7726               }) != TE->UserTreeIndices.end();
7727      });
7728  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
7729  return It->get();
7730}
7731
7732InstructionCost
7733BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
7734                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
7735  ArrayRef<Value *> VL = E->Scalars;
7736
7737  Type *ScalarTy = VL[0]->getType();
7738  if (E->State != TreeEntry::NeedToGather) {
7739    if (auto *SI = dyn_cast<StoreInst>(VL[0]))
7740      ScalarTy = SI->getValueOperand()->getType();
7741    else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
7742      ScalarTy = CI->getOperand(0)->getType();
7743    else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
7744      ScalarTy = IE->getOperand(1)->getType();
7745  }
7746  if (!FixedVectorType::isValidElementType(ScalarTy))
7747    return InstructionCost::getInvalid();
7748  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
7749  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7750
7751  // If we have computed a smaller type for the expression, update VecTy so
7752  // that the costs will be accurate.
7753  auto It = MinBWs.find(E);
7754  if (It != MinBWs.end()) {
7755    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
7756    VecTy = FixedVectorType::get(ScalarTy, VL.size());
7757  }
7758  unsigned EntryVF = E->getVectorFactor();
7759  auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
7760
7761  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
7762  if (E->State == TreeEntry::NeedToGather) {
7763    if (allConstant(VL))
7764      return 0;
7765    if (isa<InsertElementInst>(VL[0]))
7766      return InstructionCost::getInvalid();
7767    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
7768        E, *TTI, VectorizedVals, *this, CheckedExtracts);
7769  }
7770  InstructionCost CommonCost = 0;
7771  SmallVector<int> Mask;
7772  if (!E->ReorderIndices.empty() &&
7773      E->State != TreeEntry::PossibleStridedVectorize) {
7774    SmallVector<int> NewMask;
7775    if (E->getOpcode() == Instruction::Store) {
7776      // For stores the order is actually a mask.
7777      NewMask.resize(E->ReorderIndices.size());
7778      copy(E->ReorderIndices, NewMask.begin());
7779    } else {
7780      inversePermutation(E->ReorderIndices, NewMask);
7781    }
7782    ::addMask(Mask, NewMask);
7783  }
7784  if (NeedToShuffleReuses)
7785    ::addMask(Mask, E->ReuseShuffleIndices);
7786  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
7787    CommonCost =
7788        TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
7789  assert((E->State == TreeEntry::Vectorize ||
7790          E->State == TreeEntry::ScatterVectorize ||
7791          E->State == TreeEntry::PossibleStridedVectorize) &&
7792         "Unhandled state");
7793  assert(E->getOpcode() &&
7794         ((allSameType(VL) && allSameBlock(VL)) ||
7795          (E->getOpcode() == Instruction::GetElementPtr &&
7796           E->getMainOp()->getType()->isPointerTy())) &&
7797         "Invalid VL");
7798  Instruction *VL0 = E->getMainOp();
7799  unsigned ShuffleOrOp =
7800      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
7801  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
7802  const unsigned Sz = UniqueValues.size();
7803  SmallBitVector UsedScalars(Sz, false);
7804  for (unsigned I = 0; I < Sz; ++I) {
7805    if (getTreeEntry(UniqueValues[I]) == E)
7806      continue;
7807    UsedScalars.set(I);
7808  }
7809  auto GetCastContextHint = [&](Value *V) {
7810    if (const TreeEntry *OpTE = getTreeEntry(V)) {
7811      if (OpTE->State == TreeEntry::ScatterVectorize)
7812        return TTI::CastContextHint::GatherScatter;
7813      if (OpTE->State == TreeEntry::Vectorize &&
7814          OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
7815        if (OpTE->ReorderIndices.empty())
7816          return TTI::CastContextHint::Normal;
7817        SmallVector<int> Mask;
7818        inversePermutation(OpTE->ReorderIndices, Mask);
7819        if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
7820          return TTI::CastContextHint::Reversed;
7821      }
7822    } else {
7823      InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
7824      if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
7825        return TTI::CastContextHint::GatherScatter;
7826    }
7827    return TTI::CastContextHint::None;
7828  };
7829  auto GetCostDiff =
7830      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
7831          function_ref<InstructionCost(InstructionCost)> VectorCost) {
7832        // Calculate the cost of this instruction.
7833        InstructionCost ScalarCost = 0;
7834        if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
          // For some kinds of instructions there is no need to calculate the
          // cost for each particular instruction; we can use the cost of a
          // single instruction multiplied by the total number of scalar
          // instructions.
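          // E.g., with Sz = 4 scalars of which one belongs to another
          // vectorized entry, the scalar cost below is 3 * ScalarEltCost(0).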
7838          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
7839        } else {
7840          for (unsigned I = 0; I < Sz; ++I) {
7841            if (UsedScalars.test(I))
7842              continue;
7843            ScalarCost += ScalarEltCost(I);
7844          }
7845        }
7846
7847        InstructionCost VecCost = VectorCost(CommonCost);
7848        // Check if the current node must be resized, if the parent node is not
7849        // resized.
7850        if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
7851          const EdgeInfo &EI = E->UserTreeIndices.front();
7852          if ((EI.UserTE->getOpcode() != Instruction::Select ||
7853               EI.EdgeIdx != 0) &&
7854              It != MinBWs.end()) {
7855            auto UserBWIt = MinBWs.find(EI.UserTE);
7856            Type *UserScalarTy =
7857                EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
7858            if (UserBWIt != MinBWs.end())
7859              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
7860                                              UserBWIt->second.first);
7861            if (ScalarTy != UserScalarTy) {
7862              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
7863              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
7864              unsigned VecOpcode;
7865              auto *SrcVecTy =
7866                  FixedVectorType::get(UserScalarTy, E->getVectorFactor());
7867              if (BWSz > SrcBWSz)
7868                VecOpcode = Instruction::Trunc;
7869              else
7870                VecOpcode =
7871                    It->second.second ? Instruction::SExt : Instruction::ZExt;
7872              TTI::CastContextHint CCH = GetCastContextHint(VL0);
7873              VecCost += TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH,
7874                                               CostKind);
7875              ScalarCost +=
7876                  Sz * TTI->getCastInstrCost(VecOpcode, ScalarTy, UserScalarTy,
7877                                             CCH, CostKind);
7878            }
7879          }
7880        }
7881        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
7882                                 ScalarCost, "Calculated costs for Tree"));
7883        return VecCost - ScalarCost;
7884      };
7885  // Calculate cost difference from vectorizing set of GEPs.
7886  // Negative value means vectorizing is profitable.
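  // Illustration with made-up costs: 4 scalar GEPs of cost 1 each give a
  // scalar cost of 4, while keeping only the base pointer for the wide access
  // gives a vector cost of 1, so the returned difference is 1 - 4 = -3.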
7887  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
7888    InstructionCost ScalarCost = 0;
7889    InstructionCost VecCost = 0;
    // Here we differentiate two cases: (1) when Ptrs represent a regular
    // vectorization tree node (as they are pointer arguments of scattered
    // loads) or (2) when Ptrs are the arguments of loads or stores being
    // vectorized as a plain wide unit-stride load/store, since all the
    // loads/stores are known to be from/to adjacent locations.
7895    assert(E->State == TreeEntry::Vectorize &&
7896           "Entry state expected to be Vectorize here.");
7897    if (isa<LoadInst, StoreInst>(VL0)) {
      // Case 2: estimate the pointer-related costs when vectorizing to a wide
      // load/store.
      // The scalar cost is estimated as a set of pointers with a known
      // relationship between them.
      // For the vector code we will use BasePtr as the argument of the wide
      // load/store, but we also need to account for all the instructions
      // that are going to stay in the vectorized code due to uses outside of
      // these scalar loads/stores.
7906      ScalarCost = TTI->getPointersChainCost(
7907          Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7908          CostKind);
7909
7910      SmallVector<const Value *> PtrsRetainedInVecCode;
7911      for (Value *V : Ptrs) {
7912        if (V == BasePtr) {
7913          PtrsRetainedInVecCode.push_back(V);
7914          continue;
7915        }
7916        auto *Ptr = dyn_cast<GetElementPtrInst>(V);
        // For simplicity, assume Ptr stays in the vectorized code if it is
        // not a GEP instruction. We don't care, since its cost is considered
        // free.
7919        // TODO: We should check for any uses outside of vectorizable tree
7920        // rather than just single use.
7921        if (!Ptr || !Ptr->hasOneUse())
7922          PtrsRetainedInVecCode.push_back(V);
7923      }
7924
7925      if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7926        // If all pointers stay in vectorized code then we don't have
7927        // any savings on that.
7928        LLVM_DEBUG(dumpTreeCosts(E, 0, ScalarCost, ScalarCost,
7929                                 "Calculated GEPs cost for Tree"));
7930        return InstructionCost{TTI::TCC_Free};
7931      }
7932      VecCost = TTI->getPointersChainCost(
7933          PtrsRetainedInVecCode, BasePtr,
7934          TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
7935    } else {
      // Case 1: Ptrs are the arguments of loads that we are going to transform
      // into a masked gather load intrinsic.
      // All the scalar GEPs will be removed as a result of vectorization.
      // For any external uses of some lanes, extractelement instructions will
      // be generated (whose cost is estimated separately).
7941      TTI::PointersChainInfo PtrsInfo =
7942          all_of(Ptrs,
7943                 [](const Value *V) {
7944                   auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7945                   return Ptr && !Ptr->hasAllConstantIndices();
7946                 })
7947              ? TTI::PointersChainInfo::getUnknownStride()
7948              : TTI::PointersChainInfo::getKnownStride();
7949
7950      ScalarCost = TTI->getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
7951                                             CostKind);
7952      if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
7953        SmallVector<const Value *> Indices(BaseGEP->indices());
7954        VecCost = TTI->getGEPCost(BaseGEP->getSourceElementType(),
7955                                  BaseGEP->getPointerOperand(), Indices, VecTy,
7956                                  CostKind);
7957      }
7958    }
7959
7960    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
7961                             "Calculated GEPs cost for Tree"));
7962
7963    return VecCost - ScalarCost;
7964  };
7965
7966  switch (ShuffleOrOp) {
7967  case Instruction::PHI: {
7968    // Count reused scalars.
7969    InstructionCost ScalarCost = 0;
7970    SmallPtrSet<const TreeEntry *, 4> CountedOps;
7971    for (Value *V : UniqueValues) {
7972      auto *PHI = dyn_cast<PHINode>(V);
7973      if (!PHI)
7974        continue;
7975
7976      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
7977      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
7978        Value *Op = PHI->getIncomingValue(I);
7979        Operands[I] = Op;
7980      }
7981      if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
7982        if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
7983          if (!OpTE->ReuseShuffleIndices.empty())
7984            ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
7985                                            OpTE->Scalars.size());
7986    }
7987
7988    return CommonCost - ScalarCost;
7989  }
7990  case Instruction::ExtractValue:
7991  case Instruction::ExtractElement: {
7992    auto GetScalarCost = [&](unsigned Idx) {
7993      auto *I = cast<Instruction>(UniqueValues[Idx]);
7994      VectorType *SrcVecTy;
7995      if (ShuffleOrOp == Instruction::ExtractElement) {
7996        auto *EE = cast<ExtractElementInst>(I);
7997        SrcVecTy = EE->getVectorOperandType();
7998      } else {
7999        auto *EV = cast<ExtractValueInst>(I);
8000        Type *AggregateTy = EV->getAggregateOperand()->getType();
8001        unsigned NumElts;
8002        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
8003          NumElts = ATy->getNumElements();
8004        else
8005          NumElts = AggregateTy->getStructNumElements();
8006        SrcVecTy = FixedVectorType::get(ScalarTy, NumElts);
8007      }
8008      if (I->hasOneUse()) {
8009        Instruction *Ext = I->user_back();
8010        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
8011            all_of(Ext->users(),
8012                   [](User *U) { return isa<GetElementPtrInst>(U); })) {
8013          // Use getExtractWithExtendCost() to calculate the cost of
8014          // extractelement/ext pair.
8015          InstructionCost Cost = TTI->getExtractWithExtendCost(
8016              Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
8017          // Subtract the cost of s|zext which is subtracted separately.
8018          Cost -= TTI->getCastInstrCost(
8019              Ext->getOpcode(), Ext->getType(), I->getType(),
8020              TTI::getCastContextHint(Ext), CostKind, Ext);
8021          return Cost;
8022        }
8023      }
8024      return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
8025                                     CostKind, *getExtractIndex(I));
8026    };
8027    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
8028    return GetCostDiff(GetScalarCost, GetVectorCost);
8029  }
8030  case Instruction::InsertElement: {
8031    assert(E->ReuseShuffleIndices.empty() &&
8032           "Unique insertelements only are expected.");
8033    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
8034    unsigned const NumElts = SrcVecTy->getNumElements();
8035    unsigned const NumScalars = VL.size();
8036
8037    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
8038
8039    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
8040    unsigned OffsetBeg = *getInsertIndex(VL.front());
8041    unsigned OffsetEnd = OffsetBeg;
8042    InsertMask[OffsetBeg] = 0;
8043    for (auto [I, V] : enumerate(VL.drop_front())) {
8044      unsigned Idx = *getInsertIndex(V);
8045      if (OffsetBeg > Idx)
8046        OffsetBeg = Idx;
8047      else if (OffsetEnd < Idx)
8048        OffsetEnd = Idx;
8049      InsertMask[Idx] = I + 1;
8050    }
8051    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
8052    if (NumOfParts > 0)
8053      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
8054    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
8055                     VecScalarsSz;
8056    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
8057    unsigned InsertVecSz = std::min<unsigned>(
8058        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
8059        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
8060    bool IsWholeSubvector =
8061        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
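    // Illustration with assumed sizes: for NumElts = 16 and NumOfParts = 2,
    // VecScalarsSz is 8; inserts at indices 2..5 give OffsetBeg = 2,
    // OffsetEnd = 5, VecSz = 8, Offset = 0 and InsertVecSz = 4, i.e. a
    // <4 x Ty> subvector inserted at offset 2 of the first 8-wide slice.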
8062    // Check if we can safely insert a subvector. If it is not possible, just
8063    // generate a whole-sized vector and shuffle the source vector and the new
8064    // subvector.
8065    if (OffsetBeg + InsertVecSz > VecSz) {
8066      // Align OffsetBeg to generate correct mask.
8067      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
8068      InsertVecSz = VecSz;
8069    }
8070
8071    APInt DemandedElts = APInt::getZero(NumElts);
8072    // TODO: Add support for Instruction::InsertValue.
8073    SmallVector<int> Mask;
8074    if (!E->ReorderIndices.empty()) {
8075      inversePermutation(E->ReorderIndices, Mask);
8076      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
8077    } else {
8078      Mask.assign(VecSz, PoisonMaskElem);
8079      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
8080    }
8081    bool IsIdentity = true;
8082    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
8083    Mask.swap(PrevMask);
8084    for (unsigned I = 0; I < NumScalars; ++I) {
8085      unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
8086      DemandedElts.setBit(InsertIdx);
8087      IsIdentity &= InsertIdx - OffsetBeg == I;
8088      Mask[InsertIdx - OffsetBeg] = I;
8089    }
8090    assert(Offset < NumElts && "Failed to find vector index offset");
8091
8092    InstructionCost Cost = 0;
8093    Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
8094                                          /*Insert*/ true, /*Extract*/ false,
8095                                          CostKind);
8096
    // First cost - resize to the actual vector size if this is not an
    // identity shuffle or if we need to shift the vector.
    // Do not calculate the cost if the actual size is the register size and
    // we can merge this shuffle with the following SK_Select.
8101    auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
8102    if (!IsIdentity)
8103      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
8104                                  InsertVecTy, Mask);
8105    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
8106      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
8107    }));
    // Second cost - permutation with the subvector if some elements come
    // from the initial vector, or insertion of a subvector.
8110    // TODO: Implement the analysis of the FirstInsert->getOperand(0)
8111    // subvector of ActualVecTy.
8112    SmallBitVector InMask =
8113        isUndefVector(FirstInsert->getOperand(0),
8114                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
8115    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
8116      if (InsertVecSz != VecSz) {
8117        auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
8118        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
8119                                    std::nullopt, CostKind, OffsetBeg - Offset,
8120                                    InsertVecTy);
8121      } else {
8122        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
8123          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
8124        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
8125             I <= End; ++I)
8126          if (Mask[I] != PoisonMaskElem)
8127            Mask[I] = I + VecSz;
8128        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
8129          Mask[I] =
8130              ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
8131        Cost +=
8132            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
8133      }
8134    }
8135    return Cost;
8136  }
8137  case Instruction::ZExt:
8138  case Instruction::SExt:
8139  case Instruction::FPToUI:
8140  case Instruction::FPToSI:
8141  case Instruction::FPExt:
8142  case Instruction::PtrToInt:
8143  case Instruction::IntToPtr:
8144  case Instruction::SIToFP:
8145  case Instruction::UIToFP:
8146  case Instruction::Trunc:
8147  case Instruction::FPTrunc:
8148  case Instruction::BitCast: {
8149    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
8150    Type *SrcScalarTy = VL0->getOperand(0)->getType();
8151    auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
8152    unsigned Opcode = ShuffleOrOp;
8153    unsigned VecOpcode = Opcode;
8154    if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
8155        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
8156      // Check if the values are candidates to demote.
8157      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
8158      if (SrcIt != MinBWs.end()) {
8159        SrcBWSz = SrcIt->second.first;
8160        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
8161        SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
8162      }
8163      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
8164      if (BWSz == SrcBWSz) {
8165        VecOpcode = Instruction::BitCast;
8166      } else if (BWSz < SrcBWSz) {
8167        VecOpcode = Instruction::Trunc;
8168      } else if (It != MinBWs.end()) {
8169        assert(BWSz > SrcBWSz && "Invalid cast!");
8170        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
8171      }
8172    }
8173    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
8174      // Do not count cost here if minimum bitwidth is in effect and it is just
8175      // a bitcast (here it is just a noop).
8176      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
8177        return TTI::TCC_Free;
8178      auto *VI = VL0->getOpcode() == Opcode
8179                     ? cast<Instruction>(UniqueValues[Idx])
8180                     : nullptr;
8181      return TTI->getCastInstrCost(Opcode, VL0->getType(),
8182                                   VL0->getOperand(0)->getType(),
8183                                   TTI::getCastContextHint(VI), CostKind, VI);
8184    };
8185    auto GetVectorCost = [=](InstructionCost CommonCost) {
8186      // Do not count cost here if minimum bitwidth is in effect and it is just
8187      // a bitcast (here it is just a noop).
8188      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
8189        return CommonCost;
8190      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
8191      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
8192      return CommonCost +
8193             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
8194                                   VecOpcode == Opcode ? VI : nullptr);
8195    };
8196    return GetCostDiff(GetScalarCost, GetVectorCost);
8197  }
8198  case Instruction::FCmp:
8199  case Instruction::ICmp:
8200  case Instruction::Select: {
8201    CmpInst::Predicate VecPred, SwappedVecPred;
8202    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
8203    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
8204        match(VL0, MatchCmp))
8205      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
8206    else
8207      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
8208                                     ? CmpInst::BAD_FCMP_PREDICATE
8209                                     : CmpInst::BAD_ICMP_PREDICATE;
8210    auto GetScalarCost = [&](unsigned Idx) {
8211      auto *VI = cast<Instruction>(UniqueValues[Idx]);
8212      CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
8213                                           ? CmpInst::BAD_FCMP_PREDICATE
8214                                           : CmpInst::BAD_ICMP_PREDICATE;
8215      auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
8216      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
8217           !match(VI, MatchCmp)) ||
8218          (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
8219        VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
8220                                       ? CmpInst::BAD_FCMP_PREDICATE
8221                                       : CmpInst::BAD_ICMP_PREDICATE;
8222
8223      return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
8224                                     Builder.getInt1Ty(), CurrentPred, CostKind,
8225                                     VI);
8226    };
8227    auto GetVectorCost = [&](InstructionCost CommonCost) {
8228      auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
8229
8230      InstructionCost VecCost = TTI->getCmpSelInstrCost(
8231          E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
      // Check if it is possible and profitable to use min/max for selects
      // in VL.
8235      auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
8236      if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
8237        IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
8238                                          {VecTy, VecTy});
8239        InstructionCost IntrinsicCost =
8240            TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
8241        // If the selects are the only uses of the compares, they will be
8242        // dead and we can adjust the cost by removing their cost.
8243        if (IntrinsicAndUse.second)
8244          IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
8245                                                   MaskTy, VecPred, CostKind);
8246        VecCost = std::min(VecCost, IntrinsicCost);
8247      }
8248      return VecCost + CommonCost;
8249    };
8250    return GetCostDiff(GetScalarCost, GetVectorCost);
8251  }
8252  case Instruction::FNeg:
8253  case Instruction::Add:
8254  case Instruction::FAdd:
8255  case Instruction::Sub:
8256  case Instruction::FSub:
8257  case Instruction::Mul:
8258  case Instruction::FMul:
8259  case Instruction::UDiv:
8260  case Instruction::SDiv:
8261  case Instruction::FDiv:
8262  case Instruction::URem:
8263  case Instruction::SRem:
8264  case Instruction::FRem:
8265  case Instruction::Shl:
8266  case Instruction::LShr:
8267  case Instruction::AShr:
8268  case Instruction::And:
8269  case Instruction::Or:
8270  case Instruction::Xor: {
8271    auto GetScalarCost = [&](unsigned Idx) {
8272      auto *VI = cast<Instruction>(UniqueValues[Idx]);
8273      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
8274      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
8275      TTI::OperandValueInfo Op2Info =
8276          TTI::getOperandInfo(VI->getOperand(OpIdx));
8277      SmallVector<const Value *> Operands(VI->operand_values());
8278      return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind,
8279                                         Op1Info, Op2Info, Operands, VI);
8280    };
8281    auto GetVectorCost = [=](InstructionCost CommonCost) {
8282      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
8283      TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
8284      TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
8285      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
8286                                         Op2Info) +
8287             CommonCost;
8288    };
8289    return GetCostDiff(GetScalarCost, GetVectorCost);
8290  }
8291  case Instruction::GetElementPtr: {
8292    return CommonCost + GetGEPCostDiff(VL, VL0);
8293  }
8294  case Instruction::Load: {
8295    auto GetScalarCost = [&](unsigned Idx) {
8296      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
8297      return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
8298                                  VI->getPointerAddressSpace(), CostKind,
8299                                  TTI::OperandValueInfo(), VI);
8300    };
8301    auto *LI0 = cast<LoadInst>(VL0);
8302    auto GetVectorCost = [&](InstructionCost CommonCost) {
8303      InstructionCost VecLdCost;
8304      if (E->State == TreeEntry::Vectorize) {
8305        VecLdCost = TTI->getMemoryOpCost(
8306            Instruction::Load, VecTy, LI0->getAlign(),
8307            LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
8308      } else {
8309        assert((E->State == TreeEntry::ScatterVectorize ||
8310                E->State == TreeEntry::PossibleStridedVectorize) &&
8311               "Unknown EntryState");
8312        Align CommonAlignment = LI0->getAlign();
8313        for (Value *V : UniqueValues)
8314          CommonAlignment =
8315              std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
8316        VecLdCost = TTI->getGatherScatterOpCost(
8317            Instruction::Load, VecTy, LI0->getPointerOperand(),
8318            /*VariableMask=*/false, CommonAlignment, CostKind);
8319      }
8320      return VecLdCost + CommonCost;
8321    };
8322
8323    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates a masked gather load, then it is not a terminal
    // node; hence the address operand cost is estimated separately.
8326    if (E->State == TreeEntry::ScatterVectorize ||
8327        E->State == TreeEntry::PossibleStridedVectorize)
8328      return Cost;
8329
8330    // Estimate cost of GEPs since this tree node is a terminator.
8331    SmallVector<Value *> PointerOps(VL.size());
8332    for (auto [I, V] : enumerate(VL))
8333      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8334    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
8335  }
8336  case Instruction::Store: {
8337    bool IsReorder = !E->ReorderIndices.empty();
8338    auto GetScalarCost = [=](unsigned Idx) {
8339      auto *VI = cast<StoreInst>(VL[Idx]);
8340      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
8341      return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(),
8342                                  VI->getPointerAddressSpace(), CostKind,
8343                                  OpInfo, VI);
8344    };
8345    auto *BaseSI =
8346        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
8347    auto GetVectorCost = [=](InstructionCost CommonCost) {
8348      // We know that we can merge the stores. Calculate the cost.
8349      TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
8350      return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
8351                                  BaseSI->getPointerAddressSpace(), CostKind,
8352                                  OpInfo) +
8353             CommonCost;
8354    };
8355    SmallVector<Value *> PointerOps(VL.size());
8356    for (auto [I, V] : enumerate(VL)) {
8357      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
8358      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
8359    }
8360
8361    return GetCostDiff(GetScalarCost, GetVectorCost) +
8362           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
8363  }
8364  case Instruction::Call: {
8365    auto GetScalarCost = [&](unsigned Idx) {
8366      auto *CI = cast<CallInst>(UniqueValues[Idx]);
8367      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8368      if (ID != Intrinsic::not_intrinsic) {
8369        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
8370        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
8371      }
8372      return TTI->getCallInstrCost(CI->getCalledFunction(),
8373                                   CI->getFunctionType()->getReturnType(),
8374                                   CI->getFunctionType()->params(), CostKind);
8375    };
8376    auto GetVectorCost = [=](InstructionCost CommonCost) {
8377      auto *CI = cast<CallInst>(VL0);
8378      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
8379      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
8380    };
8381    return GetCostDiff(GetScalarCost, GetVectorCost);
8382  }
8383  case Instruction::ShuffleVector: {
8384    assert(E->isAltShuffle() &&
8385           ((Instruction::isBinaryOp(E->getOpcode()) &&
8386             Instruction::isBinaryOp(E->getAltOpcode())) ||
8387            (Instruction::isCast(E->getOpcode()) &&
8388             Instruction::isCast(E->getAltOpcode())) ||
8389            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
8390           "Invalid Shuffle Vector Operand");
8391    // Try to find the previous shuffle node with the same operands and same
8392    // main/alternate ops.
8393    auto TryFindNodeWithEqualOperands = [=]() {
8394      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8395        if (TE.get() == E)
8396          break;
8397        if (TE->isAltShuffle() &&
8398            ((TE->getOpcode() == E->getOpcode() &&
8399              TE->getAltOpcode() == E->getAltOpcode()) ||
8400             (TE->getOpcode() == E->getAltOpcode() &&
8401              TE->getAltOpcode() == E->getOpcode())) &&
8402            TE->hasEqualOperands(*E))
8403          return true;
8404      }
8405      return false;
8406    };
8407    auto GetScalarCost = [&](unsigned Idx) {
8408      auto *VI = cast<Instruction>(UniqueValues[Idx]);
8409      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
8410      (void)E;
8411      return TTI->getInstructionCost(VI, CostKind);
8412    };
8413    // FIXME: Workaround for syntax error reported by MSVC buildbots.
8414    TargetTransformInfo &TTIRef = *TTI;
8415    // Need to clear CommonCost since the final shuffle cost is included into
8416    // vector cost.
8417    auto GetVectorCost = [&](InstructionCost) {
8418      // VecCost is equal to sum of the cost of creating 2 vectors
8419      // and the cost of creating shuffle.
8420      InstructionCost VecCost = 0;
8421      if (TryFindNodeWithEqualOperands()) {
8422        LLVM_DEBUG({
8423          dbgs() << "SLP: diamond match for alternate node found.\n";
8424          E->dump();
8425        });
8426        // No need to add new vector costs here since we're going to reuse
8427        // same main/alternate vector ops, just do different shuffling.
8428      } else if (Instruction::isBinaryOp(E->getOpcode())) {
8429        VecCost =
8430            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
8431        VecCost +=
8432            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
8433      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
8434        auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
8435        VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
8436                                            CI0->getPredicate(), CostKind, VL0);
8437        VecCost += TTIRef.getCmpSelInstrCost(
8438            E->getOpcode(), VecTy, MaskTy,
8439            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
8440            E->getAltOp());
8441      } else {
8442        Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
8443        Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
8444        auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
8445        auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
8446        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
8447                                          TTI::CastContextHint::None, CostKind);
8448        VecCost +=
8449            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
8450                                    TTI::CastContextHint::None, CostKind);
8451      }
8452      SmallVector<int> Mask;
8453      E->buildAltOpShuffleMask(
8454          [E](Instruction *I) {
8455            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
8456            return I->getOpcode() == E->getAltOpcode();
8457          },
8458          Mask);
8459      VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
8460                                  FinalVecTy, Mask);
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
      // need to take their order into account when looking for the most used
      // order.
8465      unsigned Opcode0 = E->getOpcode();
8466      unsigned Opcode1 = E->getAltOpcode();
8467      // The opcode mask selects between the two opcodes.
8468      SmallBitVector OpcodeMask(E->Scalars.size(), false);
8469      for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
8470        if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
8471          OpcodeMask.set(Lane);
8472      // If this pattern is supported by the target then we consider the
8473      // order.
8474      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8475        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
8476            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
8477        return AltVecCost < VecCost ? AltVecCost : VecCost;
8478      }
8479      // TODO: Check the reverse order too.
8480      return VecCost;
8481    };
8482    return GetCostDiff(GetScalarCost, GetVectorCost);
8483  }
8484  default:
8485    llvm_unreachable("Unknown instruction");
8486  }
8487}
8488
8489bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
8490  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable.\n");
8492
8493  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
8494    SmallVector<int> Mask;
8495    return TE->State == TreeEntry::NeedToGather &&
8496           !any_of(TE->Scalars,
8497                   [this](Value *V) { return EphValues.contains(V); }) &&
8498           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
8499            TE->Scalars.size() < Limit ||
8500            ((TE->getOpcode() == Instruction::ExtractElement ||
8501              all_of(TE->Scalars,
8502                     [](Value *V) {
8503                       return isa<ExtractElementInst, UndefValue>(V);
8504                     })) &&
8505             isFixedVectorShuffle(TE->Scalars, Mask)) ||
8506            (TE->State == TreeEntry::NeedToGather &&
8507             TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
8508  };
8509
8510  // We only handle trees of heights 1 and 2.
8511  if (VectorizableTree.size() == 1 &&
8512      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
8513       (ForReduction &&
8514        AreVectorizableGathers(VectorizableTree[0].get(),
8515                               VectorizableTree[0]->Scalars.size()) &&
8516        VectorizableTree[0]->getVectorFactor() > 2)))
8517    return true;
8518
8519  if (VectorizableTree.size() != 2)
8520    return false;
8521
  // Handle splat and all-constant stores. Also try to vectorize tiny trees
  // with the second gather node if it has fewer scalar operands than the
  // initial tree element (it may be profitable to shuffle the second gather)
  // or if its scalars are extractelements, which form a shuffle.
8526  SmallVector<int> Mask;
8527  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
8528      AreVectorizableGathers(VectorizableTree[1].get(),
8529                             VectorizableTree[0]->Scalars.size()))
8530    return true;
8531
8532  // Gathering cost would be too much for tiny trees.
8533  if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
8534      (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
8535       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
8536       VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize))
8537    return false;
8538
8539  return true;
8540}
8541
8542static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
8543                                       TargetTransformInfo *TTI,
8544                                       bool MustMatchOrInst) {
8545  // Look past the root to find a source value. Arbitrarily follow the
8546  // path through operand 0 of any 'or'. Also, peek through optional
8547  // shift-left-by-multiple-of-8-bits.
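  // E.g., for a byte-assembly pattern such as
  //   %z  = zext i8 %byte to i32
  //   %sh = shl i32 %z, 8
  //   %or = or i32 %sh, %rest
  // the walk below starts at %or and follows operand 0 through the 'or' and
  // the 'shl' down to the zext of the loaded byte (assuming %byte is a load).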
8548  Value *ZextLoad = Root;
8549  const APInt *ShAmtC;
8550  bool FoundOr = false;
8551  while (!isa<ConstantExpr>(ZextLoad) &&
8552         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
8553          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
8554           ShAmtC->urem(8) == 0))) {
8555    auto *BinOp = cast<BinaryOperator>(ZextLoad);
8556    ZextLoad = BinOp->getOperand(0);
8557    if (BinOp->getOpcode() == Instruction::Or)
8558      FoundOr = true;
8559  }
8560  // Check if the input is an extended load of the required or/shift expression.
8561  Value *Load;
8562  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
8563      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
8564    return false;
8565
8566  // Require that the total load bit width is a legal integer type.
8567  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
8568  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
8569  Type *SrcTy = Load->getType();
8570  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
8571  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
8572    return false;
8573
8574  // Everything matched - assume that we can fold the whole sequence using
8575  // load combining.
8576  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
8577             << *(cast<Instruction>(Root)) << "\n");
8578
8579  return true;
8580}
8581
8582bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
8583  if (RdxKind != RecurKind::Or)
8584    return false;
8585
8586  unsigned NumElts = VectorizableTree[0]->Scalars.size();
8587  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
8588  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
8589                                    /* MatchOr */ false);
8590}
8591
8592bool BoUpSLP::isLoadCombineCandidate() const {
8593  // Peek through a final sequence of stores and check if all operations are
8594  // likely to be load-combined.
8595  unsigned NumElts = VectorizableTree[0]->Scalars.size();
8596  for (Value *Scalar : VectorizableTree[0]->Scalars) {
8597    Value *X;
8598    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
8599        !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
8600      return false;
8601  }
8602  return true;
8603}
8604
8605bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
8606  // No need to vectorize inserts of gathered values.
8607  if (VectorizableTree.size() == 2 &&
8608      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
8609      VectorizableTree[1]->State == TreeEntry::NeedToGather &&
8610      (VectorizableTree[1]->getVectorFactor() <= 2 ||
8611       !(isSplat(VectorizableTree[1]->Scalars) ||
8612         allConstant(VectorizableTree[1]->Scalars))))
8613    return true;
8614
  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for vectorization and we can skip it, provided the cost
  // threshold is left at its default. The cost of vectorized PHI nodes is
  // almost always 0 plus the cost of the gathers/buildvectors.
8619  constexpr int Limit = 4;
8620  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
8621      !VectorizableTree.empty() &&
8622      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8623        return (TE->State == TreeEntry::NeedToGather &&
8624                TE->getOpcode() != Instruction::ExtractElement &&
8625                count_if(TE->Scalars,
8626                         [](Value *V) { return isa<ExtractElementInst>(V); }) <=
8627                    Limit) ||
8628               TE->getOpcode() == Instruction::PHI;
8629      }))
8630    return true;
8631
8632  // We can vectorize the tree if its size is greater than or equal to the
8633  // minimum size specified by the MinTreeSize command line option.
8634  if (VectorizableTree.size() >= MinTreeSize)
8635    return false;
8636
8637  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
8638  // can vectorize it if we can prove it fully vectorizable.
8639  if (isFullyVectorizableTinyTree(ForReduction))
8640    return false;
8641
  assert((!VectorizableTree.empty() || ExternalUses.empty()) &&
         "We shouldn't have any external users");
8645
8646  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
8647  // vectorizable.
8648  return true;
8649}
8650
8651InstructionCost BoUpSLP::getSpillCost() const {
8652  // Walk from the bottom of the tree to the top, tracking which values are
8653  // live. When we see a call instruction that is not part of our tree,
8654  // query TTI to see if there is a cost to keeping values live over it
8655  // (for example, if spills and fills are required).
8656  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
8657  InstructionCost Cost = 0;
8658
8659  SmallPtrSet<Instruction *, 4> LiveValues;
8660  Instruction *PrevInst = nullptr;
8661
8662  // The entries in VectorizableTree are not necessarily ordered by their
8663  // position in basic blocks. Collect them and order them by dominance so later
8664  // instructions are guaranteed to be visited first. For instructions in
8665  // different basic blocks, we only scan to the beginning of the block, so
8666  // their order does not matter, as long as all instructions in a basic block
8667  // are grouped together. Using dominance ensures a deterministic order.
8668  SmallVector<Instruction *, 16> OrderedScalars;
8669  for (const auto &TEPtr : VectorizableTree) {
8670    if (TEPtr->State != TreeEntry::Vectorize)
8671      continue;
8672    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
8673    if (!Inst)
8674      continue;
8675    OrderedScalars.push_back(Inst);
8676  }
8677  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
8678    auto *NodeA = DT->getNode(A->getParent());
8679    auto *NodeB = DT->getNode(B->getParent());
8680    assert(NodeA && "Should only process reachable instructions");
8681    assert(NodeB && "Should only process reachable instructions");
8682    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8683           "Different nodes should have different DFS numbers");
8684    if (NodeA != NodeB)
8685      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
8686    return B->comesBefore(A);
8687  });
8688
8689  for (Instruction *Inst : OrderedScalars) {
8690    if (!PrevInst) {
8691      PrevInst = Inst;
8692      continue;
8693    }
8694
8695    // Update LiveValues.
8696    LiveValues.erase(PrevInst);
8697    for (auto &J : PrevInst->operands()) {
8698      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
8699        LiveValues.insert(cast<Instruction>(&*J));
8700    }
8701
8702    LLVM_DEBUG({
8703      dbgs() << "SLP: #LV: " << LiveValues.size();
8704      for (auto *X : LiveValues)
8705        dbgs() << " " << X->getName();
8706      dbgs() << ", Looking at ";
8707      Inst->dump();
8708    });
8709
8710    // Now find the sequence of instructions between PrevInst and Inst.
8711    unsigned NumCalls = 0;
8712    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
8713                                 PrevInstIt =
8714                                     PrevInst->getIterator().getReverse();
8715    while (InstIt != PrevInstIt) {
8716      if (PrevInstIt == PrevInst->getParent()->rend()) {
8717        PrevInstIt = Inst->getParent()->rbegin();
8718        continue;
8719      }
8720
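      // Returns true for intrinsics that are either assume-like or expected
      // to be lowered more cheaply than an equivalent call; these are not
      // counted as calls for the spill cost below.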
8721      auto NoCallIntrinsic = [this](Instruction *I) {
8722        if (auto *II = dyn_cast<IntrinsicInst>(I)) {
8723          if (II->isAssumeLikeIntrinsic())
8724            return true;
8725          FastMathFlags FMF;
8726          SmallVector<Type *, 4> Tys;
8727          for (auto &ArgOp : II->args())
8728            Tys.push_back(ArgOp->getType());
8729          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
8730            FMF = FPMO->getFastMathFlags();
8731          IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
8732                                      FMF);
8733          InstructionCost IntrCost =
8734              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
8735          InstructionCost CallCost = TTI->getCallInstrCost(
8736              nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
8737          if (IntrCost < CallCost)
8738            return true;
8739        }
8740        return false;
8741      };
8742
8743      // Debug information does not impact spill cost.
8744      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
8745          &*PrevInstIt != PrevInst)
8746        NumCalls++;
8747
8748      ++PrevInstIt;
8749    }
8750
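    // Each call in the region may force the currently live vector values to
    // be spilled and reloaded around it, so charge the target's cost of
    // keeping these values live over a call once per call.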
8751    if (NumCalls) {
8752      SmallVector<Type *, 4> V;
8753      for (auto *II : LiveValues) {
8754        auto *ScalarTy = II->getType();
8755        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
8756          ScalarTy = VectorTy->getElementType();
8757        V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
8758      }
8759      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
8760    }
8761
8762    PrevInst = Inst;
8763  }
8764
8765  return Cost;
8766}
8767
/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence.
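/// Both insertelement chains are walked up through their vector operands
/// simultaneously; whichever chain reaches the other instruction first
/// determines the order.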
8770static bool isFirstInsertElement(const InsertElementInst *IE1,
8771                                 const InsertElementInst *IE2) {
8772  if (IE1 == IE2)
8773    return false;
8774  const auto *I1 = IE1;
8775  const auto *I2 = IE2;
8776  const InsertElementInst *PrevI1;
8777  const InsertElementInst *PrevI2;
8778  unsigned Idx1 = *getInsertIndex(IE1);
8779  unsigned Idx2 = *getInsertIndex(IE2);
8780  do {
8781    if (I2 == IE1)
8782      return true;
8783    if (I1 == IE2)
8784      return false;
8785    PrevI1 = I1;
8786    PrevI2 = I2;
8787    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
8788        getInsertIndex(I1).value_or(Idx2) != Idx2)
8789      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
8790    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
8791        getInsertIndex(I2).value_or(Idx1) != Idx1)
8792      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
8793  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
8794  llvm_unreachable("Two different buildvectors not expected.");
8795}
8796
8797namespace {
/// Returns the incoming Value * if the requested type is Value * too, or a
/// default value otherwise.
8800struct ValueSelect {
8801  template <typename U>
8802  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
8803    return V;
8804  }
8805  template <typename U>
8806  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
8807    return U();
8808  }
8809};
8810} // namespace
8811
/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps.
/// 1. If the Base vector is not an undef vector, resize the very first mask to
/// have a common VF and perform the action for 2 input vectors (including the
/// non-undef Base). Other shuffle masks are combined with the result of the
/// first stage and processed as a shuffle of 2 elements.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
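/// \param GetVF returns the vector factor of the given input.
/// \param ResizeAction resizes the given input to the vector factor of its
/// mask, if needed, and returns the (possibly updated) input together with a
/// flag that is true if the mask has already been applied, i.e. the remaining
/// mask for this input can be treated as an identity mask.
/// \param Action performs the requested shuffle of one or two inputs with the
/// given mask and returns the combined result.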
8824template <typename T>
8825static T *performExtractsShuffleAction(
8826    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
8827    function_ref<unsigned(T *)> GetVF,
8828    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
8829    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
8830  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
8831  SmallVector<int> Mask(ShuffleMask.begin()->second);
8832  auto VMIt = std::next(ShuffleMask.begin());
8833  T *Prev = nullptr;
8834  SmallBitVector UseMask =
8835      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
8836  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
8837  if (!IsBaseUndef.all()) {
8838    // Base is not undef, need to combine it with the next subvectors.
8839    std::pair<T *, bool> Res =
8840        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
8841    SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
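    // E.g., for VF == 4, a base vector that defines lanes 0 and 2 and a first
    // mask <poison, 1, poison, 3>, the combined two-source mask becomes
    // <0, 5, 2, 7>: lanes taken from the base keep their index, lanes taken
    // from the resized subvector are offset by VF.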
8842    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
8843      if (Mask[Idx] == PoisonMaskElem)
8844        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
8845      else
8846        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
8847    }
8848    auto *V = ValueSelect::get<T *>(Base);
8849    (void)V;
8850    assert((!V || GetVF(V) == Mask.size()) &&
8851           "Expected base vector of VF number of elements.");
8852    Prev = Action(Mask, {nullptr, Res.first});
8853  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for a single vector, if the mask is not the identity mask.
8856    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
8857                                            /*ForSingleMask=*/true);
8858    if (Res.second)
8859      // Identity mask is found.
8860      Prev = Res.first;
8861    else
8862      Prev = Action(Mask, {ShuffleMask.begin()->first});
8863  } else {
    // Base is undef and at least 2 input vectors are shuffled - perform
    // 2-vector shuffles step by step, combining the shuffles between the
    // steps.
8866    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
8867    unsigned Vec2VF = GetVF(VMIt->first);
8868    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size;
      // we can shuffle them directly.
8871      ArrayRef<int> SecMask = VMIt->second;
8872      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
8873        if (SecMask[I] != PoisonMaskElem) {
8874          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
8875          Mask[I] = SecMask[I] + Vec1VF;
8876        }
8877      }
8878      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
8879    } else {
8880      // Vectors of different sizes - resize and reshuffle.
8881      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
8882                                               /*ForSingleMask=*/false);
8883      std::pair<T *, bool> Res2 =
8884          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
8885      ArrayRef<int> SecMask = VMIt->second;
8886      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
8887        if (Mask[I] != PoisonMaskElem) {
8888          assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
8889          if (Res1.second)
8890            Mask[I] = I;
8891        } else if (SecMask[I] != PoisonMaskElem) {
8892          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
8893          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
8894        }
8895      }
8896      Prev = Action(Mask, {Res1.first, Res2.first});
8897    }
8898    VMIt = std::next(VMIt);
8899  }
8900  bool IsBaseNotUndef = !IsBaseUndef.all();
8901  (void)IsBaseNotUndef;
8902  // Perform requested actions for the remaining masks/vectors.
8903  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
8904    // Shuffle other input vectors, if any.
8905    std::pair<T *, bool> Res =
8906        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
8907    ArrayRef<int> SecMask = VMIt->second;
8908    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
8909      if (SecMask[I] != PoisonMaskElem) {
8910        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
8911               "Multiple uses of scalars.");
8912        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
8913      } else if (Mask[I] != PoisonMaskElem) {
8914        Mask[I] = I;
8915      }
8916    }
8917    Prev = Action(Mask, {Prev, Res.first});
8918  }
8919  return Prev;
8920}
8921
8922InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
8923  InstructionCost Cost = 0;
8924  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
8925                    << VectorizableTree.size() << ".\n");
8926
8927  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
8928
8929  SmallPtrSet<Value *, 4> CheckedExtracts;
8930  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
8931    TreeEntry &TE = *VectorizableTree[I];
8932    if (TE.State == TreeEntry::NeedToGather) {
8933      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
8934          E && E->getVectorFactor() == TE.getVectorFactor() &&
8935          E->isSame(TE.Scalars)) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering - treat such bundles as free.
8938        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
8939                          << shortBundleName(TE.Scalars) << ".\n"
8940                          << "SLP: Current total cost = " << Cost << "\n");
8941        continue;
8942      }
8943    }
8944
8945    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
8946    Cost += C;
8947    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
8948                      << shortBundleName(TE.Scalars) << ".\n"
8949                      << "SLP: Current total cost = " << Cost << "\n");
8950  }
8951
8952  SmallPtrSet<Value *, 16> ExtractCostCalculated;
8953  InstructionCost ExtractCost = 0;
8954  SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
8955  SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
8956  SmallVector<APInt> DemandedElts;
8957  SmallDenseSet<Value *, 4> UsedInserts;
8958  DenseSet<Value *> VectorCasts;
8959  for (ExternalUser &EU : ExternalUses) {
8960    // We only add extract cost once for the same scalar.
8961    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
8962        !ExtractCostCalculated.insert(EU.Scalar).second)
8963      continue;
8964
8965    // Uses by ephemeral values are free (because the ephemeral value will be
8966    // removed prior to code generation, and so the extraction will be
8967    // removed as well).
8968    if (EphValues.count(EU.User))
8969      continue;
8970
8971    // No extract cost for vector "scalar"
8972    if (isa<FixedVectorType>(EU.Scalar->getType()))
8973      continue;
8974
8975    // If found user is an insertelement, do not calculate extract cost but try
8976    // to detect it as a final shuffled/identity match.
8977    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
8978      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
8979        if (!UsedInserts.insert(VU).second)
8980          continue;
8981        std::optional<unsigned> InsertIdx = getInsertIndex(VU);
8982        if (InsertIdx) {
8983          const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
8984          auto *It = find_if(
8985              FirstUsers,
8986              [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
8987                return areTwoInsertFromSameBuildVector(
8988                    VU, cast<InsertElementInst>(Pair.first),
8989                    [this](InsertElementInst *II) -> Value * {
8990                      Value *Op0 = II->getOperand(0);
8991                      if (getTreeEntry(II) && !getTreeEntry(Op0))
8992                        return nullptr;
8993                      return Op0;
8994                    });
8995              });
8996          int VecId = -1;
8997          if (It == FirstUsers.end()) {
8998            (void)ShuffleMasks.emplace_back();
8999            SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
9000            if (Mask.empty())
9001              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            // Find the insertelement chain that is vectorized in the tree, if
            // any.
9003            Value *Base = VU;
9004            while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
9005              if (IEBase != EU.User &&
9006                  (!IEBase->hasOneUse() ||
9007                   getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
9008                break;
9009              // Build the mask for the vectorized insertelement instructions.
9010              if (const TreeEntry *E = getTreeEntry(IEBase)) {
9011                VU = IEBase;
9012                do {
9013                  IEBase = cast<InsertElementInst>(Base);
9014                  int Idx = *getInsertIndex(IEBase);
9015                  assert(Mask[Idx] == PoisonMaskElem &&
9016                         "InsertElementInstruction used already.");
9017                  Mask[Idx] = Idx;
9018                  Base = IEBase->getOperand(0);
9019                } while (E == getTreeEntry(Base));
9020                break;
9021              }
9022              Base = cast<InsertElementInst>(Base)->getOperand(0);
9023            }
9024            FirstUsers.emplace_back(VU, ScalarTE);
9025            DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
9026            VecId = FirstUsers.size() - 1;
9027            auto It = MinBWs.find(ScalarTE);
9028            if (It != MinBWs.end() && VectorCasts.insert(EU.Scalar).second) {
              unsigned BWSz = It->second.first;
9030              unsigned SrcBWSz = DL->getTypeSizeInBits(FTy->getElementType());
9031              unsigned VecOpcode;
9032              if (BWSz < SrcBWSz)
9033                VecOpcode = Instruction::Trunc;
9034              else
9035                VecOpcode =
9036                    It->second.second ? Instruction::SExt : Instruction::ZExt;
9037              TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9038              InstructionCost C = TTI->getCastInstrCost(
9039                  VecOpcode, FTy,
9040                  FixedVectorType::get(
9041                      IntegerType::get(FTy->getContext(), It->second.first),
9042                      FTy->getNumElements()),
9043                  TTI::CastContextHint::None, CostKind);
9044              LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
9045                                << " for extending externally used vector with "
9046                                   "non-equal minimum bitwidth.\n");
9047              Cost += C;
9048            }
9049          } else {
9050            if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
9051              It->first = VU;
9052            VecId = std::distance(FirstUsers.begin(), It);
9053          }
9054          int InIdx = *InsertIdx;
9055          SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
9056          if (Mask.empty())
9057            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
9058          Mask[InIdx] = EU.Lane;
9059          DemandedElts[VecId].setBit(InIdx);
9060          continue;
9061        }
9062      }
9063    }
9064
    // If we plan to rewrite the tree in a smaller type, we will need to sign
    // or zero extend the extracted value back to the original type. Here, we
    // account for the extract and the added cost of the extend if needed.
9068    auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
9069    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9070    auto It = MinBWs.find(getTreeEntry(EU.Scalar));
9071    if (It != MinBWs.end()) {
9072      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
9073      unsigned Extend =
9074          It->second.second ? Instruction::SExt : Instruction::ZExt;
9075      VecTy = FixedVectorType::get(MinTy, BundleWidth);
9076      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
9077                                                   VecTy, EU.Lane);
9078    } else {
9079      ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
9080                                             CostKind, EU.Lane);
9081    }
9082  }
9083  // Add reduced value cost, if resized.
9084  if (!VectorizedVals.empty()) {
9085    auto BWIt = MinBWs.find(VectorizableTree.front().get());
9086    if (BWIt != MinBWs.end()) {
9087      Type *DstTy = VectorizableTree.front()->Scalars.front()->getType();
9088      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
9089      unsigned Opcode = Instruction::Trunc;
9090      if (OriginalSz < BWIt->second.first)
9091        Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
9092      Type *SrcTy = IntegerType::get(DstTy->getContext(), BWIt->second.first);
9093      Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
9094                                    TTI::CastContextHint::None,
9095                                    TTI::TCK_RecipThroughput);
9096    }
9097  }
9098
9099  InstructionCost SpillCost = getSpillCost();
9100  Cost += SpillCost + ExtractCost;
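  // Resize the vector produced for a tree entry to the vector factor used by
  // its external insertelement users, accounting for the extra shuffle cost
  // when the vector factors differ and the mask is not a plain identity. The
  // returned flag is true if the shuffle has been accounted for, so the
  // remaining mask for this entry can be treated as an identity mask.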
9101  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
9102                                    bool) {
9103    InstructionCost C = 0;
9104    unsigned VF = Mask.size();
9105    unsigned VecVF = TE->getVectorFactor();
9106    if (VF != VecVF &&
9107        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
9108         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
9109      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
9110      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
9111                OrigMask.begin());
9112      C = TTI->getShuffleCost(
9113          TTI::SK_PermuteSingleSrc,
9114          FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
9115      LLVM_DEBUG(
9116          dbgs() << "SLP: Adding cost " << C
9117                 << " for final shuffle of insertelement external users.\n";
9118          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
9119      Cost += C;
9120      return std::make_pair(TE, true);
9121    }
9122    return std::make_pair(TE, false);
9123  };
9124  // Calculate the cost of the reshuffled vectors, if any.
9125  for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
9126    Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
9127    auto Vector = ShuffleMasks[I].takeVector();
9128    unsigned VF = 0;
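    // Estimate the cost of the final shuffle(s) that combine one or two
    // vectorized tree entries into the vector consumed by the external
    // insertelement users; single-source identity masks are free.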
9129    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
9130                                    ArrayRef<const TreeEntry *> TEs) {
9131      assert((TEs.size() == 1 || TEs.size() == 2) &&
9132             "Expected exactly 1 or 2 tree entries.");
9133      if (TEs.size() == 1) {
9134        if (VF == 0)
9135          VF = TEs.front()->getVectorFactor();
9136        auto *FTy =
9137            FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
9138        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
9139            !all_of(enumerate(Mask), [=](const auto &Data) {
9140              return Data.value() == PoisonMaskElem ||
9141                     (Data.index() < VF &&
9142                      static_cast<int>(Data.index()) == Data.value());
9143            })) {
9144          InstructionCost C =
9145              TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
9146          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
9147                            << " for final shuffle of insertelement "
9148                               "external users.\n";
9149                     TEs.front()->dump();
9150                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
9151          Cost += C;
9152        }
9153      } else {
9154        if (VF == 0) {
9155          if (TEs.front() &&
9156              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
9157            VF = TEs.front()->getVectorFactor();
9158          else
9159            VF = Mask.size();
9160        }
9161        auto *FTy =
9162            FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
9163        InstructionCost C =
9164            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
9165        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
9166                          << " for final shuffle of vector node and external "
9167                             "insertelement users.\n";
9168                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
9169                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
9170        Cost += C;
9171      }
9172      VF = Mask.size();
9173      return TEs.back();
9174    };
9175    (void)performExtractsShuffleAction<const TreeEntry>(
9176        MutableArrayRef(Vector.data(), Vector.size()), Base,
9177        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
9178        EstimateShufflesCost);
9179    InstructionCost InsertCost = TTI->getScalarizationOverhead(
9180        cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
9181        /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
9182    Cost -= InsertCost;
9183  }
9184
9185#ifndef NDEBUG
9186  SmallString<256> Str;
9187  {
9188    raw_svector_ostream OS(Str);
9189    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
9190       << "SLP: Extract Cost = " << ExtractCost << ".\n"
9191       << "SLP: Total Cost = " << Cost << ".\n";
9192  }
9193  LLVM_DEBUG(dbgs() << Str);
9194  if (ViewSLPTree)
9195    ViewGraph(this, "SLP" + F->getName(), false, Str);
9196#endif
9197
9198  return Cost;
9199}
9200
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gathers such instructions into a group, which can
/// likely be modeled as a shuffle of 1 or 2 input vectors. If this attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
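/// For example, a gathered bundle of
///   %e0 = extractelement <4 x float> %a, i32 0
///   %e1 = extractelement <4 x float> %b, i32 1
/// might be recognized as a single two-source shuffle of %a and %b, with the
/// matched scalars in \p VL replaced by poison.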
9206std::optional<TTI::ShuffleKind>
9207BoUpSLP::tryToGatherSingleRegisterExtractElements(
9208    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
9209  // Scan list of gathered scalars for extractelements that can be represented
9210  // as shuffles.
9211  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
9212  SmallVector<int> UndefVectorExtracts;
9213  for (int I = 0, E = VL.size(); I < E; ++I) {
9214    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
9215    if (!EI) {
9216      if (isa<UndefValue>(VL[I]))
9217        UndefVectorExtracts.push_back(I);
9218      continue;
9219    }
9220    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
9221    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
9222      continue;
9223    std::optional<unsigned> Idx = getExtractIndex(EI);
9224    // Undefined index.
9225    if (!Idx) {
9226      UndefVectorExtracts.push_back(I);
9227      continue;
9228    }
9229    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
9230    ExtractMask.reset(*Idx);
9231    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
9232      UndefVectorExtracts.push_back(I);
9233      continue;
9234    }
9235    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
9236  }
9237  // Sort the vector operands by the maximum number of uses in extractelements.
9238  MapVector<unsigned, SmallVector<Value *>> VFToVector;
9239  for (const auto &Data : VectorOpToIdx)
9240    VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
9241        .push_back(Data.first);
9242  for (auto &Data : VFToVector) {
9243    stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
9244      return VectorOpToIdx.find(V1)->second.size() >
9245             VectorOpToIdx.find(V2)->second.size();
9246    });
9247  }
9248  // Find the best pair of the vectors with the same number of elements or a
9249  // single vector.
9250  const int UndefSz = UndefVectorExtracts.size();
9251  unsigned SingleMax = 0;
9252  Value *SingleVec = nullptr;
9253  unsigned PairMax = 0;
9254  std::pair<Value *, Value *> PairVec(nullptr, nullptr);
9255  for (auto &Data : VFToVector) {
9256    Value *V1 = Data.second.front();
9257    if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
9258      SingleMax = VectorOpToIdx[V1].size() + UndefSz;
9259      SingleVec = V1;
9260    }
9261    Value *V2 = nullptr;
9262    if (Data.second.size() > 1)
9263      V2 = *std::next(Data.second.begin());
9264    if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
9265                            UndefSz) {
9266      PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
9267      PairVec = std::make_pair(V1, V2);
9268    }
9269  }
9270  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
9271    return std::nullopt;
  // Check whether it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
9274  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
9275  SmallVector<Value *> GatheredExtracts(
9276      VL.size(), PoisonValue::get(VL.front()->getType()));
9277  if (SingleMax >= PairMax && SingleMax) {
9278    for (int Idx : VectorOpToIdx[SingleVec])
9279      std::swap(GatheredExtracts[Idx], VL[Idx]);
9280  } else {
9281    for (Value *V : {PairVec.first, PairVec.second})
9282      for (int Idx : VectorOpToIdx[V])
9283        std::swap(GatheredExtracts[Idx], VL[Idx]);
9284  }
9285  // Add extracts from undefs too.
9286  for (int Idx : UndefVectorExtracts)
9287    std::swap(GatheredExtracts[Idx], VL[Idx]);
9288  // Check that gather of extractelements can be represented as just a
9289  // shuffle of a single/two vectors the scalars are extracted from.
9290  std::optional<TTI::ShuffleKind> Res =
9291      isFixedVectorShuffle(GatheredExtracts, Mask);
9292  if (!Res) {
9293    // TODO: try to check other subsets if possible.
9294    // Restore the original VL if attempt was not successful.
9295    copy(SavedVL, VL.begin());
9296    return std::nullopt;
9297  }
9298  // Restore unused scalars from mask, if some of the extractelements were not
9299  // selected for shuffle.
9300  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
9301    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
9302        isa<UndefValue>(GatheredExtracts[I])) {
9303      std::swap(VL[I], GatheredExtracts[I]);
9304      continue;
9305    }
9306    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
9307    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
9308        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
9309        is_contained(UndefVectorExtracts, I))
9310      continue;
9311  }
9312  return Res;
9313}
9314
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gathers such instructions into a group, which can
/// likely be modeled as a shuffle of 1 or 2 input vectors. If this attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
9320SmallVector<std::optional<TTI::ShuffleKind>>
9321BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
9322                                    SmallVectorImpl<int> &Mask,
9323                                    unsigned NumParts) const {
  assert(NumParts > 0 &&
         "NumParts expected to be greater than or equal to 1.");
9325  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
9326  Mask.assign(VL.size(), PoisonMaskElem);
9327  unsigned SliceSize = VL.size() / NumParts;
9328  for (unsigned Part = 0; Part < NumParts; ++Part) {
9329    // Scan list of gathered scalars for extractelements that can be represented
9330    // as shuffles.
9331    MutableArrayRef<Value *> SubVL =
9332        MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
9333    SmallVector<int> SubMask;
9334    std::optional<TTI::ShuffleKind> Res =
9335        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
9336    ShufflesRes[Part] = Res;
9337    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
9338  }
9339  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
9340        return Res.has_value();
9341      }))
9342    ShufflesRes.clear();
9343  return ShufflesRes;
9344}
9345
9346std::optional<TargetTransformInfo::ShuffleKind>
9347BoUpSLP::isGatherShuffledSingleRegisterEntry(
9348    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
9349    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) {
9350  Entries.clear();
9351  // TODO: currently checking only for Scalars in the tree entry, need to count
9352  // reused elements too for better cost estimation.
9353  const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
9354  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
9355  const BasicBlock *TEInsertBlock = nullptr;
9356  // Main node of PHI entries keeps the correct order of operands/incoming
9357  // blocks.
9358  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
9359    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
9360    TEInsertPt = TEInsertBlock->getTerminator();
9361  } else {
9362    TEInsertBlock = TEInsertPt->getParent();
9363  }
9364  auto *NodeUI = DT->getNode(TEInsertBlock);
9365  assert(NodeUI && "Should only process reachable instructions");
9366  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
9367  auto CheckOrdering = [&](const Instruction *InsertPt) {
9368    // Argument InsertPt is an instruction where vector code for some other
9369    // tree entry (one that shares one or more scalars with TE) is going to be
9370    // generated. This lambda returns true if insertion point of vector code
9371    // for the TE dominates that point (otherwise dependency is the other way
9372    // around). The other node is not limited to be of a gather kind. Gather
9373    // nodes are not scheduled and their vector code is inserted before their
9374    // first user. If user is PHI, that is supposed to be at the end of a
9375    // predecessor block. Otherwise it is the last instruction among scalars of
9376    // the user node. So, instead of checking dependency between instructions
9377    // themselves, we check dependency between their insertion points for vector
9378    // code (since each scalar instruction ends up as a lane of a vector
9379    // instruction).
9380    const BasicBlock *InsertBlock = InsertPt->getParent();
9381    auto *NodeEUI = DT->getNode(InsertBlock);
9382    if (!NodeEUI)
9383      return false;
9384    assert((NodeUI == NodeEUI) ==
9385               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
9386           "Different nodes should have different DFS numbers");
9387    // Check the order of the gather nodes users.
9388    if (TEInsertPt->getParent() != InsertBlock &&
9389        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
9390      return false;
9391    if (TEInsertPt->getParent() == InsertBlock &&
9392        TEInsertPt->comesBefore(InsertPt))
9393      return false;
9394    return true;
9395  };
  // Find all tree entries used by the gathered values. If no common entries
  // are found - not a shuffle.
  // Here we build a set of tree nodes for each gathered value and try to find
  // the intersection between these sets. If we have at least one common tree
  // node for each gathered value - we have just a permutation of a single
  // vector. If we have 2 different sets, we're in a situation where we have a
  // permutation of 2 input vectors.
9403  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
9404  DenseMap<Value *, int> UsedValuesEntry;
9405  for (Value *V : VL) {
9406    if (isConstant(V))
9407      continue;
9408    // Build a list of tree entries where V is used.
9409    SmallPtrSet<const TreeEntry *, 4> VToTEs;
9410    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
9411      if (TEPtr == TE)
9412        continue;
9413      assert(any_of(TEPtr->Scalars,
9414                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
9415             "Must contain at least single gathered value.");
9416      assert(TEPtr->UserTreeIndices.size() == 1 &&
9417             "Expected only single user of a gather node.");
9418      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
9419
9420      PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
9421      const Instruction *InsertPt =
9422          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
9423                  : &getLastInstructionInBundle(UseEI.UserTE);
9424      if (TEInsertPt == InsertPt) {
9425        // If 2 gathers are operands of the same entry (regardless of whether
9426        // user is PHI or else), compare operands indices, use the earlier one
9427        // as the base.
9428        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
9429          continue;
9430        // If the user instruction is used for some reason in different
9431        // vectorized nodes - make it depend on index.
9432        if (TEUseEI.UserTE != UseEI.UserTE &&
9433            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
9434          continue;
9435      }
9436
9437      // Check if the user node of the TE comes after user node of TEPtr,
9438      // otherwise TEPtr depends on TE.
9439      if ((TEInsertBlock != InsertPt->getParent() ||
9440           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
9441          !CheckOrdering(InsertPt))
9442        continue;
9443      VToTEs.insert(TEPtr);
9444    }
9445    if (const TreeEntry *VTE = getTreeEntry(V)) {
9446      Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
9447      if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
9448        continue;
9449      auto It = MinBWs.find(VTE);
9450      // If vectorize node is demoted - do not match.
9451      if (It != MinBWs.end() &&
9452          It->second.first != DL->getTypeSizeInBits(V->getType()))
9453        continue;
9454      VToTEs.insert(VTE);
9455    }
9456    if (VToTEs.empty())
9457      continue;
9458    if (UsedTEs.empty()) {
9459      // The first iteration, just insert the list of nodes to vector.
9460      UsedTEs.push_back(VToTEs);
9461      UsedValuesEntry.try_emplace(V, 0);
9462    } else {
      // Need to check if there are any previously used tree nodes which use V.
      // If there are no such nodes, consider that we have one more input
      // vector.
9466      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
9467      unsigned Idx = 0;
9468      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
9469        // Do we have a non-empty intersection of previously listed tree entries
9470        // and tree entries using current V?
9471        set_intersect(VToTEs, Set);
9472        if (!VToTEs.empty()) {
9473          // Yes, write the new subset and continue analysis for the next
9474          // scalar.
9475          Set.swap(VToTEs);
9476          break;
9477        }
9478        VToTEs = SavedVToTEs;
9479        ++Idx;
9480      }
9481      // No non-empty intersection found - need to add a second set of possible
9482      // source vectors.
9483      if (Idx == UsedTEs.size()) {
        // If the number of input vectors is greater than 2 - not a
        // permutation, fall back to the regular gather.
9486        // TODO: support multiple reshuffled nodes.
9487        if (UsedTEs.size() == 2)
9488          continue;
9489        UsedTEs.push_back(SavedVToTEs);
9490        Idx = UsedTEs.size() - 1;
9491      }
9492      UsedValuesEntry.try_emplace(V, Idx);
9493    }
9494  }
9495
9496  if (UsedTEs.empty()) {
9497    Entries.clear();
9498    return std::nullopt;
9499  }
9500
9501  unsigned VF = 0;
9502  if (UsedTEs.size() == 1) {
9503    // Keep the order to avoid non-determinism.
9504    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
9505                                                UsedTEs.front().end());
9506    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
9507      return TE1->Idx < TE2->Idx;
9508    });
9509    // Try to find the perfect match in another gather node at first.
9510    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
9511      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
9512    });
9513    if (It != FirstEntries.end() &&
9514        ((*It)->getVectorFactor() == VL.size() ||
9515         ((*It)->getVectorFactor() == TE->Scalars.size() &&
9516          TE->ReuseShuffleIndices.size() == VL.size() &&
9517          (*It)->isSame(TE->Scalars)))) {
9518      Entries.push_back(*It);
9519      if ((*It)->getVectorFactor() == VL.size()) {
9520        std::iota(std::next(Mask.begin(), Part * VL.size()),
9521                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
9522      } else {
9523        SmallVector<int> CommonMask = TE->getCommonMask();
9524        copy(CommonMask, Mask.begin());
9525      }
9526      // Clear undef scalars.
9527      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
9528        if (isa<PoisonValue>(VL[I]))
9529          Mask[I] = PoisonMaskElem;
9530      return TargetTransformInfo::SK_PermuteSingleSrc;
9531    }
9532    // No perfect match, just shuffle, so choose the first tree node from the
9533    // tree.
9534    Entries.push_back(FirstEntries.front());
9535  } else {
9536    // Try to find nodes with the same vector factor.
9537    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
9538    // Keep the order of tree nodes to avoid non-determinism.
9539    DenseMap<int, const TreeEntry *> VFToTE;
9540    for (const TreeEntry *TE : UsedTEs.front()) {
9541      unsigned VF = TE->getVectorFactor();
9542      auto It = VFToTE.find(VF);
9543      if (It != VFToTE.end()) {
9544        if (It->second->Idx > TE->Idx)
9545          It->getSecond() = TE;
9546        continue;
9547      }
9548      VFToTE.try_emplace(VF, TE);
9549    }
9550    // Same, keep the order to avoid non-determinism.
9551    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
9552                                                 UsedTEs.back().end());
9553    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
9554      return TE1->Idx < TE2->Idx;
9555    });
9556    for (const TreeEntry *TE : SecondEntries) {
9557      auto It = VFToTE.find(TE->getVectorFactor());
9558      if (It != VFToTE.end()) {
9559        VF = It->first;
9560        Entries.push_back(It->second);
9561        Entries.push_back(TE);
9562        break;
9563      }
9564    }
9565    // No 2 source vectors with the same vector factor - just choose 2 with max
9566    // index.
9567    if (Entries.empty()) {
9568      Entries.push_back(
9569          *std::max_element(UsedTEs.front().begin(), UsedTEs.front().end(),
9570                            [](const TreeEntry *TE1, const TreeEntry *TE2) {
9571                              return TE1->Idx < TE2->Idx;
9572                            }));
9573      Entries.push_back(SecondEntries.front());
9574      VF = std::max(Entries.front()->getVectorFactor(),
9575                    Entries.back()->getVectorFactor());
9576    }
9577  }
9578
9579  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, UndefValue::classof);
  // Checks if the 2 PHIs are compatible, i.e. highly likely to be vectorized
  // together.
9582  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
9583    auto *PHI = cast<PHINode>(V);
9584    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions).
    // The incoming values are compatible if they all are constants, or
    // instructions with the same/alternate opcodes from the same basic block.
9589    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
9590      Value *In = PHI->getIncomingValue(I);
9591      Value *In1 = PHI1->getIncomingValue(I);
9592      if (isConstant(In) && isConstant(In1))
9593        continue;
9594      if (!getSameOpcode({In, In1}, *TLI).getOpcode())
9595        return false;
9596      if (cast<Instruction>(In)->getParent() !=
9597          cast<Instruction>(In1)->getParent())
9598        return false;
9599    }
9600    return true;
9601  };
  // Check if the value can be ignored during analysis for shuffled gathers.
  // We suppose it is better to ignore instructions that do not form splats,
  // are not vectorized and are not extractelements (these instructions will
  // be handled by the extractelements processing), or that may form a vector
  // node in the future.
9606  auto MightBeIgnored = [=](Value *V) {
9607    auto *I = dyn_cast<Instruction>(V);
9608    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
9609           !isVectorLikeInstWithConstOps(I) &&
9610           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
9611  };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V. This is possible if they have the same/alternate
  // opcode and the same parent basic block.
9615  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
9616    Value *V1 = VL[Idx];
9617    bool UsedInSameVTE = false;
9618    auto It = UsedValuesEntry.find(V1);
9619    if (It != UsedValuesEntry.end())
9620      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
9621    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
9622           getSameOpcode({V, V1}, *TLI).getOpcode() &&
9623           cast<Instruction>(V)->getParent() ==
9624               cast<Instruction>(V1)->getParent() &&
9625           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
9626  };
9627  // Build a shuffle mask for better cost estimation and vector emission.
9628  SmallBitVector UsedIdxs(Entries.size());
9629  SmallVector<std::pair<unsigned, int>> EntryLanes;
9630  for (int I = 0, E = VL.size(); I < E; ++I) {
9631    Value *V = VL[I];
9632    auto It = UsedValuesEntry.find(V);
9633    if (It == UsedValuesEntry.end())
9634      continue;
    // Do not try to shuffle scalars if they are constants, or instructions
    // that may be vectorized later as part of the subsequent buildvector
    // vectorization.
9638    if (isConstant(V) || (MightBeIgnored(V) &&
9639                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
9640                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
9641      continue;
9642    unsigned Idx = It->second;
9643    EntryLanes.emplace_back(Idx, I);
9644    UsedIdxs.set(Idx);
9645  }
9646  // Iterate through all shuffled scalars and select entries, which can be used
9647  // for final shuffle.
9648  SmallVector<const TreeEntry *> TempEntries;
9649  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
9650    if (!UsedIdxs.test(I))
9651      continue;
    // Fix the entry number for the given scalar. If it is the first entry, set
    // Pair.first to 0, otherwise to 1 (currently at most 2 nodes are
    // selected). These indices are used as the vector offset when calculating
    // the final shuffle mask.
9656    for (std::pair<unsigned, int> &Pair : EntryLanes)
9657      if (Pair.first == I)
9658        Pair.first = TempEntries.size();
9659    TempEntries.push_back(Entries[I]);
9660  }
9661  Entries.swap(TempEntries);
9662  if (EntryLanes.size() == Entries.size() &&
9663      !VL.equals(ArrayRef(TE->Scalars)
9664                     .slice(Part * VL.size(),
9665                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // We may have here 1 or 2 entries only. If the number of scalars is equal
    // to the number of entries, no need to do the analysis - it is not very
    // profitable. Since VL is not the same as TE->Scalars, we already have
    // some shuffles before this point. Bail out of this unprofitable case.
9670    Entries.clear();
9671    return std::nullopt;
9672  }
9673  // Build the final mask, check for the identity shuffle, if possible.
9674  bool IsIdentity = Entries.size() == 1;
9675  // Pair.first is the offset to the vector, while Pair.second is the index of
9676  // scalar in the list.
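  // E.g., with two selected entries of VF 4, a scalar found at lane 2 of the
  // second entry gets the mask value 1 * 4 + 2 == 6.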
9677  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
9678    unsigned Idx = Part * VL.size() + Pair.second;
9679    Mask[Idx] = Pair.first * VF +
9680                Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
9681    IsIdentity &= Mask[Idx] == Pair.second;
9682  }
9683  switch (Entries.size()) {
9684  case 1:
9685    if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
9686      return TargetTransformInfo::SK_PermuteSingleSrc;
9687    break;
9688  case 2:
9689    if (EntryLanes.size() > 2 || VL.size() <= 2)
9690      return TargetTransformInfo::SK_PermuteTwoSrc;
9691    break;
9692  default:
9693    break;
9694  }
9695  Entries.clear();
9696  // Clear the corresponding mask elements.
9697  std::fill(std::next(Mask.begin(), Part * VL.size()),
9698            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
9699  return std::nullopt;
9700}
9701
9702SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
9703BoUpSLP::isGatherShuffledEntry(
9704    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
9705    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
9706    unsigned NumParts) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers smaller than the number of "
         "scalars.");
9709  Entries.clear();
9710  // No need to check for the topmost gather node.
9711  if (TE == VectorizableTree.front().get())
9712    return {};
9713  Mask.assign(VL.size(), PoisonMaskElem);
9714  assert(TE->UserTreeIndices.size() == 1 &&
9715         "Expected only single user of the gather node.");
9716  assert(VL.size() % NumParts == 0 &&
9717         "Number of scalars must be divisible by NumParts.");
9718  unsigned SliceSize = VL.size() / NumParts;
9719  SmallVector<std::optional<TTI::ShuffleKind>> Res;
9720  for (unsigned Part = 0; Part < NumParts; ++Part) {
9721    ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
9722    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
9723    std::optional<TTI::ShuffleKind> SubRes =
9724        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part);
9725    if (!SubRes)
9726      SubEntries.clear();
9727    Res.push_back(SubRes);
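    // If this part is a permutation of a single vectorized node that covers
    // the whole gather, drop the per-part results and reuse that node for all
    // parts with an identity mask.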
9728    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
9729        SubEntries.front()->getVectorFactor() == VL.size() &&
9730        (SubEntries.front()->isSame(TE->Scalars) ||
9731         SubEntries.front()->isSame(VL))) {
9732      SmallVector<const TreeEntry *> LocalSubEntries;
9733      LocalSubEntries.swap(SubEntries);
9734      Entries.clear();
9735      Res.clear();
9736      std::iota(Mask.begin(), Mask.end(), 0);
9737      // Clear undef scalars.
9738      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
9739        if (isa<PoisonValue>(VL[I]))
9740          Mask[I] = PoisonMaskElem;
9741      Entries.emplace_back(1, LocalSubEntries.front());
9742      Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
9743      return Res;
9744    }
9745  }
9746  if (all_of(Res,
9747             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
9748    Entries.clear();
9749    return {};
9750  }
9751  return Res;
9752}
9753
9754InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
9755                                       bool ForPoisonSrc) const {
9756  // Find the type of the operands in VL.
9757  Type *ScalarTy = VL[0]->getType();
9758  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
9759    ScalarTy = SI->getValueOperand()->getType();
9760  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
9761  bool DuplicateNonConst = false;
9762  // Find the cost of inserting/extracting values from the vector.
9763  // Check if the same elements are inserted several times and count them as
9764  // shuffle candidates.
9765  APInt ShuffledElements = APInt::getZero(VL.size());
9766  DenseSet<Value *> UniqueElements;
9767  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9768  InstructionCost Cost;
9769  auto EstimateInsertCost = [&](unsigned I, Value *V) {
9770    if (!ForPoisonSrc)
9771      Cost +=
9772          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
9773                                  I, Constant::getNullValue(VecTy), V);
9774  };
9775  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
9776    Value *V = VL[I];
9777    // No need to shuffle duplicates for constants.
9778    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
9779      ShuffledElements.setBit(I);
9780      continue;
9781    }
9782    if (!UniqueElements.insert(V).second) {
9783      DuplicateNonConst = true;
9784      ShuffledElements.setBit(I);
9785      continue;
9786    }
9787    EstimateInsertCost(I, V);
9788  }
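  // For a poison/undef source the whole gather is modeled as the
  // scalarization overhead of inserting all non-shuffled elements at once;
  // otherwise the per-element insert costs accumulated above are used.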
9789  if (ForPoisonSrc)
9790    Cost =
9791        TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
9792                                      /*Extract*/ false, CostKind);
9793  if (DuplicateNonConst)
9794    Cost +=
9795        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
9796  return Cost;
9797}
9798
9799// Perform operand reordering on the instructions in VL and return the reordered
9800// operands in Left and Right.
9801void BoUpSLP::reorderInputsAccordingToOpcode(
9802    ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
9803    SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
9804    const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) {
9805  if (VL.empty())
9806    return;
9807  VLOperands Ops(VL, TLI, DL, SE, R);
9808  // Reorder the operands in place.
9809  Ops.reorder();
9810  Left = Ops.getVL(0);
9811  Right = Ops.getVL(1);
9812}
9813
9814Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
9815  auto &Res = EntryToLastInstruction.FindAndConstruct(E);
9816  if (Res.second)
9817    return *Res.second;
9818  // Get the basic block this bundle is in. All instructions in the bundle
9819  // should be in this block (except for extractelement-like instructions with
  // constant indices).
9821  auto *Front = E->getMainOp();
9822  auto *BB = Front->getParent();
9823  assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
9824    if (E->getOpcode() == Instruction::GetElementPtr &&
9825        !isa<GetElementPtrInst>(V))
9826      return true;
9827    auto *I = cast<Instruction>(V);
9828    return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
9829           isVectorLikeInstWithConstOps(I);
9830  }));
9831
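  // Returns the last instruction among the bundle scalars in program order,
  // using dominator tree DFS-in numbers to compare instructions from
  // different blocks (possible only for vector-like instructions with
  // constant operands or for non-GEP scalars in a GEP node).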
9832  auto FindLastInst = [&]() {
9833    Instruction *LastInst = Front;
9834    for (Value *V : E->Scalars) {
9835      auto *I = dyn_cast<Instruction>(V);
9836      if (!I)
9837        continue;
9838      if (LastInst->getParent() == I->getParent()) {
9839        if (LastInst->comesBefore(I))
9840          LastInst = I;
9841        continue;
9842      }
9843      assert(((E->getOpcode() == Instruction::GetElementPtr &&
9844               !isa<GetElementPtrInst>(I)) ||
9845              (isVectorLikeInstWithConstOps(LastInst) &&
9846               isVectorLikeInstWithConstOps(I))) &&
9847             "Expected vector-like or non-GEP in GEP node insts only.");
9848      if (!DT->isReachableFromEntry(LastInst->getParent())) {
9849        LastInst = I;
9850        continue;
9851      }
9852      if (!DT->isReachableFromEntry(I->getParent()))
9853        continue;
9854      auto *NodeA = DT->getNode(LastInst->getParent());
9855      auto *NodeB = DT->getNode(I->getParent());
9856      assert(NodeA && "Should only process reachable instructions");
9857      assert(NodeB && "Should only process reachable instructions");
9858      assert((NodeA == NodeB) ==
9859                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9860             "Different nodes should have different DFS numbers");
9861      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
9862        LastInst = I;
9863    }
9864    BB = LastInst->getParent();
9865    return LastInst;
9866  };
9867
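  // Same as above, but returns the first instruction among the bundle scalars
  // in program order.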
9868  auto FindFirstInst = [&]() {
9869    Instruction *FirstInst = Front;
9870    for (Value *V : E->Scalars) {
9871      auto *I = dyn_cast<Instruction>(V);
9872      if (!I)
9873        continue;
9874      if (FirstInst->getParent() == I->getParent()) {
9875        if (I->comesBefore(FirstInst))
9876          FirstInst = I;
9877        continue;
9878      }
9879      assert(((E->getOpcode() == Instruction::GetElementPtr &&
9880              !isa<GetElementPtrInst>(I)) ||
9881             (isVectorLikeInstWithConstOps(FirstInst) &&
9882              isVectorLikeInstWithConstOps(I))) &&
9883                 "Expected vector-like or non-GEP in GEP node insts only.");
9884      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
9885        FirstInst = I;
9886        continue;
9887      }
9888      if (!DT->isReachableFromEntry(I->getParent()))
9889        continue;
9890      auto *NodeA = DT->getNode(FirstInst->getParent());
9891      auto *NodeB = DT->getNode(I->getParent());
9892      assert(NodeA && "Should only process reachable instructions");
9893      assert(NodeB && "Should only process reachable instructions");
9894      assert((NodeA == NodeB) ==
9895                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9896             "Different nodes should have different DFS numbers");
9897      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
9898        FirstInst = I;
9899    }
9900    return FirstInst;
9901  };
9902
9903  // Set the insert point to the beginning of the basic block if the entry
9904  // should not be scheduled.
9905  if (doesNotNeedToSchedule(E->Scalars) ||
9906      (E->State != TreeEntry::NeedToGather &&
9907       all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
9908    if ((E->getOpcode() == Instruction::GetElementPtr &&
9909         any_of(E->Scalars,
9910                [](Value *V) {
9911                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
9912                })) ||
9913        all_of(E->Scalars, [](Value *V) {
9914          return !isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V);
9915        }))
9916      Res.second = FindLastInst();
9917    else
9918      Res.second = FindFirstInst();
9919    return *Res.second;
9920  }
9921
9922  // Find the last instruction. The common case should be that BB has been
9923  // scheduled, and the last instruction is VL.back(). So we start with
9924  // VL.back() and iterate over schedule data until we reach the end of the
9925  // bundle. The end of the bundle is marked by null ScheduleData.
9926  if (BlocksSchedules.count(BB)) {
9927    Value *V = E->isOneOf(E->Scalars.back());
9928    if (doesNotNeedToBeScheduled(V))
9929      V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
9930    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
9931    if (Bundle && Bundle->isPartOfBundle())
9932      for (; Bundle; Bundle = Bundle->NextInBundle)
9933        if (Bundle->OpValue == Bundle->Inst)
9934          Res.second = Bundle->Inst;
9935  }
9936
  // Res.second can still be null at this point if there's either no entry
9938  // for BB in BlocksSchedules or there's no ScheduleData available for
9939  // VL.back(). This can be the case if buildTree_rec aborts for various
9940  // reasons (e.g., the maximum recursion depth is reached, the maximum region
9941  // size is reached, etc.). ScheduleData is initialized in the scheduling
9942  // "dry-run".
9943  //
9944  // If this happens, we can still find the last instruction by brute force. We
9945  // iterate forwards from Front (inclusive) until we either see all
9946  // instructions in the bundle or reach the end of the block. If Front is the
9947  // last instruction in program order, LastInst will be set to Front, and we
9948  // will visit all the remaining instructions in the block.
9949  //
9950  // One of the reasons we exit early from buildTree_rec is to place an upper
9951  // bound on compile-time. Thus, taking an additional compile-time hit here is
9952  // not ideal. However, this should be exceedingly rare since it requires that
9953  // we both exit early from buildTree_rec and that the bundle be out-of-order
9954  // (causing us to iterate all the way to the end of the block).
9955  if (!Res.second)
9956    Res.second = FindLastInst();
9957  assert(Res.second && "Failed to find last instruction in bundle");
9958  return *Res.second;
9959}
9960
9961void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
9962  auto *Front = E->getMainOp();
9963  Instruction *LastInst = &getLastInstructionInBundle(E);
9964  assert(LastInst && "Failed to find last instruction in bundle");
9965  BasicBlock::iterator LastInstIt = LastInst->getIterator();
9966  // If the instruction is PHI, set the insert point after all the PHIs.
9967  bool IsPHI = isa<PHINode>(LastInst);
9968  if (IsPHI)
9969    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
9970  if (IsPHI || (E->State != TreeEntry::NeedToGather &&
9971                doesNotNeedToSchedule(E->Scalars))) {
9972    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
9973  } else {
9974    // Set the insertion point after the last instruction in the bundle. Set the
9975    // debug location to Front.
9976    Builder.SetInsertPoint(
9977        LastInst->getParent(),
9978        LastInst->getNextNonDebugInstruction()->getIterator());
9979  }
9980  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
9981}
9982
9983Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
9984  // List of instructions/lanes from the current block and/or the blocks which
9985  // are part of the current loop. These instructions will be inserted at the
9986  // end to make it possible to optimize loops and hoist invariant instructions
9987  // out of the loop's body with better chances for success.
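  // For example (an illustrative sketch; %a, %b and the i32 type are
  // hypothetical), gathering VL = {%a, %b, 7, %a} with no Root and nothing to
  // postpone conceptually emits the constant lane first and the remaining
  // scalar lanes afterwards:
  //   %v0 = insertelement <4 x i32> poison, i32 7, i32 2
  //   %v1 = insertelement <4 x i32> %v0, i32 %a, i32 0
  //   %v2 = insertelement <4 x i32> %v1, i32 %b, i32 1
  //   %v3 = insertelement <4 x i32> %v2, i32 %a, i32 3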
9988  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
9989  SmallSet<int, 4> PostponedIndices;
9990  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
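  // CheckPredecessor (below) reports whether InstBB is reached by walking up
  // from InsertBB through a chain of single predecessors, i.e. whether the
  // insertion point lies on a straight-line path below the instruction's block.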
9991  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
9992    SmallPtrSet<BasicBlock *, 4> Visited;
9993    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
9994      InsertBB = InsertBB->getSinglePredecessor();
9995    return InsertBB && InsertBB == InstBB;
9996  };
9997  for (int I = 0, E = VL.size(); I < E; ++I) {
9998    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
9999      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
10000           getTreeEntry(Inst) ||
10001           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
10002          PostponedIndices.insert(I).second)
10003        PostponedInsts.emplace_back(Inst, I);
10004  }
10005
10006  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos) {
10007    Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos));
10008    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
10009    if (!InsElt)
10010      return Vec;
10011    GatherShuffleExtractSeq.insert(InsElt);
10012    CSEBlocks.insert(InsElt->getParent());
10013    // Add to our 'need-to-extract' list.
10014    if (isa<Instruction>(V)) {
10015      if (TreeEntry *Entry = getTreeEntry(V)) {
10016        // Find which lane we need to extract.
10017        unsigned FoundLane = Entry->findLaneForValue(V);
10018        ExternalUses.emplace_back(V, InsElt, FoundLane);
10019      }
10020    }
10021    return Vec;
10022  };
10023  Value *Val0 =
10024      isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
10025  FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
10026  Value *Vec = Root ? Root : PoisonValue::get(VecTy);
10027  SmallVector<int> NonConsts;
10028  // Insert constant values first.
10029  for (int I = 0, E = VL.size(); I < E; ++I) {
10030    if (PostponedIndices.contains(I))
10031      continue;
10032    if (!isConstant(VL[I])) {
10033      NonConsts.push_back(I);
10034      continue;
10035    }
10036    if (Root) {
10037      if (!isa<UndefValue>(VL[I])) {
10038        NonConsts.push_back(I);
10039        continue;
10040      }
10041      if (isa<PoisonValue>(VL[I]))
10042        continue;
10043      if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
10044        if (SV->getMaskValue(I) == PoisonMaskElem)
10045          continue;
10046      }
10047    }
10048    Vec = CreateInsertElement(Vec, VL[I], I);
10049  }
10050  // Insert non-constant values.
10051  for (int I : NonConsts)
10052    Vec = CreateInsertElement(Vec, VL[I], I);
10053  // Append instructions, which are or may be part of the loop, at the end to
10054  // make it possible to hoist non-loop-based instructions.
10055  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
10056    Vec = CreateInsertElement(Vec, Pair.first, Pair.second);
10057
10058  return Vec;
10059}
10060
10061/// Merges shuffle masks and emits the final shuffle instruction, if required.
10062/// It supports shuffling of 2 input vectors. It implements lazy shuffle
10063/// emission: the actual shuffle instruction is generated only if it is really
10064/// required. Otherwise, the shuffle instruction emission is delayed till the
10065/// end of the process, to reduce the number of emitted instructions and further
10066/// analysis/transformations.
10067/// The class will also look through the previously emitted shuffle instructions
10068/// and properly mark indices in the mask as undef.
10069/// For example, given the code
10070/// \code
10071/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
10072/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
10073/// \endcode
10074/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
10075/// will look through %s1 and %s2 and emit
10076/// \code
10077/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
10078/// \endcode
10079/// instead.
10080/// If the 2 operands are of different sizes, the smaller one will be resized
10081/// and the mask recalculated properly.
10082/// For example, given the code
10083/// \code
10084/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
10085/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
10086/// \endcode
10087/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
10088/// will look through %s1 and %s2 and emit
10089/// \code
10090/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
10091/// \endcode
10092/// instead.
10093class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
10094  bool IsFinalized = false;
10095  /// Combined mask for all applied operands and masks. It is built during
10096  /// analysis and actual emission of shuffle vector instructions.
10097  SmallVector<int> CommonMask;
10098  /// List of operands for the shuffle vector instruction. It holds at most 2
10099  /// operands. If a 3rd one is going to be added, the first 2 are combined into
10100  /// a shuffle with the \p CommonMask mask, the first operand is set to be the
10101  /// resulting shuffle and the second operand is set to be the newly added
10102  /// operand. The \p CommonMask is transformed in the proper way after that.
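  /// For example (an illustrative sketch; V1-V3 and M1-M3 are hypothetical
  /// vectors and masks): after add(V1, M1) and add(V2, M2) the operands are
  /// {V1, V2}; a subsequent add(V3, M3) first pre-combines them, so the
  /// operands become {shuffle(V1, V2, CommonMask), V3} and \p CommonMask is
  /// remapped to select from the pre-combined vector.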
10103  SmallVector<Value *, 2> InVectors;
10104  IRBuilderBase &Builder;
10105  BoUpSLP &R;
10106
10107  class ShuffleIRBuilder {
10108    IRBuilderBase &Builder;
10109    /// Holds all of the instructions that we gathered.
10110    SetVector<Instruction *> &GatherShuffleExtractSeq;
10111    /// A list of blocks that we are going to CSE.
10112    DenseSet<BasicBlock *> &CSEBlocks;
10113
10114  public:
10115    ShuffleIRBuilder(IRBuilderBase &Builder,
10116                     SetVector<Instruction *> &GatherShuffleExtractSeq,
10117                     DenseSet<BasicBlock *> &CSEBlocks)
10118        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
10119          CSEBlocks(CSEBlocks) {}
10120    ~ShuffleIRBuilder() = default;
10121    /// Creates shufflevector for the 2 operands with the given mask.
10122    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
10123      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
10124      if (auto *I = dyn_cast<Instruction>(Vec)) {
10125        GatherShuffleExtractSeq.insert(I);
10126        CSEBlocks.insert(I->getParent());
10127      }
10128      return Vec;
10129    }
10130    /// Creates a permutation of the single vector operand with the given mask,
10131    /// if it is not an identity mask.
10132    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
10133      if (Mask.empty())
10134        return V1;
10135      unsigned VF = Mask.size();
10136      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
10137      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
10138        return V1;
10139      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
10140      if (auto *I = dyn_cast<Instruction>(Vec)) {
10141        GatherShuffleExtractSeq.insert(I);
10142        CSEBlocks.insert(I->getParent());
10143      }
10144      return Vec;
10145    }
10146    Value *createIdentity(Value *V) { return V; }
10147    Value *createPoison(Type *Ty, unsigned VF) {
10148      return PoisonValue::get(FixedVectorType::get(Ty, VF));
10149    }
10150    /// Resizes 2 input vectors to match their sizes, if they are not equal
10151    /// yet. The smaller vector is resized to the size of the larger vector.
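    /// For example (illustrative), if V1 is <2 x ty> and V2 is <4 x ty>, V1 is
    /// widened to <4 x ty> with the mask <0, 1, poison, poison>.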
10152    void resizeToMatch(Value *&V1, Value *&V2) {
10153      if (V1->getType() == V2->getType())
10154        return;
10155      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
10156      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
10157      int VF = std::max(V1VF, V2VF);
10158      int MinVF = std::min(V1VF, V2VF);
10159      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
10160      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
10161                0);
10162      Value *&Op = MinVF == V1VF ? V1 : V2;
10163      Op = Builder.CreateShuffleVector(Op, IdentityMask);
10164      if (auto *I = dyn_cast<Instruction>(Op)) {
10165        GatherShuffleExtractSeq.insert(I);
10166        CSEBlocks.insert(I->getParent());
10167      }
10168      if (MinVF == V1VF)
10169        V1 = Op;
10170      else
10171        V2 = Op;
10172    }
10173  };
10174
10175  /// Smart shuffle instruction emission, walks through shuffle trees and
10176  /// tries to find the best matching vector for the actual shuffle
10177  /// instruction.
10178  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
10179    assert(V1 && "Expected at least one vector value.");
10180    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
10181                                    R.CSEBlocks);
10182    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
10183                                                       ShuffleBuilder);
10184  }
10185
10186  /// Transforms the mask \p CommonMask per the given \p Mask so that it stays
10187  /// correct after the shuffle has been emitted.
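  /// For example (illustrative), if \p Mask is <3, poison, 1, 0>, the defined
  /// positions of \p CommonMask become the identity <0, x, 2, 3>, where x is
  /// whatever \p CommonMask already held at position 1.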
10188  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
10189                                        ArrayRef<int> Mask) {
10190    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10191      if (Mask[Idx] != PoisonMaskElem)
10192        CommonMask[Idx] = Idx;
10193  }
10194
10195public:
10196  ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
10197      : Builder(Builder), R(R) {}
10198
10199  /// Adjusts extractelements after reusing them.
10200  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10201                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10202                        unsigned NumParts, bool &UseVecBaseAsInput) {
10203    UseVecBaseAsInput = false;
10204    SmallPtrSet<Value *, 4> UniqueBases;
10205    Value *VecBase = nullptr;
10206    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
10207      int Idx = Mask[I];
10208      if (Idx == PoisonMaskElem)
10209        continue;
10210      auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
10211      VecBase = EI->getVectorOperand();
10212      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
10213        VecBase = TE->VectorizedValue;
10214      assert(VecBase && "Expected vectorized value.");
10215      UniqueBases.insert(VecBase);
10216      // If the only use is vectorized - the extractelement itself can be
10217      // deleted.
10218      if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
10219          any_of(EI->users(), [&](User *U) {
10220            const TreeEntry *UTE = R.getTreeEntry(U);
10221            return !UTE || R.MultiNodeScalars.contains(U) ||
10222                   count_if(R.VectorizableTree,
10223                            [&](const std::unique_ptr<TreeEntry> &TE) {
10224                              return any_of(TE->UserTreeIndices,
10225                                            [&](const EdgeInfo &Edge) {
10226                                              return Edge.UserTE == UTE;
10227                                            }) &&
10228                                     is_contained(TE->Scalars, EI);
10229                            }) != 1;
10230          }))
10231        continue;
10232      R.eraseInstruction(EI);
10233    }
10234    if (NumParts == 1 || UniqueBases.size() == 1)
10235      return VecBase;
10236    UseVecBaseAsInput = true;
10237    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
10238      for (auto [I, Idx] : enumerate(Mask))
10239        if (Idx != PoisonMaskElem)
10240          Idx = I;
10241    };
10242    // Perform a multi-register vector shuffle, joining the parts into a single
10243    // virtual long vector.
10244    // Need to shuffle each part independently and then insert all these parts
10245    // into a long virtual vector register, forming the original vector.
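    // For example (illustrative), with NumParts == 2 and 8 extracted scalars,
    // lanes 0-3 and 4-7 are shuffled independently and the two sub-vectors are
    // then combined into a single 8-wide shuffle.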
10246    Value *Vec = nullptr;
10247    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
10248    unsigned SliceSize = E->Scalars.size() / NumParts;
10249    for (unsigned Part = 0; Part < NumParts; ++Part) {
10250      ArrayRef<Value *> VL =
10251          ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
10252      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
10253      constexpr int MaxBases = 2;
10254      SmallVector<Value *, MaxBases> Bases(MaxBases);
10255#ifndef NDEBUG
10256      int PrevSize = 0;
10257#endif // NDEBUG
10258      for (const auto [I, V] : enumerate(VL)) {
10259        if (SubMask[I] == PoisonMaskElem)
10260          continue;
10261        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
10262        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
10263          VecOp = TE->VectorizedValue;
10264        assert(VecOp && "Expected vectorized value.");
10265        const int Size =
10266            cast<FixedVectorType>(VecOp->getType())->getNumElements();
10267#ifndef NDEBUG
10268        assert((PrevSize == Size || PrevSize == 0) &&
10269               "Expected vectors of the same size.");
10270        PrevSize = Size;
10271#endif // NDEBUG
10272        Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
10273      }
10274      if (!Bases.front())
10275        continue;
10276      Value *SubVec;
10277      if (Bases.back()) {
10278        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
10279        TransformToIdentity(SubMask);
10280      } else {
10281        SubVec = Bases.front();
10282      }
10283      if (!Vec) {
10284        Vec = SubVec;
10285        assert((Part == 0 || all_of(seq<unsigned>(0, Part),
10286                                    [&](unsigned P) {
10287                                      ArrayRef<int> SubMask =
10288                                          Mask.slice(P * SliceSize, SliceSize);
10289                                      return all_of(SubMask, [](int Idx) {
10290                                        return Idx == PoisonMaskElem;
10291                                      });
10292                                    })) &&
10293               "Expected first part or all previous parts masked.");
10294        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
10295      } else {
10296        unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
10297        if (Vec->getType() != SubVec->getType()) {
10298          unsigned SubVecVF =
10299              cast<FixedVectorType>(SubVec->getType())->getNumElements();
10300          VF = std::max(VF, SubVecVF);
10301        }
10302        // Adjust SubMask.
10303        for (auto [I, Idx] : enumerate(SubMask))
10304          if (Idx != PoisonMaskElem)
10305            Idx += VF;
10306        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
10307        Vec = createShuffle(Vec, SubVec, VecMask);
10308        TransformToIdentity(VecMask);
10309      }
10310    }
10311    copy(VecMask, Mask.begin());
10312    return Vec;
10313  }
10314  /// Checks if the specified entry \p E needs to be delayed because of its
10315  /// dependency nodes.
10316  std::optional<Value *>
10317  needToDelay(const TreeEntry *E,
10318              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
10319    // No need to delay emission if all deps are ready.
10320    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
10321          return all_of(
10322              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
10323        }))
10324      return std::nullopt;
10325    // Postpone gather emission, will be emitted after the end of the
10326    // process to keep correct order.
10327    auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
10328                                       E->getVectorFactor());
10329    return Builder.CreateAlignedLoad(
10330        VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
10331        MaybeAlign());
10332  }
10333  /// Adds 2 input vectors (in form of tree entries) and the mask for their
10334  /// shuffling.
10335  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
10336    add(E1.VectorizedValue, E2.VectorizedValue, Mask);
10337  }
10338  /// Adds single input vector (in form of tree entry) and the mask for its
10339  /// shuffling.
10340  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
10341    add(E1.VectorizedValue, Mask);
10342  }
10343  /// Adds 2 input vectors and the mask for their shuffling.
10344  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10345    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
10346    if (InVectors.empty()) {
10347      InVectors.push_back(V1);
10348      InVectors.push_back(V2);
10349      CommonMask.assign(Mask.begin(), Mask.end());
10350      return;
10351    }
10352    Value *Vec = InVectors.front();
10353    if (InVectors.size() == 2) {
10354      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
10355      transformMaskAfterShuffle(CommonMask, CommonMask);
10356    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
10357               Mask.size()) {
10358      Vec = createShuffle(Vec, nullptr, CommonMask);
10359      transformMaskAfterShuffle(CommonMask, CommonMask);
10360    }
10361    V1 = createShuffle(V1, V2, Mask);
10362    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10363      if (Mask[Idx] != PoisonMaskElem)
10364        CommonMask[Idx] = Idx + Sz;
10365    InVectors.front() = Vec;
10366    if (InVectors.size() == 2)
10367      InVectors.back() = V1;
10368    else
10369      InVectors.push_back(V1);
10370  }
10371  /// Adds another input vector and the mask for the shuffling.
10372  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
10373    if (InVectors.empty()) {
10374      if (!isa<FixedVectorType>(V1->getType())) {
10375        V1 = createShuffle(V1, nullptr, CommonMask);
10376        CommonMask.assign(Mask.size(), PoisonMaskElem);
10377        transformMaskAfterShuffle(CommonMask, Mask);
10378      }
10379      InVectors.push_back(V1);
10380      CommonMask.assign(Mask.begin(), Mask.end());
10381      return;
10382    }
10383    const auto *It = find(InVectors, V1);
10384    if (It == InVectors.end()) {
10385      if (InVectors.size() == 2 ||
10386          InVectors.front()->getType() != V1->getType() ||
10387          !isa<FixedVectorType>(V1->getType())) {
10388        Value *V = InVectors.front();
10389        if (InVectors.size() == 2) {
10390          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10391          transformMaskAfterShuffle(CommonMask, CommonMask);
10392        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
10393                   CommonMask.size()) {
10394          V = createShuffle(InVectors.front(), nullptr, CommonMask);
10395          transformMaskAfterShuffle(CommonMask, CommonMask);
10396        }
10397        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10398          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
10399            CommonMask[Idx] =
10400                V->getType() != V1->getType()
10401                    ? Idx + Sz
10402                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
10403                                      ->getNumElements();
10404        if (V->getType() != V1->getType())
10405          V1 = createShuffle(V1, nullptr, Mask);
10406        InVectors.front() = V;
10407        if (InVectors.size() == 2)
10408          InVectors.back() = V1;
10409        else
10410          InVectors.push_back(V1);
10411        return;
10412      }
10413      // Check if the second vector is required: it is added only if it provides
10414      // elements that are not already taken from the first one.
10415      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10416        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
10417          InVectors.push_back(V1);
10418          break;
10419        }
10420    }
10421    int VF = CommonMask.size();
10422    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
10423      VF = FTy->getNumElements();
10424    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10425      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10426        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
10427  }
10428  /// Adds another input vector and the order for its shuffling.
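  /// For example (illustrative), Order = {2, 0, 1} is inverted into the mask
  /// {1, 2, 0} before the vector is added.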
10429  void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
10430    SmallVector<int> NewMask;
10431    inversePermutation(Order, NewMask);
10432    add(V1, NewMask);
10433  }
10434  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10435                Value *Root = nullptr) {
10436    return R.gather(VL, Root);
10437  }
10438  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
10439  /// Finalize emission of the shuffles.
10440  /// \param Action the action (if any) to be performed before the final
10441  /// application of the \p ExtMask mask.
10442  Value *
10443  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
10444           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
10445    IsFinalized = true;
10446    if (Action) {
10447      Value *Vec = InVectors.front();
10448      if (InVectors.size() == 2) {
10449        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
10450        InVectors.pop_back();
10451      } else {
10452        Vec = createShuffle(Vec, nullptr, CommonMask);
10453      }
10454      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10455        if (CommonMask[Idx] != PoisonMaskElem)
10456          CommonMask[Idx] = Idx;
10457      assert(VF > 0 &&
10458             "Expected vector length for the final value before action.");
10459      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
10460      if (VecVF < VF) {
10461        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
10462        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
10463        Vec = createShuffle(Vec, nullptr, ResizeMask);
10464      }
10465      Action(Vec, CommonMask);
10466      InVectors.front() = Vec;
10467    }
10468    if (!ExtMask.empty()) {
10469      if (CommonMask.empty()) {
10470        CommonMask.assign(ExtMask.begin(), ExtMask.end());
10471      } else {
10472        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
10473        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
10474          if (ExtMask[I] == PoisonMaskElem)
10475            continue;
10476          NewMask[I] = CommonMask[ExtMask[I]];
10477        }
10478        CommonMask.swap(NewMask);
10479      }
10480    }
10481    if (CommonMask.empty()) {
10482      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
10483      return InVectors.front();
10484    }
10485    if (InVectors.size() == 2)
10486      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10487    return createShuffle(InVectors.front(), nullptr, CommonMask);
10488  }
10489
10490  ~ShuffleInstructionBuilder() {
10491    assert((IsFinalized || CommonMask.empty()) &&
10492           "Shuffle construction must be finalized.");
10493  }
10494};
10495
10496Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
10497                                 bool PostponedPHIs) {
10498  ValueList &VL = E->getOperand(NodeIdx);
10499  if (E->State == TreeEntry::PossibleStridedVectorize &&
10500      !E->ReorderIndices.empty()) {
10501    SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
10502    reorderScalars(VL, Mask);
10503  }
10504  const unsigned VF = VL.size();
10505  InstructionsState S = getSameOpcode(VL, *TLI);
10506  // Special processing for a GEP bundle, which may include non-GEP values.
10507  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
10508    const auto *It =
10509        find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
10510    if (It != VL.end())
10511      S = getSameOpcode(*It, *TLI);
10512  }
10513  if (S.getOpcode()) {
10514    auto CheckSameVE = [&](const TreeEntry *VE) {
10515      return VE->isSame(VL) &&
10516             (any_of(VE->UserTreeIndices,
10517                     [E, NodeIdx](const EdgeInfo &EI) {
10518                       return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
10519                     }) ||
10520              any_of(VectorizableTree,
10521                     [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
10522                       return TE->isOperandGatherNode({E, NodeIdx}) &&
10523                              VE->isSame(TE->Scalars);
10524                     }));
10525    };
10526    TreeEntry *VE = getTreeEntry(S.OpValue);
10527    bool IsSameVE = VE && CheckSameVE(VE);
10528    if (!IsSameVE) {
10529      auto It = MultiNodeScalars.find(S.OpValue);
10530      if (It != MultiNodeScalars.end()) {
10531        auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
10532          return TE != VE && CheckSameVE(TE);
10533        });
10534        if (I != It->getSecond().end()) {
10535          VE = *I;
10536          IsSameVE = true;
10537        }
10538      }
10539    }
10540    if (IsSameVE) {
10541      auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
10542        ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
10543        ShuffleBuilder.add(V, Mask);
10544        return ShuffleBuilder.finalize(std::nullopt);
10545      };
10546      Value *V = vectorizeTree(VE, PostponedPHIs);
10547      if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
10548        if (!VE->ReuseShuffleIndices.empty()) {
10549          // Reshuffle to get only unique values.
10550          // If some of the scalars are duplicated in the vectorization
10551          // tree entry, we do not vectorize them but instead generate a
10552          // mask for the reuses. But if there are several users of the
10553          // same entry, they may have different vectorization factors.
10554          // This is especially important for PHI nodes. In this case, we
10555          // need to adapt the resulting instruction for the user
10556          // vectorization factor and have to reshuffle it again to take
10557          // only unique elements of the vector. Without this code the
10558          // function incorrectly returns a reduced vector instruction with
10559          // the same elements, not the unique ones.
10560
10561          // block:
10562          // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
10563          // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
10564          // ... (use %2)
10565          // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
10566          // br %block
10567          SmallVector<int> UniqueIdxs(VF, PoisonMaskElem);
10568          SmallSet<int, 4> UsedIdxs;
10569          int Pos = 0;
10570          for (int Idx : VE->ReuseShuffleIndices) {
10571            if (Idx != static_cast<int>(VF) && Idx != PoisonMaskElem &&
10572                UsedIdxs.insert(Idx).second)
10573              UniqueIdxs[Idx] = Pos;
10574            ++Pos;
10575          }
10576          assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
10577                                          "less than original vector size.");
10578          UniqueIdxs.append(VF - UsedIdxs.size(), PoisonMaskElem);
10579          V = FinalShuffle(V, UniqueIdxs);
10580        } else {
10581          assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
10582                 "Expected vectorization factor less "
10583                 "than original vector size.");
10584          SmallVector<int> UniformMask(VF, 0);
10585          std::iota(UniformMask.begin(), UniformMask.end(), 0);
10586          V = FinalShuffle(V, UniformMask);
10587        }
10588      }
10589      // Need to update the operand gather node, if the operand is actually not
10590      // a vectorized node but a buildvector/gather node, which matches one of
10591      // the vectorized nodes.
10592      if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
10593            return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
10594          }) == VE->UserTreeIndices.end()) {
10595        auto *It = find_if(
10596            VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10597              return TE->State == TreeEntry::NeedToGather &&
10598                     TE->UserTreeIndices.front().UserTE == E &&
10599                     TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
10600            });
10601        assert(It != VectorizableTree.end() && "Expected gather node operand.");
10602        (*It)->VectorizedValue = V;
10603      }
10604      return V;
10605    }
10606  }
10607
10608  // Find the corresponding gather entry and vectorize it.
10609  // This allows us to be more accurate with tree/graph transformations and
10610  // checks the correctness of the transformations in many cases.
10611  auto *I = find_if(VectorizableTree,
10612                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
10613                      return TE->isOperandGatherNode({E, NodeIdx});
10614                    });
10615  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
10616  assert(I->get()->UserTreeIndices.size() == 1 &&
10617         "Expected only single user for the gather node.");
10618  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
10619  return vectorizeTree(I->get(), PostponedPHIs);
10620}
10621
10622template <typename BVTy, typename ResTy, typename... Args>
10623ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
10624  assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
10625  unsigned VF = E->getVectorFactor();
10626
10627  bool NeedFreeze = false;
10628  SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
10629                                        E->ReuseShuffleIndices.end());
10630  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
10631  // Build a mask out of the reorder indices and reorder scalars per this
10632  // mask.
10633  SmallVector<int> ReorderMask;
10634  inversePermutation(E->ReorderIndices, ReorderMask);
10635  if (!ReorderMask.empty())
10636    reorderScalars(GatheredScalars, ReorderMask);
10637  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
10638                             unsigned I, unsigned SliceSize) {
10639    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
10640          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
10641        }))
10642      return false;
10643    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
10644    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
10645    if (UserTE->getNumOperands() != 2)
10646      return false;
10647    auto *It =
10648        find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
10649          return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
10650                   return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
10651                 }) != TE->UserTreeIndices.end();
10652        });
10653    if (It == VectorizableTree.end())
10654      return false;
10655    int Idx;
10656    if ((Mask.size() < InputVF &&
10657         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
10658         Idx == 0) ||
10659        (Mask.size() == InputVF &&
10660         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
10661      std::iota(std::next(Mask.begin(), I * SliceSize),
10662                std::next(Mask.begin(), (I + 1) * SliceSize), 0);
10663    } else {
10664      unsigned IVal =
10665          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
10666      std::fill(std::next(Mask.begin(), I * SliceSize),
10667                std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
10668    }
10669    return true;
10670  };
10671  BVTy ShuffleBuilder(Params...);
10672  ResTy Res = ResTy();
10673  SmallVector<int> Mask;
10674  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
10675  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
10676  Value *ExtractVecBase = nullptr;
10677  bool UseVecBaseAsInput = false;
10678  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
10679  SmallVector<SmallVector<const TreeEntry *>> Entries;
10680  Type *ScalarTy = GatheredScalars.front()->getType();
10681  auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
10682  unsigned NumParts = TTI->getNumberOfParts(VecTy);
10683  if (NumParts == 0 || NumParts >= GatheredScalars.size())
10684    NumParts = 1;
10685  if (!all_of(GatheredScalars, UndefValue::classof)) {
10686    // Check for gathered extracts.
10687    bool Resized = false;
10688    ExtractShuffles =
10689        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
10690    if (!ExtractShuffles.empty()) {
10691      SmallVector<const TreeEntry *> ExtractEntries;
10692      for (auto [Idx, I] : enumerate(ExtractMask)) {
10693        if (I == PoisonMaskElem)
10694          continue;
10695        if (const auto *TE = getTreeEntry(
10696                cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
10697          ExtractEntries.push_back(TE);
10698      }
10699      if (std::optional<ResTy> Delayed =
10700              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
10701        // Delay emission of gathers which are not ready yet.
10702        PostponedGathers.insert(E);
10703        // Postpone gather emission, will be emitted after the end of the
10704        // process to keep correct order.
10705        return *Delayed;
10706      }
10707      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
10708              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
10709        ExtractVecBase = VecBase;
10710        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
10711          if (VF == VecBaseTy->getNumElements() &&
10712              GatheredScalars.size() != VF) {
10713            Resized = true;
10714            GatheredScalars.append(VF - GatheredScalars.size(),
10715                                   PoisonValue::get(ScalarTy));
10716          }
10717      }
10718    }
10719    // Gather extracts only after we check for fully matched gathers.
10720    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
10721        E->isAltShuffle() ||
10722        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
10723        isSplat(E->Scalars) ||
10724        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
10725      GatherShuffles =
10726          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
10727    }
10728    if (!GatherShuffles.empty()) {
10729      if (std::optional<ResTy> Delayed =
10730              ShuffleBuilder.needToDelay(E, Entries)) {
10731        // Delay emission of gathers which are not ready yet.
10732        PostponedGathers.insert(E);
10733        // Postpone gather emission, will be emitted after the end of the
10734        // process to keep correct order.
10735        return *Delayed;
10736      }
10737      if (GatherShuffles.size() == 1 &&
10738          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
10739          Entries.front().front()->isSame(E->Scalars)) {
10740        // Perfect match in the graph, will reuse the previously vectorized
10741        // node. Cost is 0.
10742        LLVM_DEBUG(
10743            dbgs()
10744            << "SLP: perfect diamond match for gather bundle "
10745            << shortBundleName(E->Scalars) << ".\n");
10746        // Restore the mask for previous partially matched values.
10747        Mask.resize(E->Scalars.size());
10748        const TreeEntry *FrontTE = Entries.front().front();
10749        if (FrontTE->ReorderIndices.empty() &&
10750            ((FrontTE->ReuseShuffleIndices.empty() &&
10751              E->Scalars.size() == FrontTE->Scalars.size()) ||
10752             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
10753          std::iota(Mask.begin(), Mask.end(), 0);
10754        } else {
10755          for (auto [I, V] : enumerate(E->Scalars)) {
10756            if (isa<PoisonValue>(V)) {
10757              Mask[I] = PoisonMaskElem;
10758              continue;
10759            }
10760            Mask[I] = FrontTE->findLaneForValue(V);
10761          }
10762        }
10763        ShuffleBuilder.add(*FrontTE, Mask);
10764        Res = ShuffleBuilder.finalize(E->getCommonMask());
10765        return Res;
10766      }
10767      if (!Resized) {
10768        if (GatheredScalars.size() != VF &&
10769            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
10770              return any_of(TEs, [&](const TreeEntry *TE) {
10771                return TE->getVectorFactor() == VF;
10772              });
10773            }))
10774          GatheredScalars.append(VF - GatheredScalars.size(),
10775                                 PoisonValue::get(ScalarTy));
10776      }
10777      // Remove shuffled elements from list of gathers.
10778      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
10779        if (Mask[I] != PoisonMaskElem)
10780          GatheredScalars[I] = PoisonValue::get(ScalarTy);
10781      }
10782    }
10783  }
10784  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
10785                            SmallVectorImpl<int> &ReuseMask,
10786                            bool IsRootPoison) {
10787    // For splats we can emit broadcasts instead of gathers, so try to find
10788    // such sequences.
10789    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
10790                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
10791    Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
10792    SmallVector<int> UndefPos;
10793    DenseMap<Value *, unsigned> UniquePositions;
10794    // Gather unique non-const values and all constant values.
10795    // For repeated values, just shuffle them.
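    // For example (illustrative; %a and %b are hypothetical scalars), packing
    // Scalars = {%a, %a, 7, %b} in a non-splat case yields
    // Scalars = {%a, poison, 7, %b} and ReuseMask = {0, 0, 2, 3}.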
10796    int NumNonConsts = 0;
10797    int SinglePos = 0;
10798    for (auto [I, V] : enumerate(Scalars)) {
10799      if (isa<UndefValue>(V)) {
10800        if (!isa<PoisonValue>(V)) {
10801          ReuseMask[I] = I;
10802          UndefPos.push_back(I);
10803        }
10804        continue;
10805      }
10806      if (isConstant(V)) {
10807        ReuseMask[I] = I;
10808        continue;
10809      }
10810      ++NumNonConsts;
10811      SinglePos = I;
10812      Value *OrigV = V;
10813      Scalars[I] = PoisonValue::get(ScalarTy);
10814      if (IsSplat) {
10815        Scalars.front() = OrigV;
10816        ReuseMask[I] = 0;
10817      } else {
10818        const auto Res = UniquePositions.try_emplace(OrigV, I);
10819        Scalars[Res.first->second] = OrigV;
10820        ReuseMask[I] = Res.first->second;
10821      }
10822    }
10823    if (NumNonConsts == 1) {
10824      // Restore single insert element.
10825      if (IsSplat) {
10826        ReuseMask.assign(VF, PoisonMaskElem);
10827        std::swap(Scalars.front(), Scalars[SinglePos]);
10828        if (!UndefPos.empty() && UndefPos.front() == 0)
10829          Scalars.front() = UndefValue::get(ScalarTy);
10830      }
10831      ReuseMask[SinglePos] = SinglePos;
10832    } else if (!UndefPos.empty() && IsSplat) {
10833      // For undef values, try to replace them with the simple broadcast.
10834      // We can do it if the broadcasted value is guaranteed to be
10835      // non-poisonous, or by freezing the incoming scalar value first.
10836      auto *It = find_if(Scalars, [this, E](Value *V) {
10837        return !isa<UndefValue>(V) &&
10838               (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
10839                (E->UserTreeIndices.size() == 1 &&
10840                 any_of(V->uses(), [E](const Use &U) {
10841                   // Check if the value already used in the same operation in
10842                   // one of the nodes already.
10843                   return E->UserTreeIndices.front().EdgeIdx !=
10844                              U.getOperandNo() &&
10845                          is_contained(
10846                              E->UserTreeIndices.front().UserTE->Scalars,
10847                              U.getUser());
10848                 })));
10849      });
10850      if (It != Scalars.end()) {
10851        // Replace undefs by the non-poisoned scalars and emit broadcast.
10852        int Pos = std::distance(Scalars.begin(), It);
10853        for (int I : UndefPos) {
10854          // Set the undef position to the non-poisoned scalar.
10855          ReuseMask[I] = Pos;
10856          // Replace the undef with poison; in the mask it is already replaced
10857          // by the non-poisoned scalar.
10858          if (I != Pos)
10859            Scalars[I] = PoisonValue::get(ScalarTy);
10860        }
10861      } else {
10862        // Replace undefs with poisons, emit a broadcast and then emit a
10863        // freeze.
10864        for (int I : UndefPos) {
10865          ReuseMask[I] = PoisonMaskElem;
10866          if (isa<UndefValue>(Scalars[I]))
10867            Scalars[I] = PoisonValue::get(ScalarTy);
10868        }
10869        NeedFreeze = true;
10870      }
10871    }
10872  };
10873  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
10874    bool IsNonPoisoned = true;
10875    bool IsUsedInExpr = true;
10876    Value *Vec1 = nullptr;
10877    if (!ExtractShuffles.empty()) {
10878      // A gather of extractelements can be represented as just a shuffle of
10879      // the single/two vectors the scalars are extracted from.
10880      // Find the input vectors.
10881      Value *Vec2 = nullptr;
10882      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
10883        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
10884          ExtractMask[I] = PoisonMaskElem;
10885      }
10886      if (UseVecBaseAsInput) {
10887        Vec1 = ExtractVecBase;
10888      } else {
10889        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
10890          if (ExtractMask[I] == PoisonMaskElem)
10891            continue;
10892          if (isa<UndefValue>(E->Scalars[I]))
10893            continue;
10894          auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
10895          Value *VecOp = EI->getVectorOperand();
10896          if (const auto *TE = getTreeEntry(VecOp))
10897            if (TE->VectorizedValue)
10898              VecOp = TE->VectorizedValue;
10899          if (!Vec1) {
10900            Vec1 = VecOp;
10901          } else if (Vec1 != EI->getVectorOperand()) {
10902            assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
10903                   "Expected only 1 or 2 vectors shuffle.");
10904            Vec2 = VecOp;
10905          }
10906        }
10907      }
10908      if (Vec2) {
10909        IsUsedInExpr = false;
10910        IsNonPoisoned &=
10911            isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
10912        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
10913      } else if (Vec1) {
10914        IsUsedInExpr &= FindReusedSplat(
10915            ExtractMask,
10916            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
10917            ExtractMask.size());
10918        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
10919        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
10920      } else {
10921        IsUsedInExpr = false;
10922        ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
10923                               ScalarTy, GatheredScalars.size())),
10924                           ExtractMask, /*ForExtracts=*/true);
10925      }
10926    }
10927    if (!GatherShuffles.empty()) {
10928      unsigned SliceSize = E->Scalars.size() / NumParts;
10929      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
10930      for (const auto [I, TEs] : enumerate(Entries)) {
10931        if (TEs.empty()) {
10932          assert(!GatherShuffles[I] &&
10933                 "No shuffles with empty entries list expected.");
10934          continue;
10935        }
10936        assert((TEs.size() == 1 || TEs.size() == 2) &&
10937               "Expected shuffle of 1 or 2 entries.");
10938        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
10939        VecMask.assign(VecMask.size(), PoisonMaskElem);
10940        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
10941        if (TEs.size() == 1) {
10942          IsUsedInExpr &=
10943              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
10944          ShuffleBuilder.add(*TEs.front(), VecMask);
10945          if (TEs.front()->VectorizedValue)
10946            IsNonPoisoned &=
10947                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
10948        } else {
10949          IsUsedInExpr = false;
10950          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
10951          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
10952            IsNonPoisoned &=
10953                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
10954                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
10955        }
10956      }
10957    }
10958    // Try to figure out the best way to combine values: build a shuffle and
10959    // insert elements or just build several shuffles.
10960    // Insert non-constant scalars.
10961    SmallVector<Value *> NonConstants(GatheredScalars);
10962    int EMSz = ExtractMask.size();
10963    int MSz = Mask.size();
10964    // Try to build a constant vector and shuffle with it only if currently we
10965    // have a single permutation and more than 1 scalar constant.
10966    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
10967    bool IsIdentityShuffle =
10968        ((UseVecBaseAsInput ||
10969          all_of(ExtractShuffles,
10970                 [](const std::optional<TTI::ShuffleKind> &SK) {
10971                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
10972                          TTI::SK_PermuteSingleSrc;
10973                 })) &&
10974         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
10975         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
10976        (!GatherShuffles.empty() &&
10977         all_of(GatherShuffles,
10978                [](const std::optional<TTI::ShuffleKind> &SK) {
10979                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
10980                         TTI::SK_PermuteSingleSrc;
10981                }) &&
10982         none_of(Mask, [&](int I) { return I >= MSz; }) &&
10983         ShuffleVectorInst::isIdentityMask(Mask, MSz));
10984    bool EnoughConstsForShuffle =
10985        IsSingleShuffle &&
10986        (none_of(GatheredScalars,
10987                 [](Value *V) {
10988                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
10989                 }) ||
10990         any_of(GatheredScalars,
10991                [](Value *V) {
10992                  return isa<Constant>(V) && !isa<UndefValue>(V);
10993                })) &&
10994        (!IsIdentityShuffle ||
10995         (GatheredScalars.size() == 2 &&
10996          any_of(GatheredScalars,
10997                 [](Value *V) { return !isa<UndefValue>(V); })) ||
10998         count_if(GatheredScalars, [](Value *V) {
10999           return isa<Constant>(V) && !isa<PoisonValue>(V);
11000         }) > 1);
11001    // The NonConstants array contains just non-constant values; GatheredScalars
11002    // contains only constants to build the final vector and then shuffle.
11003    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
11004      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
11005        NonConstants[I] = PoisonValue::get(ScalarTy);
11006      else
11007        GatheredScalars[I] = PoisonValue::get(ScalarTy);
11008    }
11009    // Generate constants for final shuffle and build a mask for them.
11010    if (!all_of(GatheredScalars, PoisonValue::classof)) {
11011      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
11012      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
11013      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
11014      ShuffleBuilder.add(BV, BVMask);
11015    }
11016    if (all_of(NonConstants, [=](Value *V) {
11017          return isa<PoisonValue>(V) ||
11018                 (IsSingleShuffle && ((IsIdentityShuffle &&
11019                  IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
11020        }))
11021      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
11022    else
11023      Res = ShuffleBuilder.finalize(
11024          E->ReuseShuffleIndices, E->Scalars.size(),
11025          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
11026            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
11027            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
11028          });
11029  } else if (!allConstant(GatheredScalars)) {
11030    // Gather unique scalars and all constants.
11031    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
11032    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
11033    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
11034    ShuffleBuilder.add(BV, ReuseMask);
11035    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
11036  } else {
11037    // Gather all constants.
11038    SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
11039    for (auto [I, V] : enumerate(E->Scalars)) {
11040      if (!isa<PoisonValue>(V))
11041        Mask[I] = I;
11042    }
11043    Value *BV = ShuffleBuilder.gather(E->Scalars);
11044    ShuffleBuilder.add(BV, Mask);
11045    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
11046  }
11047
11048  if (NeedFreeze)
11049    Res = ShuffleBuilder.createFreeze(Res);
11050  return Res;
11051}
11052
11053Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
11054  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
11055                                                                *this);
11056}
11057
11058Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
11059  IRBuilder<>::InsertPointGuard Guard(Builder);
11060
11061  if (E->VectorizedValue &&
11062      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
11063       E->isAltShuffle())) {
11064    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
11065    return E->VectorizedValue;
11066  }
11067
11068  if (E->State == TreeEntry::NeedToGather) {
11069    // Set insert point for non-reduction initial nodes.
11070    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
11071      setInsertPointAfterBundle(E);
11072    Value *Vec = createBuildVector(E);
11073    E->VectorizedValue = Vec;
11074    return Vec;
11075  }
11076
11077  auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy,
11078                          bool IsSigned) {
11079    if (V->getType() != VecTy)
11080      V = Builder.CreateIntCast(V, VecTy, IsSigned);
11081    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
11082    if (E->getOpcode() == Instruction::Store) {
11083      ArrayRef<int> Mask =
11084          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
11085                   E->ReorderIndices.size());
11086      ShuffleBuilder.add(V, Mask);
11087    } else if (E->State == TreeEntry::PossibleStridedVectorize) {
11088      ShuffleBuilder.addOrdered(V, std::nullopt);
11089    } else {
11090      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
11091    }
11092    return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
11093  };
11094
11095  assert((E->State == TreeEntry::Vectorize ||
11096          E->State == TreeEntry::ScatterVectorize ||
11097          E->State == TreeEntry::PossibleStridedVectorize) &&
11098         "Unhandled state");
11099  unsigned ShuffleOrOp =
11100      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11101  Instruction *VL0 = E->getMainOp();
11102  Type *ScalarTy = VL0->getType();
11103  if (auto *Store = dyn_cast<StoreInst>(VL0))
11104    ScalarTy = Store->getValueOperand()->getType();
11105  else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
11106    ScalarTy = IE->getOperand(1)->getType();
11107  bool IsSigned = false;
11108  auto It = MinBWs.find(E);
11109  if (It != MinBWs.end()) {
11110    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
11111    IsSigned = It->second.second;
11112  }
11113  auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
11114  switch (ShuffleOrOp) {
11115    case Instruction::PHI: {
11116      assert((E->ReorderIndices.empty() ||
11117              E != VectorizableTree.front().get() ||
11118              !E->UserTreeIndices.empty()) &&
11119             "PHI reordering is free.");
11120      if (PostponedPHIs && E->VectorizedValue)
11121        return E->VectorizedValue;
11122      auto *PH = cast<PHINode>(VL0);
11123      Builder.SetInsertPoint(PH->getParent(),
11124                             PH->getParent()->getFirstNonPHIIt());
11125      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
11126      if (PostponedPHIs || !E->VectorizedValue) {
11127        PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
11128        E->PHI = NewPhi;
11129        Value *V = NewPhi;
11130
11131        // Adjust insertion point once all PHI's have been generated.
11132        Builder.SetInsertPoint(PH->getParent(),
11133                               PH->getParent()->getFirstInsertionPt());
11134        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
11135
11136        V = FinalShuffle(V, E, VecTy, IsSigned);
11137
11138        E->VectorizedValue = V;
11139        if (PostponedPHIs)
11140          return V;
11141      }
11142      PHINode *NewPhi = cast<PHINode>(E->PHI);
11143      // If phi node is fully emitted - exit.
11144      if (NewPhi->getNumIncomingValues() != 0)
11145        return NewPhi;
11146
11147      // PHINodes may have multiple entries from the same block. We want to
11148      // visit every block once.
11149      SmallPtrSet<BasicBlock *, 4> VisitedBBs;
11150
11151      for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
11152        ValueList Operands;
11153        BasicBlock *IBB = PH->getIncomingBlock(I);
11154
11155        // Stop emission if all incoming values are generated.
11156        if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
11157          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11158          return NewPhi;
11159        }
11160
11161        if (!VisitedBBs.insert(IBB).second) {
11162          NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
11163          continue;
11164        }
11165
11166        Builder.SetInsertPoint(IBB->getTerminator());
11167        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
11168        Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
11169        if (VecTy != Vec->getType()) {
11170          assert(MinBWs.contains(getOperandEntry(E, I)) &&
11171                 "Expected item in MinBWs.");
11172          Vec = Builder.CreateIntCast(Vec, VecTy, It->second.second);
11173        }
11174        NewPhi->addIncoming(Vec, IBB);
11175      }
11176
11177      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
11178             "Invalid number of incoming values");
11179      return NewPhi;
11180    }
11181
11182    case Instruction::ExtractElement: {
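      // All scalars in this entry are extractelements from the same source
      // vector, so the source (or its already-vectorized counterpart) can be
      // reused directly; only the final shuffle is emitted.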
11183      Value *V = E->getSingleOperand(0);
11184      if (const TreeEntry *TE = getTreeEntry(V))
11185        V = TE->VectorizedValue;
11186      setInsertPointAfterBundle(E);
11187      V = FinalShuffle(V, E, VecTy, IsSigned);
11188      E->VectorizedValue = V;
11189      return V;
11190    }
11191    case Instruction::ExtractValue: {
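      // The scalars are extractvalue instructions reading from a single wide
      // load, so the data is re-loaded as a vector from the same pointer with
      // the original alignment and then shuffled into the requested order.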
11192      auto *LI = cast<LoadInst>(E->getSingleOperand(0));
11193      Builder.SetInsertPoint(LI);
11194      Value *Ptr = LI->getPointerOperand();
11195      LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
11196      Value *NewV = propagateMetadata(V, E->Scalars);
11197      NewV = FinalShuffle(NewV, E, VecTy, IsSigned);
11198      E->VectorizedValue = NewV;
11199      return NewV;
11200    }
11201    case Instruction::InsertElement: {
11202      assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
11203      Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
11204      Value *V = vectorizeOperand(E, 1, PostponedPHIs);
11205      ArrayRef<Value *> Op = E->getOperand(1);
11206      Type *ScalarTy = Op.front()->getType();
11207      if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
11208        assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
11209        std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
11210        assert(Res.first > 0 && "Expected item in MinBWs.");
11211        V = Builder.CreateIntCast(
11212            V,
11213            FixedVectorType::get(
11214                ScalarTy,
11215                cast<FixedVectorType>(V->getType())->getNumElements()),
11216            Res.second);
11217      }
11218
11219      // Create InsertVector shuffle if necessary
11220      auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11221        return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11222      }));
11223      const unsigned NumElts =
11224          cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
11225      const unsigned NumScalars = E->Scalars.size();
11226
11227      unsigned Offset = *getInsertIndex(VL0);
11228      assert(Offset < NumElts && "Failed to find vector index offset");
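      // As an illustration, for a buildvector such as
      //   %i0 = insertelement <8 x float> poison, float %a, i64 2
      //   %i1 = insertelement <8 x float> %i0, float %b, i64 3
      // NumElts is 8, NumScalars is 2 and Offset is the insert index of VL0,
      // so the narrow vectorized value has to be widened and placed at the
      // right position within the destination vector.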
11229
11230      // Create shuffle to resize vector
11231      SmallVector<int> Mask;
11232      if (!E->ReorderIndices.empty()) {
11233        inversePermutation(E->ReorderIndices, Mask);
11234        Mask.append(NumElts - NumScalars, PoisonMaskElem);
11235      } else {
11236        Mask.assign(NumElts, PoisonMaskElem);
11237        std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
11238      }
      // Build the insert mask and check whether it forms an identity.
11240      bool IsIdentity = true;
11241      SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
11242      Mask.swap(PrevMask);
11243      for (unsigned I = 0; I < NumScalars; ++I) {
11244        Value *Scalar = E->Scalars[PrevMask[I]];
11245        unsigned InsertIdx = *getInsertIndex(Scalar);
11246        IsIdentity &= InsertIdx - Offset == I;
11247        Mask[InsertIdx - Offset] = I;
11248      }
11249      if (!IsIdentity || NumElts != NumScalars) {
11250        Value *V2 = nullptr;
11251        bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
11252        SmallVector<int> InsertMask(Mask);
11253        if (NumElts != NumScalars && Offset == 0) {
11254          // Follow all insert element instructions from the current buildvector
11255          // sequence.
11256          InsertElementInst *Ins = cast<InsertElementInst>(VL0);
11257          do {
11258            std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
11259            if (!InsertIdx)
11260              break;
11261            if (InsertMask[*InsertIdx] == PoisonMaskElem)
11262              InsertMask[*InsertIdx] = *InsertIdx;
11263            if (!Ins->hasOneUse())
11264              break;
11265            Ins = dyn_cast_or_null<InsertElementInst>(
11266                Ins->getUniqueUndroppableUser());
11267          } while (Ins);
11268          SmallBitVector UseMask =
11269              buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
11270          SmallBitVector IsFirstPoison =
11271              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
11272          SmallBitVector IsFirstUndef =
11273              isUndefVector(FirstInsert->getOperand(0), UseMask);
11274          if (!IsFirstPoison.all()) {
11275            unsigned Idx = 0;
11276            for (unsigned I = 0; I < NumElts; I++) {
11277              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
11278                  IsFirstUndef.test(I)) {
11279                if (IsVNonPoisonous) {
11280                  InsertMask[I] = I < NumScalars ? I : 0;
11281                  continue;
11282                }
11283                if (!V2)
11284                  V2 = UndefValue::get(V->getType());
11285                if (Idx >= NumScalars)
11286                  Idx = NumScalars - 1;
11287                InsertMask[I] = NumScalars + Idx;
11288                ++Idx;
11289              } else if (InsertMask[I] != PoisonMaskElem &&
11290                         Mask[I] == PoisonMaskElem) {
11291                InsertMask[I] = PoisonMaskElem;
11292              }
11293            }
11294          } else {
11295            InsertMask = Mask;
11296          }
11297        }
11298        if (!V2)
11299          V2 = PoisonValue::get(V->getType());
11300        V = Builder.CreateShuffleVector(V, V2, InsertMask);
11301        if (auto *I = dyn_cast<Instruction>(V)) {
11302          GatherShuffleExtractSeq.insert(I);
11303          CSEBlocks.insert(I->getParent());
11304        }
11305      }
11306
11307      SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11308      for (unsigned I = 0; I < NumElts; I++) {
11309        if (Mask[I] != PoisonMaskElem)
11310          InsertMask[Offset + I] = I;
11311      }
11312      SmallBitVector UseMask =
11313          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
11314      SmallBitVector IsFirstUndef =
11315          isUndefVector(FirstInsert->getOperand(0), UseMask);
11316      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
11317          NumElts != NumScalars) {
11318        if (IsFirstUndef.all()) {
11319          if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
11320            SmallBitVector IsFirstPoison =
11321                isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
11322            if (!IsFirstPoison.all()) {
11323              for (unsigned I = 0; I < NumElts; I++) {
11324                if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
11325                  InsertMask[I] = I + NumElts;
11326              }
11327            }
11328            V = Builder.CreateShuffleVector(
11329                V,
11330                IsFirstPoison.all() ? PoisonValue::get(V->getType())
11331                                    : FirstInsert->getOperand(0),
11332                InsertMask, cast<Instruction>(E->Scalars.back())->getName());
11333            if (auto *I = dyn_cast<Instruction>(V)) {
11334              GatherShuffleExtractSeq.insert(I);
11335              CSEBlocks.insert(I->getParent());
11336            }
11337          }
11338        } else {
11339          SmallBitVector IsFirstPoison =
11340              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
11341          for (unsigned I = 0; I < NumElts; I++) {
11342            if (InsertMask[I] == PoisonMaskElem)
11343              InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
11344            else
11345              InsertMask[I] += NumElts;
11346          }
11347          V = Builder.CreateShuffleVector(
11348              FirstInsert->getOperand(0), V, InsertMask,
11349              cast<Instruction>(E->Scalars.back())->getName());
11350          if (auto *I = dyn_cast<Instruction>(V)) {
11351            GatherShuffleExtractSeq.insert(I);
11352            CSEBlocks.insert(I->getParent());
11353          }
11354        }
11355      }
11356
11357      ++NumVectorInstructions;
11358      E->VectorizedValue = V;
11359      return V;
11360    }
11361    case Instruction::ZExt:
11362    case Instruction::SExt:
11363    case Instruction::FPToUI:
11364    case Instruction::FPToSI:
11365    case Instruction::FPExt:
11366    case Instruction::PtrToInt:
11367    case Instruction::IntToPtr:
11368    case Instruction::SIToFP:
11369    case Instruction::UIToFP:
11370    case Instruction::Trunc:
11371    case Instruction::FPTrunc:
11372    case Instruction::BitCast: {
11373      setInsertPointAfterBundle(E);
11374
11375      Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
11376      if (E->VectorizedValue) {
11377        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11378        return E->VectorizedValue;
11379      }
11380
11381      auto *CI = cast<CastInst>(VL0);
11382      Instruction::CastOps VecOpcode = CI->getOpcode();
11383      Type *SrcScalarTy = VL0->getOperand(0)->getType();
11384      auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11385      if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
11386          (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11387        // Check if the values are candidates to demote.
11388        unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
11389        if (SrcIt != MinBWs.end())
11390          SrcBWSz = SrcIt->second.first;
11391        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11392        if (BWSz == SrcBWSz) {
11393          VecOpcode = Instruction::BitCast;
11394        } else if (BWSz < SrcBWSz) {
11395          VecOpcode = Instruction::Trunc;
11396        } else if (It != MinBWs.end()) {
11397          assert(BWSz > SrcBWSz && "Invalid cast!");
11398          VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11399        }
11400      }
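      // For example, a zext from i8 to i32 whose result was demoted to i16 in
      // MinBWs is emitted as a zext to <N x i16>; if source and destination
      // end up with the same demoted width, the cast degenerates into a
      // bitcast and, since the operand already has the right type, no
      // instruction is emitted at all below.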
11401      Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
11402                     ? InVec
11403                     : Builder.CreateCast(VecOpcode, InVec, VecTy);
11404      V = FinalShuffle(V, E, VecTy, IsSigned);
11405
11406      E->VectorizedValue = V;
11407      ++NumVectorInstructions;
11408      return V;
11409    }
11410    case Instruction::FCmp:
11411    case Instruction::ICmp: {
11412      setInsertPointAfterBundle(E);
11413
11414      Value *L = vectorizeOperand(E, 0, PostponedPHIs);
11415      if (E->VectorizedValue) {
11416        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11417        return E->VectorizedValue;
11418      }
11419      Value *R = vectorizeOperand(E, 1, PostponedPHIs);
11420      if (E->VectorizedValue) {
11421        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11422        return E->VectorizedValue;
11423      }
11424      if (L->getType() != R->getType()) {
11425        assert((MinBWs.contains(getOperandEntry(E, 0)) ||
11426                MinBWs.contains(getOperandEntry(E, 1))) &&
11427               "Expected item in MinBWs.");
11428        L = Builder.CreateIntCast(L, VecTy, IsSigned);
11429        R = Builder.CreateIntCast(R, VecTy, IsSigned);
11430      }
11431
11432      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11433      Value *V = Builder.CreateCmp(P0, L, R);
11434      propagateIRFlags(V, E->Scalars, VL0);
      // Do not cast for cmps - a vector compare always produces <N x i1>, so
      // use the actual result type for the final shuffle instead.
11436      VecTy = cast<FixedVectorType>(V->getType());
11437      V = FinalShuffle(V, E, VecTy, IsSigned);
11438
11439      E->VectorizedValue = V;
11440      ++NumVectorInstructions;
11441      return V;
11442    }
11443    case Instruction::Select: {
11444      setInsertPointAfterBundle(E);
11445
11446      Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
11447      if (E->VectorizedValue) {
11448        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11449        return E->VectorizedValue;
11450      }
11451      Value *True = vectorizeOperand(E, 1, PostponedPHIs);
11452      if (E->VectorizedValue) {
11453        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11454        return E->VectorizedValue;
11455      }
11456      Value *False = vectorizeOperand(E, 2, PostponedPHIs);
11457      if (E->VectorizedValue) {
11458        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11459        return E->VectorizedValue;
11460      }
11461      if (True->getType() != False->getType()) {
11462        assert((MinBWs.contains(getOperandEntry(E, 1)) ||
11463                MinBWs.contains(getOperandEntry(E, 2))) &&
11464               "Expected item in MinBWs.");
11465        True = Builder.CreateIntCast(True, VecTy, IsSigned);
11466        False = Builder.CreateIntCast(False, VecTy, IsSigned);
11467      }
11468
11469      Value *V = Builder.CreateSelect(Cond, True, False);
11470      V = FinalShuffle(V, E, VecTy, IsSigned);
11471
11472      E->VectorizedValue = V;
11473      ++NumVectorInstructions;
11474      return V;
11475    }
11476    case Instruction::FNeg: {
11477      setInsertPointAfterBundle(E);
11478
11479      Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
11480
11481      if (E->VectorizedValue) {
11482        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11483        return E->VectorizedValue;
11484      }
11485
11486      Value *V = Builder.CreateUnOp(
11487          static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
11488      propagateIRFlags(V, E->Scalars, VL0);
11489      if (auto *I = dyn_cast<Instruction>(V))
11490        V = propagateMetadata(I, E->Scalars);
11491
11492      V = FinalShuffle(V, E, VecTy, IsSigned);
11493
11494      E->VectorizedValue = V;
11495      ++NumVectorInstructions;
11496
11497      return V;
11498    }
11499    case Instruction::Add:
11500    case Instruction::FAdd:
11501    case Instruction::Sub:
11502    case Instruction::FSub:
11503    case Instruction::Mul:
11504    case Instruction::FMul:
11505    case Instruction::UDiv:
11506    case Instruction::SDiv:
11507    case Instruction::FDiv:
11508    case Instruction::URem:
11509    case Instruction::SRem:
11510    case Instruction::FRem:
11511    case Instruction::Shl:
11512    case Instruction::LShr:
11513    case Instruction::AShr:
11514    case Instruction::And:
11515    case Instruction::Or:
11516    case Instruction::Xor: {
11517      setInsertPointAfterBundle(E);
11518
11519      Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
11520      if (E->VectorizedValue) {
11521        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11522        return E->VectorizedValue;
11523      }
11524      Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
11525      if (E->VectorizedValue) {
11526        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11527        return E->VectorizedValue;
11528      }
11529      if (LHS->getType() != RHS->getType()) {
11530        assert((MinBWs.contains(getOperandEntry(E, 0)) ||
11531                MinBWs.contains(getOperandEntry(E, 1))) &&
11532               "Expected item in MinBWs.");
11533        LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
11534        RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
11535      }
11536
11537      Value *V = Builder.CreateBinOp(
11538          static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
11539          RHS);
11540      propagateIRFlags(V, E->Scalars, VL0, !MinBWs.contains(E));
11541      if (auto *I = dyn_cast<Instruction>(V))
11542        V = propagateMetadata(I, E->Scalars);
11543
11544      V = FinalShuffle(V, E, VecTy, IsSigned);
11545
11546      E->VectorizedValue = V;
11547      ++NumVectorInstructions;
11548
11549      return V;
11550    }
11551    case Instruction::Load: {
11552      // Loads are inserted at the head of the tree because we don't want to
11553      // sink them all the way down past store instructions.
11554      setInsertPointAfterBundle(E);
11555
11556      LoadInst *LI = cast<LoadInst>(VL0);
11557      Instruction *NewLI;
11558      Value *PO = LI->getPointerOperand();
11559      if (E->State == TreeEntry::Vectorize) {
11560        NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
11561      } else {
11562        assert((E->State == TreeEntry::ScatterVectorize ||
11563                E->State == TreeEntry::PossibleStridedVectorize) &&
11564               "Unhandled state");
11565        Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
11566        if (E->VectorizedValue) {
11567          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11568          return E->VectorizedValue;
11569        }
11570        // Use the minimum alignment of the gathered loads.
11571        Align CommonAlignment = LI->getAlign();
11572        for (Value *V : E->Scalars)
11573          CommonAlignment =
11574              std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
11575        NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
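        // For example, with a factor of 4 and i32 elements this emits
        //   call <4 x i32> @llvm.masked.gather.v4i32.v4p0(
        //       <4 x ptr> %ptrs, i32 <align>, <4 x i1> <all true>,
        //       <4 x i32> poison)
        // i.e. an unconditional gather using the common scalar-load alignment.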
11576      }
11577      Value *V = propagateMetadata(NewLI, E->Scalars);
11578
11579      V = FinalShuffle(V, E, VecTy, IsSigned);
11580      E->VectorizedValue = V;
11581      ++NumVectorInstructions;
11582      return V;
11583    }
11584    case Instruction::Store: {
11585      auto *SI = cast<StoreInst>(VL0);
11586
11587      setInsertPointAfterBundle(E);
11588
11589      Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
11590      VecValue = FinalShuffle(VecValue, E, VecTy, IsSigned);
11591
11592      Value *Ptr = SI->getPointerOperand();
11593      StoreInst *ST =
11594          Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
11595
11596      Value *V = propagateMetadata(ST, E->Scalars);
11597
11598      E->VectorizedValue = V;
11599      ++NumVectorInstructions;
11600      return V;
11601    }
11602    case Instruction::GetElementPtr: {
11603      auto *GEP0 = cast<GetElementPtrInst>(VL0);
11604      setInsertPointAfterBundle(E);
11605
11606      Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
11607      if (E->VectorizedValue) {
11608        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11609        return E->VectorizedValue;
11610      }
11611
11612      SmallVector<Value *> OpVecs;
11613      for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
11614        Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
11615        if (E->VectorizedValue) {
11616          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11617          return E->VectorizedValue;
11618        }
11619        OpVecs.push_back(OpVec);
11620      }
11621
11622      Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
11623      if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
11624        SmallVector<Value *> GEPs;
11625        for (Value *V : E->Scalars) {
11626          if (isa<GetElementPtrInst>(V))
11627            GEPs.push_back(V);
11628        }
11629        V = propagateMetadata(I, GEPs);
11630      }
11631
11632      V = FinalShuffle(V, E, VecTy, IsSigned);
11633
11634      E->VectorizedValue = V;
11635      ++NumVectorInstructions;
11636
11637      return V;
11638    }
11639    case Instruction::Call: {
11640      CallInst *CI = cast<CallInst>(VL0);
11641      setInsertPointAfterBundle(E);
11642
11643      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11644
11645      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
11646      bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
11647                          VecCallCosts.first <= VecCallCosts.second;
11648
11649      Value *ScalarArg = nullptr;
11650      SmallVector<Value *> OpVecs;
11651      SmallVector<Type *, 2> TysForDecl;
11652      // Add return type if intrinsic is overloaded on it.
11653      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
11654        TysForDecl.push_back(
11655            FixedVectorType::get(CI->getType(), E->Scalars.size()));
11656      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
11657        ValueList OpVL;
11658        // Some intrinsics have scalar arguments. This argument should not be
11659        // vectorized.
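        // For example, the i32 exponent operand of llvm.powi stays scalar and
        // is passed through unchanged while the other operands are vectorized.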
11660        if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
11661          CallInst *CEI = cast<CallInst>(VL0);
11662          ScalarArg = CEI->getArgOperand(I);
11663          OpVecs.push_back(CEI->getArgOperand(I));
11664          if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
11665            TysForDecl.push_back(ScalarArg->getType());
11666          continue;
11667        }
11668
11669        Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
11670        if (E->VectorizedValue) {
11671          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11672          return E->VectorizedValue;
11673        }
11674        LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
11675        OpVecs.push_back(OpVec);
11676        if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
11677          TysForDecl.push_back(OpVec->getType());
11678      }
11679
11680      Function *CF;
11681      if (!UseIntrinsic) {
11682        VFShape Shape =
11683            VFShape::get(CI->getFunctionType(),
11684                         ElementCount::getFixed(
11685                             static_cast<unsigned>(VecTy->getNumElements())),
11686                         false /*HasGlobalPred*/);
11687        CF = VFDatabase(*CI).getVectorizedFunction(Shape);
11688      } else {
11689        CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
11690      }
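      // CF is now either a vector library function found via VFDatabase (e.g.
      // from the TLI-injected vector-function-abi-variant mappings) or the
      // declaration of the matching vector intrinsic, depending on the cost
      // comparison above.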
11691
11692      SmallVector<OperandBundleDef, 1> OpBundles;
11693      CI->getOperandBundlesAsDefs(OpBundles);
11694      Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
11695
11696      propagateIRFlags(V, E->Scalars, VL0);
11697      V = FinalShuffle(V, E, VecTy, IsSigned);
11698
11699      E->VectorizedValue = V;
11700      ++NumVectorInstructions;
11701      return V;
11702    }
11703    case Instruction::ShuffleVector: {
11704      assert(E->isAltShuffle() &&
11705             ((Instruction::isBinaryOp(E->getOpcode()) &&
11706               Instruction::isBinaryOp(E->getAltOpcode())) ||
11707              (Instruction::isCast(E->getOpcode()) &&
11708               Instruction::isCast(E->getAltOpcode())) ||
11709              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11710             "Invalid Shuffle Vector Operand");
11711
11712      Value *LHS = nullptr, *RHS = nullptr;
11713      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
11714        setInsertPointAfterBundle(E);
11715        LHS = vectorizeOperand(E, 0, PostponedPHIs);
11716        if (E->VectorizedValue) {
11717          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11718          return E->VectorizedValue;
11719        }
11720        RHS = vectorizeOperand(E, 1, PostponedPHIs);
11721      } else {
11722        setInsertPointAfterBundle(E);
11723        LHS = vectorizeOperand(E, 0, PostponedPHIs);
11724      }
11725      if (E->VectorizedValue) {
11726        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
11727        return E->VectorizedValue;
11728      }
11729      if (LHS && RHS && LHS->getType() != RHS->getType()) {
11730        assert((MinBWs.contains(getOperandEntry(E, 0)) ||
11731                MinBWs.contains(getOperandEntry(E, 1))) &&
11732               "Expected item in MinBWs.");
11733        LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
11734        RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
11735      }
11736
11737      Value *V0, *V1;
11738      if (Instruction::isBinaryOp(E->getOpcode())) {
11739        V0 = Builder.CreateBinOp(
11740            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
11741        V1 = Builder.CreateBinOp(
11742            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
11743      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11744        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
11745        auto *AltCI = cast<CmpInst>(E->getAltOp());
11746        CmpInst::Predicate AltPred = AltCI->getPredicate();
11747        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
11748      } else {
11749        V0 = Builder.CreateCast(
11750            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
11751        V1 = Builder.CreateCast(
11752            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
11753      }
11754      // Add V0 and V1 to later analysis to try to find and remove matching
11755      // instruction, if any.
11756      for (Value *V : {V0, V1}) {
11757        if (auto *I = dyn_cast<Instruction>(V)) {
11758          GatherShuffleExtractSeq.insert(I);
11759          CSEBlocks.insert(I->getParent());
11760        }
11761      }
11762
11763      // Create shuffle to take alternate operations from the vector.
11764      // Also, gather up main and alt scalar ops to propagate IR flags to
11765      // each vector operation.
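      // For instance, for the alternating sequence
      //   c0 = a0 + b0; c1 = a1 - b1; c2 = a2 + b2; c3 = a3 - b3;
      // a full vector add (V0) and a full vector sub (V1) are created and the
      // mask <0, 5, 2, 7> takes the even lanes from V0 and the odd lanes
      // from V1.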
11766      ValueList OpScalars, AltScalars;
11767      SmallVector<int> Mask;
11768      E->buildAltOpShuffleMask(
11769          [E, this](Instruction *I) {
11770            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
11771            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11772                                          *TLI);
11773          },
11774          Mask, &OpScalars, &AltScalars);
11775
11776      propagateIRFlags(V0, OpScalars);
11777      propagateIRFlags(V1, AltScalars);
11778
11779      Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
11780      if (auto *I = dyn_cast<Instruction>(V)) {
11781        V = propagateMetadata(I, E->Scalars);
11782        GatherShuffleExtractSeq.insert(I);
11783        CSEBlocks.insert(I->getParent());
11784      }
11785
11786      if (V->getType() != VecTy && !isa<CmpInst>(VL0))
11787        V = Builder.CreateIntCast(
11788            V, FixedVectorType::get(ScalarTy, E->getVectorFactor()), IsSigned);
11789      E->VectorizedValue = V;
11790      ++NumVectorInstructions;
11791
11792      return V;
11793    }
11794    default:
11795      llvm_unreachable("unknown inst");
11796  }
11797  return nullptr;
11798}
11799
11800Value *BoUpSLP::vectorizeTree() {
11801  ExtraValueToDebugLocsMap ExternallyUsedValues;
11802  SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
11803  return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
11804}
11805
11806namespace {
11807/// Data type for handling buildvector sequences with the reused scalars from
11808/// other tree entries.
11809struct ShuffledInsertData {
11810  /// List of insertelements to be replaced by shuffles.
11811  SmallVector<InsertElementInst *> InsertElements;
11812  /// The parent vectors and shuffle mask for the given list of inserts.
11813  MapVector<Value *, SmallVector<int>> ValueMasks;
11814};
11815} // namespace
11816
11817Value *BoUpSLP::vectorizeTree(
11818    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
11819    SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
11820    Instruction *ReductionRoot) {
11821  // All blocks must be scheduled before any instructions are inserted.
11822  for (auto &BSIter : BlocksSchedules) {
11823    scheduleBlock(BSIter.second.get());
11824  }
  // Clear the Entry-to-LastInstruction table; it can be invalidated by
  // scheduling and needs to be rebuilt.
11827  EntryToLastInstruction.clear();
11828
11829  if (ReductionRoot)
11830    Builder.SetInsertPoint(ReductionRoot->getParent(),
11831                           ReductionRoot->getIterator());
11832  else
11833    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
11834
11835  // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
11836  (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
11837  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
11838    if (TE->State == TreeEntry::Vectorize &&
11839        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
11840        TE->VectorizedValue)
11841      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the
  // temporarily emitted allocas with the actual vector instructions.
11844  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
11845  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
11846  for (const TreeEntry *E : PostponedNodes) {
11847    auto *TE = const_cast<TreeEntry *>(E);
11848    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
11849      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
11850              TE->UserTreeIndices.front().EdgeIdx)))
11851        // Found gather node which is absolutely the same as one of the
11852        // vectorized nodes. It may happen after reordering.
11853        continue;
11854    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
11855    TE->VectorizedValue = nullptr;
11856    auto *UserI =
11857        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were some
    // unresolved dependencies at the moment the stub instruction was emitted.
    // If any of these dependencies turns out to be an operand of another PHI
    // coming from this same block, the position of the stub instruction
    // becomes invalid, because the source vector that is supposed to feed this
    // gather node was inserted at the end of the block (after the stub
    // instruction). So we need to adjust the insertion point again, towards
    // the end of the block.
11866    if (isa<PHINode>(UserI)) {
11867      // Insert before all users.
11868      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
11869      for (User *U : PrevVec->users()) {
11870        if (U == UserI)
11871          continue;
11872        auto *UI = dyn_cast<Instruction>(U);
11873        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
11874          continue;
11875        if (UI->comesBefore(InsertPt))
11876          InsertPt = UI;
11877      }
11878      Builder.SetInsertPoint(InsertPt);
11879    } else {
11880      Builder.SetInsertPoint(PrevVec);
11881    }
11882    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
11883    Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
11884    PrevVec->replaceAllUsesWith(Vec);
11885    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
11886    // Replace the stub vector node, if it was used before for one of the
11887    // buildvector nodes already.
11888    auto It = PostponedValues.find(PrevVec);
11889    if (It != PostponedValues.end()) {
11890      for (TreeEntry *VTE : It->getSecond())
11891        VTE->VectorizedValue = Vec;
11892    }
11893    eraseInstruction(PrevVec);
11894  }
11895
  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
                    << " values.\n");
11898
11899  SmallVector<ShuffledInsertData> ShuffledInserts;
11900  // Maps vector instruction to original insertelement instruction
11901  DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
11902  // Maps extract Scalar to the corresponding extractelement instruction in the
11903  // basic block. Only one extractelement per block should be emitted.
11904  DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
11905  SmallDenseSet<Value *, 4> UsedInserts;
11906  DenseMap<Value *, Value *> VectorCasts;
11907  SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
11908  // Extract all of the elements with the external uses.
11909  for (const auto &ExternalUse : ExternalUses) {
11910    Value *Scalar = ExternalUse.Scalar;
11911    llvm::User *User = ExternalUse.User;
11912
    // Skip users that we have already RAUWed. This happens when one
    // instruction has multiple uses of the same value.
11915    if (User && !is_contained(Scalar->users(), User))
11916      continue;
11917    TreeEntry *E = getTreeEntry(Scalar);
11918    assert(E && "Invalid scalar");
11919    assert(E->State != TreeEntry::NeedToGather &&
11920           "Extracting from a gather list");
11921    // Non-instruction pointers are not deleted, just skip them.
11922    if (E->getOpcode() == Instruction::GetElementPtr &&
11923        !isa<GetElementPtrInst>(Scalar))
11924      continue;
11925
11926    Value *Vec = E->VectorizedValue;
11927    assert(Vec && "Can't find vectorizable value");
11928
11929    Value *Lane = Builder.getInt32(ExternalUse.Lane);
11930    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
11931      if (Scalar->getType() != Vec->getType()) {
11932        Value *Ex = nullptr;
11933        auto It = ScalarToEEs.find(Scalar);
11934        if (It != ScalarToEEs.end()) {
          // No need to emit multiple extracts - just reuse the single extract
          // already emitted in the current block, moving it earlier if needed.
11937          auto EEIt = It->second.find(Builder.GetInsertBlock());
11938          if (EEIt != It->second.end()) {
11939            Instruction *I = EEIt->second;
11940            if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
11941                Builder.GetInsertPoint()->comesBefore(I))
11942              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
11943                            Builder.GetInsertPoint());
11944            Ex = I;
11945          }
11946        }
11947        if (!Ex) {
11948          // "Reuse" the existing extract to improve final codegen.
11949          if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
11950            Value *V = ES->getVectorOperand();
11951            if (const TreeEntry *ETE = getTreeEntry(V))
11952              V = ETE->VectorizedValue;
11953            Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
11954          } else {
11955            Ex = Builder.CreateExtractElement(Vec, Lane);
11956          }
11957          if (auto *I = dyn_cast<Instruction>(Ex))
11958            ScalarToEEs[Scalar].try_emplace(Builder.GetInsertBlock(), I);
11959        }
        // The extract created above may fold to a constant (e.g. when the
        // vector operand is a constant), so Ex is not necessarily an
        // instruction.
11962        if (auto *ExI = dyn_cast<Instruction>(Ex)) {
11963          GatherShuffleExtractSeq.insert(ExI);
11964          CSEBlocks.insert(ExI->getParent());
11965        }
        // If necessary, sign-extend or zero-extend the extracted value back to
        // the original (wider) scalar type.
11968        if (Scalar->getType() != Ex->getType())
11969          return Builder.CreateIntCast(Ex, Scalar->getType(),
11970                                       MinBWs.find(E)->second.second);
11971        return Ex;
11972      }
11973      assert(isa<FixedVectorType>(Scalar->getType()) &&
11974             isa<InsertElementInst>(Scalar) &&
11975             "In-tree scalar of vector type is not insertelement?");
11976      auto *IE = cast<InsertElementInst>(Scalar);
11977      VectorToInsertElement.try_emplace(Vec, IE);
11978      return Vec;
11979    };
    // If User == nullptr, the Scalar either remains as a scalar in the
    // vectorized instructions or is used as an extra argument. Generate an
    // ExtractElement instruction and update the record for this scalar in
    // ExternallyUsedValues.
11983    if (!User) {
11984      if (!ScalarsWithNullptrUser.insert(Scalar).second)
11985        continue;
11986      assert((ExternallyUsedValues.count(Scalar) ||
11987              any_of(Scalar->users(),
11988                     [&](llvm::User *U) {
11989                       TreeEntry *UseEntry = getTreeEntry(U);
11990                       return UseEntry &&
11991                              UseEntry->State == TreeEntry::Vectorize &&
11992                              E->State == TreeEntry::Vectorize &&
11993                              doesInTreeUserNeedToExtract(
11994                                  Scalar,
11995                                  cast<Instruction>(UseEntry->Scalars.front()),
11996                                  TLI);
11997                     })) &&
11998             "Scalar with nullptr User must be registered in "
11999             "ExternallyUsedValues map or remain as scalar in vectorized "
12000             "instructions");
12001      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
12002        if (auto *PHI = dyn_cast<PHINode>(VecI))
12003          Builder.SetInsertPoint(PHI->getParent(),
12004                                 PHI->getParent()->getFirstNonPHIIt());
12005        else
12006          Builder.SetInsertPoint(VecI->getParent(),
12007                                 std::next(VecI->getIterator()));
12008      } else {
12009        Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
12010      }
12011      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12012      // Required to update internally referenced instructions.
12013      Scalar->replaceAllUsesWith(NewInst);
12014      ReplacedExternals.emplace_back(Scalar, NewInst);
12015      continue;
12016    }
12017
12018    if (auto *VU = dyn_cast<InsertElementInst>(User)) {
12019      // Skip if the scalar is another vector op or Vec is not an instruction.
12020      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
12021        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
12022          if (!UsedInserts.insert(VU).second)
12023            continue;
          // If the root was truncated (demoted in MinBWs), cast the vector
          // back to the type expected by the insertelement user.
12025          auto BWIt = MinBWs.find(E);
12026          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
12027            auto VecIt = VectorCasts.find(Scalar);
12028            if (VecIt == VectorCasts.end()) {
12029              IRBuilder<>::InsertPointGuard Guard(Builder);
12030              if (auto *IVec = dyn_cast<Instruction>(Vec))
12031                Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
12032              Vec = Builder.CreateIntCast(
12033                  Vec,
12034                  FixedVectorType::get(
12035                      cast<VectorType>(VU->getType())->getElementType(),
12036                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
12037                  BWIt->second.second);
12038              VectorCasts.try_emplace(Scalar, Vec);
12039            } else {
12040              Vec = VecIt->second;
12041            }
12042          }
12043
12044          std::optional<unsigned> InsertIdx = getInsertIndex(VU);
12045          if (InsertIdx) {
12046            auto *It =
12047                find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
12048                  // Checks if 2 insertelements are from the same buildvector.
12049                  InsertElementInst *VecInsert = Data.InsertElements.front();
12050                  return areTwoInsertFromSameBuildVector(
12051                      VU, VecInsert,
12052                      [](InsertElementInst *II) { return II->getOperand(0); });
12053                });
12054            unsigned Idx = *InsertIdx;
12055            if (It == ShuffledInserts.end()) {
12056              (void)ShuffledInserts.emplace_back();
12057              It = std::next(ShuffledInserts.begin(),
12058                             ShuffledInserts.size() - 1);
12059              SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
12060              if (Mask.empty())
12061                Mask.assign(FTy->getNumElements(), PoisonMaskElem);
              // Find the buildvector sequence that was vectorized in the tree,
              // if any.
12063              Value *Base = VU;
12064              while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
12065                if (IEBase != User &&
12066                    (!IEBase->hasOneUse() ||
12067                     getInsertIndex(IEBase).value_or(Idx) == Idx))
12068                  break;
12069                // Build the mask for the vectorized insertelement instructions.
12070                if (const TreeEntry *E = getTreeEntry(IEBase)) {
12071                  do {
12072                    IEBase = cast<InsertElementInst>(Base);
12073                    int IEIdx = *getInsertIndex(IEBase);
12074                    assert(Mask[Idx] == PoisonMaskElem &&
12075                           "InsertElementInstruction used already.");
12076                    Mask[IEIdx] = IEIdx;
12077                    Base = IEBase->getOperand(0);
12078                  } while (E == getTreeEntry(Base));
12079                  break;
12080                }
12081                Base = cast<InsertElementInst>(Base)->getOperand(0);
                // After vectorization the def-use chain has changed, so we
                // need to look through the original insertelement instructions
                // if they have been replaced by vector instructions.
12085                auto It = VectorToInsertElement.find(Base);
12086                if (It != VectorToInsertElement.end())
12087                  Base = It->second;
12088              }
12089            }
12090            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
12091            if (Mask.empty())
12092              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12093            Mask[Idx] = ExternalUse.Lane;
12094            It->InsertElements.push_back(cast<InsertElementInst>(User));
12095            continue;
12096          }
12097        }
12098      }
12099    }
12100
12101    // Generate extracts for out-of-tree users.
12102    // Find the insertion point for the extractelement lane.
12103    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
12104      if (PHINode *PH = dyn_cast<PHINode>(User)) {
12105        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12106          if (PH->getIncomingValue(I) == Scalar) {
12107            Instruction *IncomingTerminator =
12108                PH->getIncomingBlock(I)->getTerminator();
12109            if (isa<CatchSwitchInst>(IncomingTerminator)) {
12110              Builder.SetInsertPoint(VecI->getParent(),
12111                                     std::next(VecI->getIterator()));
12112            } else {
12113              Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
12114            }
12115            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12116            PH->setOperand(I, NewInst);
12117          }
12118        }
12119      } else {
12120        Builder.SetInsertPoint(cast<Instruction>(User));
12121        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12122        User->replaceUsesOfWith(Scalar, NewInst);
12123      }
12124    } else {
12125      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
12126      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
12127      User->replaceUsesOfWith(Scalar, NewInst);
12128    }
12129
12130    LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
12131  }
12132
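  // CreateShuffle emits a (possibly two-source) shuffle for a combined mask:
  // indices below the width of V1 select lanes from V1, indices greater than
  // or equal to it select lanes from V2, mirroring shufflevector's two-operand
  // numbering.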
12133  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
12134    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12135    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12136    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
12137    for (int I = 0, E = Mask.size(); I < E; ++I) {
12138      if (Mask[I] < VF)
12139        CombinedMask1[I] = Mask[I];
12140      else
12141        CombinedMask2[I] = Mask[I] - VF;
12142    }
12143    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
12144    ShuffleBuilder.add(V1, CombinedMask1);
12145    if (V2)
12146      ShuffleBuilder.add(V2, CombinedMask2);
12147    return ShuffleBuilder.finalize(std::nullopt);
12148  };
12149
12150  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
12151                                       bool ForSingleMask) {
12152    unsigned VF = Mask.size();
12153    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
12154    if (VF != VecVF) {
12155      if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
12156        Vec = CreateShuffle(Vec, nullptr, Mask);
12157        return std::make_pair(Vec, true);
12158      }
12159      if (!ForSingleMask) {
12160        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12161        for (unsigned I = 0; I < VF; ++I) {
12162          if (Mask[I] != PoisonMaskElem)
12163            ResizeMask[Mask[I]] = Mask[I];
12164        }
12165        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
12166      }
12167    }
12168
12169    return std::make_pair(Vec, false);
12170  };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
12173  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
12174    // Find the first and the last instruction in the list of insertelements.
12175    sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
12176    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
12177    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
12178    Builder.SetInsertPoint(LastInsert);
12179    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
12180    Value *NewInst = performExtractsShuffleAction<Value>(
12181        MutableArrayRef(Vector.data(), Vector.size()),
12182        FirstInsert->getOperand(0),
12183        [](Value *Vec) {
12184          return cast<VectorType>(Vec->getType())
12185              ->getElementCount()
12186              .getKnownMinValue();
12187        },
12188        ResizeToVF,
12189        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
12190                                      ArrayRef<Value *> Vals) {
12191          assert((Vals.size() == 1 || Vals.size() == 2) &&
12192                 "Expected exactly 1 or 2 input values.");
12193          if (Vals.size() == 1) {
12194            // Do not create shuffle if the mask is a simple identity
12195            // non-resizing mask.
12196            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
12197                                   ->getNumElements() ||
12198                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
12199              return CreateShuffle(Vals.front(), nullptr, Mask);
12200            return Vals.front();
12201          }
12202          return CreateShuffle(Vals.front() ? Vals.front()
12203                                            : FirstInsert->getOperand(0),
12204                               Vals.back(), Mask);
12205        });
12206    auto It = ShuffledInserts[I].InsertElements.rbegin();
12207    // Rebuild buildvector chain.
12208    InsertElementInst *II = nullptr;
12209    if (It != ShuffledInserts[I].InsertElements.rend())
12210      II = *It;
12211    SmallVector<Instruction *> Inserts;
12212    while (It != ShuffledInserts[I].InsertElements.rend()) {
12213      assert(II && "Must be an insertelement instruction.");
12214      if (*It == II)
12215        ++It;
12216      else
12217        Inserts.push_back(cast<Instruction>(II));
12218      II = dyn_cast<InsertElementInst>(II->getOperand(0));
12219    }
12220    for (Instruction *II : reverse(Inserts)) {
12221      II->replaceUsesOfWith(II->getOperand(0), NewInst);
12222      if (auto *NewI = dyn_cast<Instruction>(NewInst))
12223        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
12224          II->moveAfter(NewI);
12225      NewInst = II;
12226    }
12227    LastInsert->replaceAllUsesWith(NewInst);
12228    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
12229      IE->replaceUsesOfWith(IE->getOperand(0),
12230                            PoisonValue::get(IE->getOperand(0)->getType()));
12231      IE->replaceUsesOfWith(IE->getOperand(1),
12232                            PoisonValue::get(IE->getOperand(1)->getType()));
12233      eraseInstruction(IE);
12234    }
12235    CSEBlocks.insert(LastInsert->getParent());
12236  }
12237
12238  SmallVector<Instruction *> RemovedInsts;
12239  // For each vectorized value:
12240  for (auto &TEPtr : VectorizableTree) {
12241    TreeEntry *Entry = TEPtr.get();
12242
12243    // No need to handle users of gathered values.
12244    if (Entry->State == TreeEntry::NeedToGather)
12245      continue;
12246
12247    assert(Entry->VectorizedValue && "Can't find vectorizable value");
12248
12249    // For each lane:
12250    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
12251      Value *Scalar = Entry->Scalars[Lane];
12252
12253      if (Entry->getOpcode() == Instruction::GetElementPtr &&
12254          !isa<GetElementPtrInst>(Scalar))
12255        continue;
12256#ifndef NDEBUG
12257      Type *Ty = Scalar->getType();
12258      if (!Ty->isVoidTy()) {
12259        for (User *U : Scalar->users()) {
12260          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
12261
12262          // It is legal to delete users in the ignorelist.
12263          assert((getTreeEntry(U) ||
12264                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
12265                  (isa_and_nonnull<Instruction>(U) &&
12266                   isDeleted(cast<Instruction>(U)))) &&
12267                 "Deleting out-of-tree value");
12268        }
12269      }
12270#endif
12271      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
12272      eraseInstruction(cast<Instruction>(Scalar));
12273      // Retain to-be-deleted instructions for some debug-info
12274      // bookkeeping. NOTE: eraseInstruction only marks the instruction for
12275      // deletion - instructions are not deleted until later.
12276      RemovedInsts.push_back(cast<Instruction>(Scalar));
12277    }
12278  }
12279
12280  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
12281  // new vector instruction.
12282  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
12283    V->mergeDIAssignID(RemovedInsts);
12284
12285  Builder.ClearInsertionPoint();
12286  InstrElementSize.clear();
12287
12288  return VectorizableTree[0]->VectorizedValue;
12289}
12290
12291void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequence instructions.\n");
12294  // LICM InsertElementInst sequences.
12295  for (Instruction *I : GatherShuffleExtractSeq) {
12296    if (isDeleted(I))
12297      continue;
12298
12299    // Check if this block is inside a loop.
12300    Loop *L = LI->getLoopFor(I->getParent());
12301    if (!L)
12302      continue;
12303
12304    // Check if it has a preheader.
12305    BasicBlock *PreHeader = L->getLoopPreheader();
12306    if (!PreHeader)
12307      continue;
12308
12309    // If the vector or the element that we insert into it are
12310    // instructions that are defined in this basic block then we can't
12311    // hoist this instruction.
12312    if (any_of(I->operands(), [L](Value *V) {
12313          auto *OpI = dyn_cast<Instruction>(V);
12314          return OpI && L->contains(OpI);
12315        }))
12316      continue;
12317
12318    // We can hoist this instruction. Move it to the pre-header.
12319    I->moveBefore(PreHeader->getTerminator());
12320    CSEBlocks.insert(PreHeader);
12321  }
12322
12323  // Make a list of all reachable blocks in our CSE queue.
12324  SmallVector<const DomTreeNode *, 8> CSEWorkList;
12325  CSEWorkList.reserve(CSEBlocks.size());
12326  for (BasicBlock *BB : CSEBlocks)
12327    if (DomTreeNode *N = DT->getNode(BB)) {
12328      assert(DT->isReachableFromEntry(N));
12329      CSEWorkList.push_back(N);
12330    }
12331
12332  // Sort blocks by domination. This ensures we visit a block after all blocks
12333  // dominating it are visited.
12334  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
12335    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
12336           "Different nodes should have different DFS numbers");
12337    return A->getDFSNumIn() < B->getDFSNumIn();
12338  });
12339
  // Less defined shuffles can be replaced by their more defined copies.
  // Of two shuffles with the same vector operands, one is less defined than
  // the other if each of its mask indices is either undef or matches the
  // corresponding index of the other mask. E.g.
  // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
  // poison, <0, 0, 0, 0>.
12345  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
12346                                           SmallVectorImpl<int> &NewMask) {
12347    if (I1->getType() != I2->getType())
12348      return false;
12349    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
12350    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
12351    if (!SI1 || !SI2)
12352      return I1->isIdenticalTo(I2);
12353    if (SI1->isIdenticalTo(SI2))
12354      return true;
12355    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
12356      if (SI1->getOperand(I) != SI2->getOperand(I))
12357        return false;
12358    // Check if the second instruction is more defined than the first one.
12359    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
12360    ArrayRef<int> SM1 = SI1->getShuffleMask();
12361    // Count trailing undefs in the mask to check the final number of used
12362    // registers.
12363    unsigned LastUndefsCnt = 0;
12364    for (int I = 0, E = NewMask.size(); I < E; ++I) {
12365      if (SM1[I] == PoisonMaskElem)
12366        ++LastUndefsCnt;
12367      else
12368        LastUndefsCnt = 0;
12369      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
12370          NewMask[I] != SM1[I])
12371        return false;
12372      if (NewMask[I] == PoisonMaskElem)
12373        NewMask[I] = SM1[I];
12374    }
12375    // Check if the last undefs actually change the final number of used vector
12376    // registers.
12377    return SM1.size() - LastUndefsCnt > 1 &&
12378           TTI->getNumberOfParts(SI1->getType()) ==
12379               TTI->getNumberOfParts(
12380                   FixedVectorType::get(SI1->getType()->getElementType(),
12381                                        SM1.size() - LastUndefsCnt));
12382  };
12383  // Perform O(N^2) search over the gather/shuffle sequences and merge identical
12384  // instructions. TODO: We can further optimize this scan if we split the
12385  // instructions into different buckets based on the insert lane.
12386  SmallVector<Instruction *, 16> Visited;
12387  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
12388    assert(*I &&
12389           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
12390           "Worklist not sorted properly!");
12391    BasicBlock *BB = (*I)->getBlock();
12392    // For all instructions in blocks containing gather sequences:
12393    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
12394      if (isDeleted(&In))
12395        continue;
12396      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
12397          !GatherShuffleExtractSeq.contains(&In))
12398        continue;
12399
12400      // Check if we can replace this instruction with any of the
12401      // visited instructions.
12402      bool Replaced = false;
12403      for (Instruction *&V : Visited) {
12404        SmallVector<int> NewMask;
12405        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
12406            DT->dominates(V->getParent(), In.getParent())) {
12407          In.replaceAllUsesWith(V);
12408          eraseInstruction(&In);
12409          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
12410            if (!NewMask.empty())
12411              SI->setShuffleMask(NewMask);
12412          Replaced = true;
12413          break;
12414        }
12415        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
12416            GatherShuffleExtractSeq.contains(V) &&
12417            IsIdenticalOrLessDefined(V, &In, NewMask) &&
12418            DT->dominates(In.getParent(), V->getParent())) {
12419          In.moveAfter(V);
12420          V->replaceAllUsesWith(&In);
12421          eraseInstruction(V);
12422          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
12423            if (!NewMask.empty())
12424              SI->setShuffleMask(NewMask);
12425          V = &In;
12426          Replaced = true;
12427          break;
12428        }
12429      }
12430      if (!Replaced) {
12431        assert(!is_contained(Visited, &In));
12432        Visited.push_back(&In);
12433      }
12434    }
12435  }
12436  CSEBlocks.clear();
12437  GatherShuffleExtractSeq.clear();
12438}
12439
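// Chains the schedulable members of VL into a single bundle: the first such
// member becomes the scheduling entity (FirstInBundle) and the remaining ones
// are linked to it through NextInBundle.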
12440BoUpSLP::ScheduleData *
12441BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
12442  ScheduleData *Bundle = nullptr;
12443  ScheduleData *PrevInBundle = nullptr;
12444  for (Value *V : VL) {
12445    if (doesNotNeedToBeScheduled(V))
12446      continue;
12447    ScheduleData *BundleMember = getScheduleData(V);
12448    assert(BundleMember &&
12449           "no ScheduleData for bundle member "
12450           "(maybe not in same basic block)");
12451    assert(BundleMember->isSchedulingEntity() &&
12452           "bundle member already part of other bundle");
12453    if (PrevInBundle) {
12454      PrevInBundle->NextInBundle = BundleMember;
12455    } else {
12456      Bundle = BundleMember;
12457    }
12458
    // Group the instructions into a bundle.
12460    BundleMember->FirstInBundle = Bundle;
12461    PrevInBundle = BundleMember;
12462  }
12463  assert(Bundle && "Failed to find schedule bundle");
12464  return Bundle;
12465}
12466
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
12469std::optional<BoUpSLP::ScheduleData *>
12470BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
12471                                            const InstructionsState &S) {
12472  // No need to schedule PHIs, insertelement, extractelement and extractvalue
12473  // instructions.
12474  if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
12475      doesNotNeedToSchedule(VL))
12476    return nullptr;
12477
12478  // Initialize the instruction bundle.
12479  Instruction *OldScheduleEnd = ScheduleEnd;
12480  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.OpValue << "\n");
12481
12482  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
12483                                                         ScheduleData *Bundle) {
12484    // The scheduling region got new instructions at the lower end (or it is a
12485    // new region for the first bundle). This makes it necessary to
12486    // recalculate all dependencies.
12487    // It is seldom that this needs to be done a second time after adding the
12488    // initial bundle to the region.
12489    if (ScheduleEnd != OldScheduleEnd) {
12490      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
12491        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
12492      ReSchedule = true;
12493    }
12494    if (Bundle) {
12495      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
12496                        << " in block " << BB->getName() << "\n");
12497      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
12498    }
12499
12500    if (ReSchedule) {
12501      resetSchedule();
12502      initialFillReadyList(ReadyInsts);
12503    }
12504
    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are no
    // cyclic dependencies and we can schedule it. Note that it's important that
    // we don't "schedule" the bundle yet (see cancelScheduling).
12509    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
12510           !ReadyInsts.empty()) {
12511      ScheduleData *Picked = ReadyInsts.pop_back_val();
12512      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
12513             "must be ready to schedule");
12514      schedule(Picked, ReadyInsts);
12515    }
12516  };
12517
12518  // Make sure that the scheduling region contains all
12519  // instructions of the bundle.
12520  for (Value *V : VL) {
12521    if (doesNotNeedToBeScheduled(V))
12522      continue;
12523    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle), it is necessary to recalculate
      // all dependencies before giving up. Otherwise the compiler may crash
      // trying to incorrectly calculate dependencies and emit instructions in
      // the wrong order at the actual scheduling.
12530      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
12531      return std::nullopt;
12532    }
12533  }
12534
12535  bool ReSchedule = false;
12536  for (Value *V : VL) {
12537    if (doesNotNeedToBeScheduled(V))
12538      continue;
12539    ScheduleData *BundleMember = getScheduleData(V);
12540    assert(BundleMember &&
12541           "no ScheduleData for bundle member (maybe not in same basic block)");
12542
12543    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
12545    ReadyInsts.remove(BundleMember);
12546
12547    if (!BundleMember->IsScheduled)
12548      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
12552    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
12553                      << " was already scheduled\n");
12554    ReSchedule = true;
12555  }
12556
12557  auto *Bundle = buildBundle(VL);
12558  TryScheduleBundleImpl(ReSchedule, Bundle);
12559  if (!Bundle->isReady()) {
12560    cancelScheduling(VL, S.OpValue);
12561    return std::nullopt;
12562  }
12563  return Bundle;
12564}
12565
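// Undoes a previously built bundle: turns its members back into independent
// scheduling entities and re-inserts members without unscheduled dependencies
// into the ready list.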
12566void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
12567                                                Value *OpValue) {
12568  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
12569      doesNotNeedToSchedule(VL))
12570    return;
12571
12572  if (doesNotNeedToBeScheduled(OpValue))
12573    OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
12574  ScheduleData *Bundle = getScheduleData(OpValue);
12575  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
12576  assert(!Bundle->IsScheduled &&
12577         "Can't cancel bundle which is already scheduled");
12578  assert(Bundle->isSchedulingEntity() &&
12579         (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
12580         "tried to unbundle something which is not a bundle");
12581
12582  // Remove the bundle from the ready list.
12583  if (Bundle->isReady())
12584    ReadyInsts.remove(Bundle);
12585
12586  // Un-bundle: make single instructions out of the bundle.
12587  ScheduleData *BundleMember = Bundle;
12588  while (BundleMember) {
12589    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
12590    BundleMember->FirstInBundle = BundleMember;
12591    ScheduleData *Next = BundleMember->NextInBundle;
12592    BundleMember->NextInBundle = nullptr;
12593    BundleMember->TE = nullptr;
12594    if (BundleMember->unscheduledDepsInBundle() == 0) {
12595      ReadyInsts.insert(BundleMember);
12596    }
12597    BundleMember = Next;
12598  }
12599}
12600
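// Returns a pointer to an unused ScheduleData slot, allocating a new chunk of
// ChunkSize elements whenever the current chunk is exhausted.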
12601BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
12602  // Allocate a new ScheduleData for the instruction.
12603  if (ChunkPos >= ChunkSize) {
12604    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
12605    ChunkPos = 0;
12606  }
12607  return &(ScheduleDataChunks.back()[ChunkPos++]);
12608}
12609
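// Extends the scheduling region so that it contains V (if it does not
// already). Returns false if the region would grow beyond
// ScheduleRegionSizeLimit.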
12610bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
12611                                                      const InstructionsState &S) {
12612  if (getScheduleData(V, isOneOf(S, V)))
12613    return true;
12614  Instruction *I = dyn_cast<Instruction>(V);
12615  assert(I && "bundle member must be an instruction");
12616  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
12617         !doesNotNeedToBeScheduled(I) &&
12618         "phi nodes/insertelements/extractelements/extractvalues don't need to "
12619         "be scheduled");
12620  auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
12621    ScheduleData *ISD = getScheduleData(I);
12622    if (!ISD)
12623      return false;
12624    assert(isInSchedulingRegion(ISD) &&
12625           "ScheduleData not in scheduling region");
12626    ScheduleData *SD = allocateScheduleDataChunks();
12627    SD->Inst = I;
12628    SD->init(SchedulingRegionID, S.OpValue);
12629    ExtraScheduleDataMap[I][S.OpValue] = SD;
12630    return true;
12631  };
12632  if (CheckScheduleForI(I))
12633    return true;
12634  if (!ScheduleStart) {
12635    // It's the first instruction in the new region.
12636    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
12637    ScheduleStart = I;
12638    ScheduleEnd = I->getNextNode();
12639    if (isOneOf(S, I) != I)
12640      CheckScheduleForI(I);
12641    assert(ScheduleEnd && "tried to vectorize a terminator?");
12642    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
12643    return true;
12644  }
12645  // Search up and down at the same time, because we don't know if the new
12646  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not
  // counted against the budget. Otherwise debug info could affect codegen.
12649  BasicBlock::reverse_iterator UpIter =
12650      ++ScheduleStart->getIterator().getReverse();
12651  BasicBlock::reverse_iterator UpperEnd = BB->rend();
12652  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
12653  BasicBlock::iterator LowerEnd = BB->end();
12654  auto IsAssumeLikeIntr = [](const Instruction &I) {
12655    if (auto *II = dyn_cast<IntrinsicInst>(&I))
12656      return II->isAssumeLikeIntrinsic();
12657    return false;
12658  };
12659  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
12660  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
12661  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
12662         &*DownIter != I) {
12663    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
12664      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
12665      return false;
12666    }
12667
12668    ++UpIter;
12669    ++DownIter;
12670
12671    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
12672    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
12673  }
12674  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
12675    assert(I->getParent() == ScheduleStart->getParent() &&
12676           "Instruction is in wrong basic block.");
12677    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
12678    ScheduleStart = I;
12679    if (isOneOf(S, I) != I)
12680      CheckScheduleForI(I);
12681    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
12682                      << "\n");
12683    return true;
12684  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach the top of the basic block or an instruction at "
         "the lower end.");
12688  assert(I->getParent() == ScheduleEnd->getParent() &&
12689         "Instruction is in wrong basic block.");
12690  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
12691                   nullptr);
12692  ScheduleEnd = I->getNextNode();
12693  if (isOneOf(S, I) != I)
12694    CheckScheduleForI(I);
12695  assert(ScheduleEnd && "tried to vectorize a terminator?");
12696  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
12697  return true;
12698}
12699
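// Allocates and initializes ScheduleData for every schedulable instruction in
// the half-open range [FromI, ToI) and links the memory-accessing instructions
// into the region's load/store chain between PrevLoadStore and NextLoadStore.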
12700void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
12701                                                Instruction *ToI,
12702                                                ScheduleData *PrevLoadStore,
12703                                                ScheduleData *NextLoadStore) {
12704  ScheduleData *CurrentLoadStore = PrevLoadStore;
12705  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
12706    // No need to allocate data for non-schedulable instructions.
12707    if (doesNotNeedToBeScheduled(I))
12708      continue;
12709    ScheduleData *SD = ScheduleDataMap.lookup(I);
12710    if (!SD) {
12711      SD = allocateScheduleDataChunks();
12712      ScheduleDataMap[I] = SD;
12713      SD->Inst = I;
12714    }
12715    assert(!isInSchedulingRegion(SD) &&
12716           "new ScheduleData already in scheduling region");
12717    SD->init(SchedulingRegionID, I);
12718
12719    if (I->mayReadOrWriteMemory() &&
12720        (!isa<IntrinsicInst>(I) ||
12721         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
12722          cast<IntrinsicInst>(I)->getIntrinsicID() !=
12723              Intrinsic::pseudoprobe))) {
12724      // Update the linked list of memory accessing instructions.
12725      if (CurrentLoadStore) {
12726        CurrentLoadStore->NextLoadStore = SD;
12727      } else {
12728        FirstLoadStoreInRegion = SD;
12729      }
12730      CurrentLoadStore = SD;
12731    }
12732
12733    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
12734        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
12735      RegionHasStackSave = true;
12736  }
12737  if (NextLoadStore) {
12738    if (CurrentLoadStore)
12739      CurrentLoadStore->NextLoadStore = NextLoadStore;
12740  } else {
12741    LastLoadStoreInRegion = CurrentLoadStore;
12742  }
12743}
12744
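// (Re)calculates the def-use, control and memory dependencies of the bundle SD
// and, transitively, of every bundle it depends on that does not yet have
// valid dependencies. If InsertInReadyList is true, bundles that become ready
// are added to the ready list.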
12745void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
12746                                                     bool InsertInReadyList,
12747                                                     BoUpSLP *SLP) {
12748  assert(SD->isSchedulingEntity());
12749
12750  SmallVector<ScheduleData *, 10> WorkList;
12751  WorkList.push_back(SD);
12752
12753  while (!WorkList.empty()) {
12754    ScheduleData *SD = WorkList.pop_back_val();
12755    for (ScheduleData *BundleMember = SD; BundleMember;
12756         BundleMember = BundleMember->NextInBundle) {
12757      assert(isInSchedulingRegion(BundleMember));
12758      if (BundleMember->hasValidDependencies())
12759        continue;
12760
12761      LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
12762                 << "\n");
12763      BundleMember->Dependencies = 0;
12764      BundleMember->resetUnscheduledDeps();
12765
12766      // Handle def-use chain dependencies.
12767      if (BundleMember->OpValue != BundleMember->Inst) {
12768        if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
12769          BundleMember->Dependencies++;
12770          ScheduleData *DestBundle = UseSD->FirstInBundle;
12771          if (!DestBundle->IsScheduled)
12772            BundleMember->incrementUnscheduledDeps(1);
12773          if (!DestBundle->hasValidDependencies())
12774            WorkList.push_back(DestBundle);
12775        }
12776      } else {
12777        for (User *U : BundleMember->Inst->users()) {
12778          if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
12779            BundleMember->Dependencies++;
12780            ScheduleData *DestBundle = UseSD->FirstInBundle;
12781            if (!DestBundle->IsScheduled)
12782              BundleMember->incrementUnscheduledDeps(1);
12783            if (!DestBundle->hasValidDependencies())
12784              WorkList.push_back(DestBundle);
12785          }
12786        }
12787      }
12788
12789      auto MakeControlDependent = [&](Instruction *I) {
12790        auto *DepDest = getScheduleData(I);
12791        assert(DepDest && "must be in schedule window");
12792        DepDest->ControlDependencies.push_back(BundleMember);
12793        BundleMember->Dependencies++;
12794        ScheduleData *DestBundle = DepDest->FirstInBundle;
12795        if (!DestBundle->IsScheduled)
12796          BundleMember->incrementUnscheduledDeps(1);
12797        if (!DestBundle->hasValidDependencies())
12798          WorkList.push_back(DestBundle);
12799      };
12800
      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
12804      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
12805        for (Instruction *I = BundleMember->Inst->getNextNode();
12806             I != ScheduleEnd; I = I->getNextNode()) {
12807          if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
12808            continue;
12809
12810          // Add the dependency
12811          MakeControlDependent(I);
12812
12813          if (!isGuaranteedToTransferExecutionToSuccessor(I))
12814            // Everything past here must be control dependent on I.
12815            break;
12816        }
12817      }
12818
12819      if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
12823        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
12824            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
12825          for (Instruction *I = BundleMember->Inst->getNextNode();
12826               I != ScheduleEnd; I = I->getNextNode()) {
12827            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
12828                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              // Any allocas past here must be control dependent on I, and I
              // must be memory dependent on BundleMember->Inst.
12831              break;
12832
12833            if (!isa<AllocaInst>(I))
12834              continue;
12835
12836            // Add the dependency
12837            MakeControlDependent(I);
12838          }
12839        }
12840
        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Preventing allocas from moving below a stackrestore is
        // currently believed to be merely conservative; moving loads/stores
        // below a stackrestore can lead to incorrect code.
12846        if (isa<AllocaInst>(BundleMember->Inst) ||
12847            BundleMember->Inst->mayReadOrWriteMemory()) {
12848          for (Instruction *I = BundleMember->Inst->getNextNode();
12849               I != ScheduleEnd; I = I->getNextNode()) {
12850            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
12851                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
12852              continue;
12853
12854            // Add the dependency
12855            MakeControlDependent(I);
12856            break;
12857          }
12858        }
12859      }
12860
12861      // Handle the memory dependencies (if any).
12862      ScheduleData *DepDest = BundleMember->NextLoadStore;
12863      if (!DepDest)
12864        continue;
12865      Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non-memory-affecting bundle?");
12868      MemoryLocation SrcLoc = getLocation(SrcInst);
12869      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
12870      unsigned NumAliased = 0;
12871      unsigned DistToSrc = 1;
12872
12873      for (; DepDest; DepDest = DepDest->NextLoadStore) {
12874        assert(isInSchedulingRegion(DepDest));
12875
12876        // We have two limits to reduce the complexity:
12877        // 1) AliasedCheckLimit: It's a small limit to reduce calls to
12878        //    SLP->isAliased (which is the expensive part in this loop).
12879        // 2) MaxMemDepDistance: It's for very large blocks and it aborts
12880        //    the whole loop (even if the loop is fast, it's quadratic).
12881        //    It's important for the loop break condition (see below) to
12882        //    check this limit even between two read-only instructions.
12883        if (DistToSrc >= MaxMemDepDistance ||
12884            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
12885             (NumAliased >= AliasedCheckLimit ||
12886              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
12887
12888          // We increment the counter only if the locations are aliased
12889          // (instead of counting all alias checks). This gives a better
12890          // balance between reduced runtime and accurate dependencies.
12891          NumAliased++;
12892
12893          DepDest->MemoryDependencies.push_back(BundleMember);
12894          BundleMember->Dependencies++;
12895          ScheduleData *DestBundle = DepDest->FirstInBundle;
12896          if (!DestBundle->IsScheduled) {
12897            BundleMember->incrementUnscheduledDeps(1);
12898          }
12899          if (!DestBundle->hasValidDependencies()) {
12900            WorkList.push_back(DestBundle);
12901          }
12902        }
12903
12904        // Example, explaining the loop break condition: Let's assume our
12905        // starting instruction is i0 and MaxMemDepDistance = 3.
12906        //
12907        //                      +--------v--v--v
12908        //             i0,i1,i2,i3,i4,i5,i6,i7,i8
12909        //             +--------^--^--^
12910        //
12911        // MaxMemDepDistance let us stop alias-checking at i3 and we add
12912        // dependencies from i0 to i3,i4,.. (even if they are not aliased).
12913        // Previously we already added dependencies from i3 to i6,i7,i8
12914        // (because of MaxMemDepDistance). As we added a dependency from
12915        // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
12916        // and we can abort this loop at i6.
12917        if (DistToSrc >= 2 * MaxMemDepDistance)
12918          break;
12919        DistToSrc++;
12920      }
12921    }
12922    if (InsertInReadyList && SD->isReady()) {
12923      ReadyInsts.insert(SD);
12924      LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
12925                        << "\n");
12926    }
12927  }
12928}
12929
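// Marks every ScheduleData in the region as unscheduled, resets its
// unscheduled dependency counters and clears the ready list, so the region can
// be scheduled again from scratch.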
12930void BoUpSLP::BlockScheduling::resetSchedule() {
12931  assert(ScheduleStart &&
12932         "tried to reset schedule on block which has not been scheduled");
12933  for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
12934    doForAllOpcodes(I, [&](ScheduleData *SD) {
12935      assert(isInSchedulingRegion(SD) &&
12936             "ScheduleData not in scheduling region");
12937      SD->IsScheduled = false;
12938      SD->resetUnscheduledDeps();
12939    });
12940  }
12941  ReadyInsts.clear();
12942}
12943
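// Performs the final scheduling of a block: assigns scheduling priorities
// based on the original instruction order, recomputes the dependencies of the
// vectorized bundles, and moves the instructions of each picked bundle to
// their final position.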
12944void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
12945  if (!BS->ScheduleStart)
12946    return;
12947
12948  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
12949
12950  // A key point - if we got here, pre-scheduling was able to find a valid
12951  // scheduling of the sub-graph of the scheduling window which consists
12952  // of all vector bundles and their transitive users.  As such, we do not
12953  // need to reschedule anything *outside of* that subgraph.
12954
12955  BS->resetSchedule();
12956
12957  // For the real scheduling we use a more sophisticated ready-list: it is
12958  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
12960  // WARNING: If changing this order causes a correctness issue, that means
12961  // there is some missing dependence edge in the schedule data graph.
12962  struct ScheduleDataCompare {
12963    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
12964      return SD2->SchedulingPriority < SD1->SchedulingPriority;
12965    }
12966  };
12967  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
12968
12969  // Ensure that all dependency data is updated (for nodes in the sub-graph)
12970  // and fill the ready-list with initial instructions.
12971  int Idx = 0;
12972  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
12973       I = I->getNextNode()) {
12974    BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
12975      TreeEntry *SDTE = getTreeEntry(SD->Inst);
12976      (void)SDTE;
12977      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
12978              SD->isPartOfBundle() ==
12979                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
12980             "scheduler and vectorizer bundle mismatch");
12981      SD->FirstInBundle->SchedulingPriority = Idx++;
12982
12983      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
12984        BS->calculateDependencies(SD, false, this);
12985    });
12986  }
12987  BS->initialFillReadyList(ReadyInsts);
12988
12989  Instruction *LastScheduledInst = BS->ScheduleEnd;
12990
12991  // Do the "real" scheduling.
12992  while (!ReadyInsts.empty()) {
12993    ScheduleData *Picked = *ReadyInsts.begin();
12994    ReadyInsts.erase(ReadyInsts.begin());
12995
12996    // Move the scheduled instruction(s) to their dedicated places, if not
12997    // there yet.
12998    for (ScheduleData *BundleMember = Picked; BundleMember;
12999         BundleMember = BundleMember->NextInBundle) {
13000      Instruction *PickedInst = BundleMember->Inst;
13001      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
13002        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
13003      LastScheduledInst = PickedInst;
13004    }
13005
13006    BS->schedule(Picked, ReadyInsts);
13007  }
13008
13009  // Check that we didn't break any of our invariants.
13010#ifdef EXPENSIVE_CHECKS
13011  BS->verify();
13012#endif
13013
13014#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
13015  // Check that all schedulable entities got scheduled
13016  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
13017    BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
13018      if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
13019        assert(SD->IsScheduled && "must be scheduled at this point");
13020      }
13021    });
13022  }
13023#endif
13024
13025  // Avoid duplicate scheduling of the block.
13026  BS->ScheduleStart = nullptr;
13027}
13028
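// Returns the width, in bits, to be used as the vector element size for V,
// preferring the width of the memory operations feeding the expression over
// the width of V's own type.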
13029unsigned BoUpSLP::getVectorElementSize(Value *V) {
13030  // If V is a store, just return the width of the stored value (or value
13031  // truncated just before storing) without traversing the expression tree.
13032  // This is the common case.
13033  if (auto *Store = dyn_cast<StoreInst>(V))
13034    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
13035
13036  if (auto *IEI = dyn_cast<InsertElementInst>(V))
13037    return getVectorElementSize(IEI->getOperand(1));
13038
13039  auto E = InstrElementSize.find(V);
13040  if (E != InstrElementSize.end())
13041    return E->second;
13042
13043  // If V is not a store, we can traverse the expression tree to find loads
13044  // that feed it. The type of the loaded value may indicate a more suitable
13045  // width than V's type. We want to base the vector element size on the width
13046  // of memory operations where possible.
13047  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
13048  SmallPtrSet<Instruction *, 16> Visited;
13049  if (auto *I = dyn_cast<Instruction>(V)) {
13050    Worklist.emplace_back(I, I->getParent());
13051    Visited.insert(I);
13052  }
13053
13054  // Traverse the expression tree in bottom-up order looking for loads. If we
13055  // encounter an instruction we don't yet handle, we give up.
13056  auto Width = 0u;
13057  while (!Worklist.empty()) {
13058    Instruction *I;
13059    BasicBlock *Parent;
13060    std::tie(I, Parent) = Worklist.pop_back_val();
13061
13062    // We should only be looking at scalar instructions here. If the current
13063    // instruction has a vector type, skip.
13064    auto *Ty = I->getType();
13065    if (isa<VectorType>(Ty))
13066      continue;
13067
    // If the current instruction is a load or an extract, update Width to
    // reflect the width of the loaded or extracted value.
13070    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
13071      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
13072
    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and it is either in the same basic
    // block as the user or the user is a PHI node, we add it to the worklist.
13077    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
13078                 BinaryOperator, UnaryOperator>(I)) {
13079      for (Use &U : I->operands())
13080        if (auto *J = dyn_cast<Instruction>(U.get()))
13081          if (Visited.insert(J).second &&
13082              (isa<PHINode>(I) || J->getParent() == Parent))
13083            Worklist.emplace_back(J, J->getParent());
13084    } else {
13085      break;
13086    }
13087  }
13088
13089  // If we didn't encounter a memory access in the expression tree, or if we
13090  // gave up for some reason, just return the width of V. Otherwise, return the
13091  // maximum width we found.
13092  if (!Width) {
13093    if (auto *CI = dyn_cast<CmpInst>(V))
13094      V = CI->getOperand(0);
13095    Width = DL->getTypeSizeInBits(V->getType());
13096  }
13097
13098  for (Instruction *I : Visited)
13099    InstrElementSize[I] = Width;
13100
13101  return Width;
13102}
13103
// Determine if a value V in a vectorizable expression can be demoted to a
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
13107bool BoUpSLP::collectValuesToDemote(
13108    Value *V, SmallVectorImpl<Value *> &ToDemote,
13109    DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
13110    SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const {
13111  // We can always demote constants.
13112  if (isa<Constant>(V))
13113    return true;
13114
  // The value cannot be demoted if it is not a vectorized instruction in the
  // expression, if it is used in multiple vector nodes, or if all its users
  // are insertelement instructions that are not part of the tree.
13118  auto *I = dyn_cast<Instruction>(V);
13119  if (!I || !getTreeEntry(I) || MultiNodeScalars.contains(I) ||
13120      !Visited.insert(I).second || all_of(I->users(), [&](User *U) {
13121        return isa<InsertElementInst>(U) && !getTreeEntry(U);
13122      }))
13123    return false;
13124
13125  unsigned Start = 0;
13126  unsigned End = I->getNumOperands();
13127  switch (I->getOpcode()) {
13128
13129  // We can always demote truncations and extensions. Since truncations can
13130  // seed additional demotion, we save the truncated value.
13131  case Instruction::Trunc:
13132    Roots.push_back(I->getOperand(0));
13133    break;
13134  case Instruction::ZExt:
13135  case Instruction::SExt:
13136    if (isa<ExtractElementInst, InsertElementInst>(I->getOperand(0)))
13137      return false;
13138    break;
13139
13140  // We can demote certain binary operations if we can demote both of their
13141  // operands.
13142  case Instruction::Add:
13143  case Instruction::Sub:
13144  case Instruction::Mul:
13145  case Instruction::And:
13146  case Instruction::Or:
13147  case Instruction::Xor:
13148    if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots,
13149                               Visited) ||
13150        !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots,
13151                               Visited))
13152      return false;
13153    break;
13154
13155  // We can demote selects if we can demote their true and false values.
13156  case Instruction::Select: {
13157    Start = 1;
13158    SelectInst *SI = cast<SelectInst>(I);
13159    if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts,
13160                               Roots, Visited) ||
13161        !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts,
13162                               Roots, Visited))
13163      return false;
13164    break;
13165  }
13166
  // We can demote phis if we can demote all their incoming operands. Note that
  // we don't need to worry about cycles since each value is visited at most
  // once (see the Visited check above).
13169  case Instruction::PHI: {
13170    PHINode *PN = cast<PHINode>(I);
13171    for (Value *IncValue : PN->incoming_values())
13172      if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots,
13173                                 Visited))
13174        return false;
13175    break;
13176  }
13177
13178  // Otherwise, conservatively give up.
13179  default:
13180    return false;
13181  }
13182
13183  // Gather demoted constant operands.
13184  for (unsigned Idx : seq<unsigned>(Start, End))
13185    if (isa<Constant>(I->getOperand(Idx)))
13186      DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx);
13187  // Record the value that we can demote.
13188  ToDemote.push_back(V);
13189  return true;
13190}
13191
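// Computes the minimum integer bit width (and signedness) that the scalars of
// each demotable tree entry can be truncated to without changing the result,
// and records it in MinBWs.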
13192void BoUpSLP::computeMinimumValueSizes() {
13193  // We only attempt to truncate integer expressions.
13194  auto &TreeRoot = VectorizableTree[0]->Scalars;
13195  auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
13196  if (!TreeRootIT || VectorizableTree.front()->State == TreeEntry::NeedToGather)
13197    return;
13198
13199  // Ensure the roots of the vectorizable tree don't form a cycle.
13200  if (!VectorizableTree.front()->UserTreeIndices.empty())
13201    return;
13202
13203  // Conservatively determine if we can actually truncate the roots of the
13204  // expression. Collect the values that can be demoted in ToDemote and
13205  // additional roots that require investigating in Roots.
13206  SmallVector<Value *, 32> ToDemote;
13207  DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
13208  SmallVector<Value *, 4> Roots;
13209  for (auto *Root : TreeRoot) {
13210    DenseSet<Value *> Visited;
13211    if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited))
13212      return;
13213  }
13214
13215  // The maximum bit width required to represent all the values that can be
13216  // demoted without loss of precision. It would be safe to truncate the roots
13217  // of the expression to this width.
13218  auto MaxBitWidth = 1u;
13219
13220  // We first check if all the bits of the roots are demanded. If they're not,
13221  // we can truncate the roots to this narrower type.
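  // For example, if the roots have type i32 but DemandedBits reports that only
  // the low 8 bits are demanded, MaxBitWidth becomes 8 and the roots can be
  // truncated to i8 later (after rounding up to a power of two below).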
13222  for (auto *Root : TreeRoot) {
13223    auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
13224    MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(),
13225                                     MaxBitWidth);
13226  }
13227
13228  // True if the roots can be zero-extended back to their original type, rather
13229  // than sign-extended. We know that if the leading bits are not demanded, we
13230  // can safely zero-extend. So we initialize IsKnownPositive to True.
13231  bool IsKnownPositive = true;
13232
13233  // If all the bits of the roots are demanded, we can try a little harder to
13234  // compute a narrower type. This can happen, for example, if the roots are
13235  // getelementptr indices. InstCombine promotes these indices to the pointer
13236  // width. Thus, all their bits are technically demanded even though the
13237  // address computation might be vectorized in a smaller type.
13238  //
13239  // We start by looking at each entry that can be demoted. We compute the
13240  // maximum bit width required to store the scalar by using ValueTracking to
13241  // compute the number of high-order bits we can truncate.
13242  if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
13243      all_of(TreeRoot, [](Value *V) {
13244        return all_of(V->users(),
13245                      [](User *U) { return isa<GetElementPtrInst>(U); });
13246      })) {
13247    MaxBitWidth = 8u;
13248
13249    // Determine if the sign bit of all the roots is known to be zero. If not,
13250    // IsKnownPositive is set to False.
13251    IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
13252      KnownBits Known = computeKnownBits(R, *DL);
13253      return Known.isNonNegative();
13254    });
13255
13256    // Determine the maximum number of bits required to store the scalar
13257    // values.
13258    for (auto *Scalar : ToDemote) {
13259      auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
13260      auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
13261      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
13262    }
13263
13264    // If we can't prove that the sign bit is zero, we must add one to the
13265    // maximum bit width to account for the unknown sign bit. This preserves
13266    // the existing sign bit so we can safely sign-extend the root back to the
13267    // original type. Otherwise, if we know the sign bit is zero, we will
13268    // zero-extend the root instead.
13269    //
13270    // FIXME: This is somewhat suboptimal, as there will be cases where adding
13271    //        one to the maximum bit width will yield a larger-than-necessary
13272    //        type. In general, we need to add an extra bit only if we can't
13273    //        prove that the upper bit of the original type is equal to the
13274    //        upper bit of the proposed smaller type. If these two bits are the
13275    //        same (either zero or one) we know that sign-extending from the
13276    //        smaller type will result in the same value. Here, since we can't
13277    //        yet prove this, we are just making the proposed smaller type
13278    //        larger to ensure correctness.
13279    if (!IsKnownPositive)
13280      ++MaxBitWidth;
13281  }
13282
13283  // Round MaxBitWidth up to the next power-of-two.
13284  MaxBitWidth = llvm::bit_ceil(MaxBitWidth);
13285
  // If the maximum bit width we compute is less than the width of the roots'
  // type, we can proceed with the narrowing. Otherwise, do nothing.
13288  if (MaxBitWidth >= TreeRootIT->getBitWidth())
13289    return;
13290
13291  // If we can truncate the root, we must collect additional values that might
13292  // be demoted as a result. That is, those seeded by truncations we will
13293  // modify.
13294  while (!Roots.empty()) {
13295    DenseSet<Value *> Visited;
13296    collectValuesToDemote(Roots.pop_back_val(), ToDemote, DemotedConsts, Roots,
13297                          Visited);
13298  }
13299
  // Finally, map the values we can demote to the maximum bit width we computed.
13301  for (auto *Scalar : ToDemote) {
13302    auto *TE = getTreeEntry(Scalar);
13303    assert(TE && "Expected vectorized scalar.");
13304    if (MinBWs.contains(TE))
13305      continue;
13306    bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
13307      KnownBits Known = computeKnownBits(R, *DL);
13308      return !Known.isNonNegative();
13309    });
13310    MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
13311    const auto *I = cast<Instruction>(Scalar);
13312    auto DCIt = DemotedConsts.find(I);
13313    if (DCIt != DemotedConsts.end()) {
13314      for (unsigned Idx : DCIt->getSecond()) {
        // Check that operand Idx is a demoted constant in all the scalars.
13316        if (all_of(TE->Scalars, [&](Value *V) {
13317              auto SIt = DemotedConsts.find(cast<Instruction>(V));
13318              return SIt != DemotedConsts.end() &&
13319                     is_contained(SIt->getSecond(), Idx);
13320            })) {
13321          const TreeEntry *CTE = getOperandEntry(TE, Idx);
13322          MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned);
13323        }
13324      }
13325    }
13326  }
13327}
13328
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
13330  auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
13331  auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
13332  auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
13333  auto *AA = &AM.getResult<AAManager>(F);
13334  auto *LI = &AM.getResult<LoopAnalysis>(F);
13335  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
13336  auto *AC = &AM.getResult<AssumptionAnalysis>(F);
13337  auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
13338  auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
13339
13340  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
13341  if (!Changed)
13342    return PreservedAnalyses::all();
13343
13344  PreservedAnalyses PA;
13345  PA.preserveSet<CFGAnalyses>();
13346  return PA;
13347}
13348
13349bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
13350                                TargetTransformInfo *TTI_,
13351                                TargetLibraryInfo *TLI_, AAResults *AA_,
13352                                LoopInfo *LI_, DominatorTree *DT_,
13353                                AssumptionCache *AC_, DemandedBits *DB_,
13354                                OptimizationRemarkEmitter *ORE_) {
13355  if (!RunSLPVectorization)
13356    return false;
13357  SE = SE_;
13358  TTI = TTI_;
13359  TLI = TLI_;
13360  AA = AA_;
13361  LI = LI_;
13362  DT = DT_;
13363  AC = AC_;
13364  DB = DB_;
13365  DL = &F.getParent()->getDataLayout();
13366
13367  Stores.clear();
13368  GEPs.clear();
13369  bool Changed = false;
13370
  // If the target claims to have no vector registers, don't attempt
  // vectorization.
13373  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
13374    LLVM_DEBUG(
13375        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
13376    return false;
13377  }
13378
13379  // Don't vectorize when the attribute NoImplicitFloat is used.
13380  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
13381    return false;
13382
13383  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
13384
  // Use the bottom-up SLP vectorizer to construct chains that start with
  // store instructions.
13387  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
13388
13389  // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
13390  // delete instructions.
13391
13392  // Update DFS numbers now so that we can use them for ordering.
13393  DT->updateDFSNumbers();
13394
13395  // Scan the blocks in the function in post order.
13396  for (auto *BB : post_order(&F.getEntryBlock())) {
13397    // Start new block - clear the list of reduction roots.
13398    R.clearReductionData();
13399    collectSeedInstructions(BB);
13400
13401    // Vectorize trees that end at stores.
13402    if (!Stores.empty()) {
13403      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
13404                        << " underlying objects.\n");
13405      Changed |= vectorizeStoreChains(R);
13406    }
13407
13408    // Vectorize trees that end at reductions.
13409    Changed |= vectorizeChainsInBlock(BB, R);
13410
13411    // Vectorize the index computations of getelementptr instructions. This
13412    // is primarily intended to catch gather-like idioms ending at
13413    // non-consecutive loads.
13414    if (!GEPs.empty()) {
13415      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
13416                        << " underlying objects.\n");
13417      Changed |= vectorizeGEPIndices(BB, R);
13418    }
13419  }
13420
13421  if (Changed) {
13422    R.optimizeGatherSequence();
13423    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
13424  }
13425  return Changed;
13426}
13427
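// Tries to vectorize Chain, a sequence of consecutive stores starting at
// offset Idx of the current group. Gives up if the element size or the chain
// length is not a power of two, or if the chain is shorter than MinVF.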
13428bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
13429                                            unsigned Idx, unsigned MinVF) {
13430  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
13431                    << "\n");
13432  const unsigned Sz = R.getVectorElementSize(Chain[0]);
13433  unsigned VF = Chain.size();
13434
13435  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
13436    return false;
13437
13438  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
13439                    << "\n");
13440
13441  R.buildTree(Chain);
13442  if (R.isTreeTinyAndNotFullyVectorizable())
13443    return false;
13444  if (R.isLoadCombineCandidate())
13445    return false;
13446  R.reorderTopToBottom();
13447  R.reorderBottomToTop();
13448  R.buildExternalUses();
13449
13450  R.computeMinimumValueSizes();
13451
13452  InstructionCost Cost = R.getTreeCost();
13453
13454  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
13455  if (Cost < -SLPCostThreshold) {
13456    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
13457
13458    using namespace ore;
13459
13460    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
13461                                        cast<StoreInst>(Chain[0]))
13462                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
13463                     << " and with tree size "
13464                     << NV("TreeSize", R.getTreeSize()));
13465
13466    R.vectorizeTree();
13467    return true;
13468  }
13469
13470  return false;
13471}
13472
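// Groups the given stores by their distance from a common base address and
// tries to vectorize each group of consecutive stores, preferring the largest
// feasible vectorization factors.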
13473bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
13474                                        BoUpSLP &R) {
13475  // We may run into multiple chains that merge into a single chain. We mark the
13476  // stores that we vectorized so that we don't visit the same store twice.
13477  BoUpSLP::ValueSet VectorizedStores;
13478  bool Changed = false;
13479
  // Stores the (first_store, last_store) pair of each range we have already
  // tried to vectorize. This allows us to skip store ranges for which previous
  // vectorization attempts were unsuccessful.
13483  DenseSet<std::pair<Value *, Value *>> TriedSequences;
13484  struct StoreDistCompare {
13485    bool operator()(const std::pair<unsigned, int> &Op1,
13486                    const std::pair<unsigned, int> &Op2) const {
13487      return Op1.second < Op2.second;
13488    }
13489  };
  // A set of pairs (index of the store in the Stores array, distance of the
  // store address relative to the base store address in units).
13492  using StoreIndexToDistSet =
13493      std::set<std::pair<unsigned, int>, StoreDistCompare>;
13494  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
13495    int PrevDist = -1;
13496    BoUpSLP::ValueList Operands;
13497    // Collect the chain into a list.
13498    for (auto [Idx, Data] : enumerate(Set)) {
13499      if (Operands.empty() || Data.second - PrevDist == 1) {
13500        Operands.push_back(Stores[Data.first]);
13501        PrevDist = Data.second;
13502        if (Idx != Set.size() - 1)
13503          continue;
13504      }
13505      if (Operands.size() <= 1) {
13506        Operands.clear();
13507        Operands.push_back(Stores[Data.first]);
13508        PrevDist = Data.second;
13509        continue;
13510      }
13511
13512      unsigned MaxVecRegSize = R.getMaxVecRegSize();
13513      unsigned EltSize = R.getVectorElementSize(Operands[0]);
13514      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
13515
13516      unsigned MaxVF =
13517          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
13518      auto *Store = cast<StoreInst>(Operands[0]);
13519      Type *StoreTy = Store->getValueOperand()->getType();
13520      Type *ValueTy = StoreTy;
13521      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
13522        ValueTy = Trunc->getSrcTy();
13523      unsigned MinVF = TTI->getStoreMinimumVF(
13524          R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
13525
13526      if (MaxVF <= MinVF) {
13527        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
13528                          << ") <= "
13529                          << "MinVF (" << MinVF << ")\n");
13530      }
13531
13532      // FIXME: Is division-by-2 the correct step? Should we assert that the
13533      // register size is a power-of-2?
13534      unsigned StartIdx = 0;
13535      for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
13536        for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
13537          ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
13538          assert(
13539              all_of(
13540                  Slice,
13541                  [&](Value *V) {
13542                    return cast<StoreInst>(V)->getValueOperand()->getType() ==
13543                           cast<StoreInst>(Slice.front())
13544                               ->getValueOperand()
13545                               ->getType();
13546                  }) &&
13547              "Expected all operands of same type.");
13548          if (!VectorizedStores.count(Slice.front()) &&
13549              !VectorizedStores.count(Slice.back()) &&
13550              TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
13551                  .second &&
13552              vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
13553            // Mark the vectorized stores so that we don't vectorize them again.
13554            VectorizedStores.insert(Slice.begin(), Slice.end());
13555            Changed = true;
13556            // If we vectorized initial block, no need to try to vectorize it
13557            // again.
13558            if (Cnt == StartIdx)
13559              StartIdx += Size;
13560            Cnt += Size;
13561            continue;
13562          }
13563          ++Cnt;
13564        }
13565        // Check if the whole array was vectorized already - exit.
13566        if (StartIdx >= Operands.size())
13567          break;
13568      }
13569      Operands.clear();
13570      Operands.push_back(Stores[Data.first]);
13571      PrevDist = Data.second;
13572    }
13573  };
13574
  // Stores pairs (first: index of the store in the Stores array whose address
  // is taken as the base, second: sorted set of {index, dist} pairs, which are
  // the indices of the stores in the set and their distances relative to the
  // base address).
13579
13580  // Need to store the index of the very first store separately, since the set
  // may be reordered after the insertion and the first store may be moved. This
  // container allows us to reduce the number of calls to getPointersDiff().
13583  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
  // Inserts the store SI with the given index Idx into the set of the stores.
  // If a store with the same distance has already been found, stop the
  // insertion and try to vectorize the stores found so far. If some stores from
  // this sequence were not vectorized, try to vectorize them together with the
  // new store later. This logic is applied only to the stores that come before
  // the previous store with the same distance.
13590  // Example:
13591  // 1. store x, %p
13592  // 2. store y, %p+1
13593  // 3. store z, %p+2
13594  // 4. store a, %p
13595  // 5. store b, %p+3
13596  // - Scan this from the last to first store. The very first bunch of stores is
13597  // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
13598  // vector).
13599  // - The next store in the list - #1 - has the same distance from store #5 as
13600  // the store #4.
13601  // - Try to vectorize sequence of stores 4,2,3,5.
13602  // - If all these stores are vectorized - just drop them.
13603  // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
13604  // - Start new stores sequence.
13605  // The new bunch of stores is {1, {1, 0}}.
13606  // - Add the stores from previous sequence, that were not vectorized.
  // Here we consider the stores in the reversed order, rather than the order in
  // which they are used in the IR (Stores are reversed already, see the
  // vectorizeStoreChains() function).
  // Store #3 can be added -> it comes after store #4 with the same distance as
  // store #1.
  // Store #5 cannot be added - it comes before store #4.
  // This logic improves compile time: we assume that the stores coming after a
  // previous store with the same distance most likely have memory dependencies,
  // so there is no need to waste compile time trying to vectorize them.
13615  // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
13616  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
13617    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
13618      std::optional<int> Diff = getPointersDiff(
13619          Stores[Set.first]->getValueOperand()->getType(),
13620          Stores[Set.first]->getPointerOperand(),
13621          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
13622          /*StrictCheck=*/true);
13623      if (!Diff)
13624        continue;
13625      auto It = Set.second.find(std::make_pair(Idx, *Diff));
13626      if (It == Set.second.end()) {
13627        Set.second.emplace(Idx, *Diff);
13628        return;
13629      }
13630      // Try to vectorize the first found set to avoid duplicate analysis.
13631      TryToVectorize(Set.second);
13632      StoreIndexToDistSet PrevSet;
13633      PrevSet.swap(Set.second);
13634      Set.first = Idx;
13635      Set.second.emplace(Idx, 0);
13636      // Insert stores that followed previous match to try to vectorize them
13637      // with this store.
13638      unsigned StartIdx = It->first + 1;
13639      SmallBitVector UsedStores(Idx - StartIdx);
13640      // Distances to previously found dup store (or this store, since they
13641      // store to the same addresses).
13642      SmallVector<int> Dists(Idx - StartIdx, 0);
13643      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not try to vectorize sequences we have already tried.
13645        if (Pair.first <= It->first ||
13646            VectorizedStores.contains(Stores[Pair.first]))
13647          break;
13648        unsigned BI = Pair.first - StartIdx;
13649        UsedStores.set(BI);
13650        Dists[BI] = Pair.second - It->second;
13651      }
13652      for (unsigned I = StartIdx; I < Idx; ++I) {
13653        unsigned BI = I - StartIdx;
13654        if (UsedStores.test(BI))
13655          Set.second.emplace(I, Dists[BI]);
13656      }
13657      return;
13658    }
13659    auto &Res = SortedStores.emplace_back();
13660    Res.first = Idx;
13661    Res.second.emplace(Idx, 0);
13662  };
13663  StoreInst *PrevStore = Stores.front();
13664  for (auto [I, SI] : enumerate(Stores)) {
13665    // Check that we do not try to vectorize stores of different types.
13666    if (PrevStore->getValueOperand()->getType() !=
13667        SI->getValueOperand()->getType()) {
13668      for (auto &Set : SortedStores)
13669        TryToVectorize(Set.second);
13670      SortedStores.clear();
13671      PrevStore = SI;
13672    }
13673    FillStoresSet(I, SI);
13674  }
13675
13676  // Final vectorization attempt.
13677  for (auto &Set : SortedStores)
13678    TryToVectorize(Set.second);
13679
13680  return Changed;
13681}
13682
13683void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
13684  // Initialize the collections. We will make a single pass over the block.
13685  Stores.clear();
13686  GEPs.clear();
13687
13688  // Visit the store and getelementptr instructions in BB and organize them in
13689  // Stores and GEPs according to the underlying objects of their pointer
13690  // operands.
13691  for (Instruction &I : *BB) {
    // Ignore store instructions that are not simple (e.g. volatile or atomic)
    // or whose stored value does not have a valid vector element type.
13694    if (auto *SI = dyn_cast<StoreInst>(&I)) {
13695      if (!SI->isSimple())
13696        continue;
13697      if (!isValidElementType(SI->getValueOperand()->getType()))
13698        continue;
13699      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
13700    }
13701
    // Ignore getelementptr instructions that have more than one index, a
    // constant index, an index with an invalid element type, or a vector
    // result type.
13705    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
13706      if (GEP->getNumIndices() != 1)
13707        continue;
13708      Value *Idx = GEP->idx_begin()->get();
13709      if (isa<Constant>(Idx))
13710        continue;
13711      if (!isValidElementType(Idx->getType()))
13712        continue;
13713      if (GEP->getType()->isVectorTy())
13714        continue;
13715      GEPs[GEP->getPointerOperand()].push_back(GEP);
13716    }
13717  }
13718}
13719
13720bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
13721                                           bool MaxVFOnly) {
13722  if (VL.size() < 2)
13723    return false;
13724
13725  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
13726                    << VL.size() << ".\n");
13727
  // Check that all of the parts are instructions of the same type;
  // we permit an alternate opcode via InstructionsState.
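  // E.g. (illustrative) a mixed add/sub sequence may still be accepted as a
  // single bundle with an alternate opcode; the exact pairs accepted are
  // decided by getSameOpcode.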
13730  InstructionsState S = getSameOpcode(VL, *TLI);
13731  if (!S.getOpcode())
13732    return false;
13733
13734  Instruction *I0 = cast<Instruction>(S.OpValue);
  // Make sure invalid types (including vector types) are rejected before
  // determining the vectorization factor for scalar instructions.
13737  for (Value *V : VL) {
13738    Type *Ty = V->getType();
13739    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give the user the internal LLVM type name,
      // which may not be useful.
13742      R.getORE()->emit([&]() {
13743        std::string TypeStr;
13744        llvm::raw_string_ostream rso(TypeStr);
13745        Ty->print(rso);
13746        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
13747               << "Cannot SLP vectorize list: type "
               << rso.str() << " is unsupported by vectorizer";
13749      });
13750      return false;
13751    }
13752  }
13753
13754  unsigned Sz = R.getVectorElementSize(I0);
13755  unsigned MinVF = R.getMinVF(Sz);
13756  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
13757  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
13758  if (MaxVF < 2) {
13759    R.getORE()->emit([&]() {
13760      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
13761             << "Cannot SLP vectorize list: vectorization factor "
13762             << "less than 2 is not supported";
13763    });
13764    return false;
13765  }
13766
13767  bool Changed = false;
13768  bool CandidateFound = false;
13769  InstructionCost MinCost = SLPCostThreshold.getValue();
13770  Type *ScalarTy = VL[0]->getType();
13771  if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
13772    ScalarTy = IE->getOperand(1)->getType();
13773
13774  unsigned NextInst = 0, MaxInst = VL.size();
13775  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // No actual vectorization should happen if the number of parts is the
    // same as the provided vectorization factor (i.e. the scalar type is used
    // for vector code during codegen).
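    // E.g. (illustrative) if the target can only hold one such element per
    // register, the number of parts equals VF and the "vector" would just be
    // decomposed back into scalars, so there is nothing to gain.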
13779    auto *VecTy = FixedVectorType::get(ScalarTy, VF);
13780    if (TTI->getNumberOfParts(VecTy) == VF)
13781      continue;
13782    for (unsigned I = NextInst; I < MaxInst; ++I) {
13783      unsigned ActualVF = std::min(MaxInst - I, VF);
13784
13785      if (!isPowerOf2_32(ActualVF))
13786        continue;
13787
13788      if (MaxVFOnly && ActualVF < MaxVF)
13789        break;
13790      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
13791        break;
13792
13793      ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
13794      // Check that a previous iteration of this loop did not delete the Value.
13795      if (llvm::any_of(Ops, [&R](Value *V) {
13796            auto *I = dyn_cast<Instruction>(V);
13797            return I && R.isDeleted(I);
13798          }))
13799        continue;
13800
13801      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
13802                        << "\n");
13803
13804      R.buildTree(Ops);
13805      if (R.isTreeTinyAndNotFullyVectorizable())
13806        continue;
13807      R.reorderTopToBottom();
13808      R.reorderBottomToTop(
13809          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
13810          !R.doesRootHaveInTreeUses());
13811      R.buildExternalUses();
13812
13813      R.computeMinimumValueSizes();
13814      InstructionCost Cost = R.getTreeCost();
13815      CandidateFound = true;
13816      MinCost = std::min(MinCost, Cost);
13817
13818      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
13819                        << " for VF=" << ActualVF << "\n");
13820      if (Cost < -SLPCostThreshold) {
13821        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));
13827
13828        R.vectorizeTree();
13829        // Move to the next bundle.
13830        I += VF - 1;
13831        NextInst = I + 1;
13832        Changed = true;
13833      }
13834    }
13835  }
13836
13837  if (!Changed && CandidateFound) {
13838    R.getORE()->emit([&]() {
13839      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
13840             << "List vectorization was possible but not beneficial with cost "
13841             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Threshold", -SLPCostThreshold);
13843    });
13844  } else if (!Changed) {
13845    R.getORE()->emit([&]() {
13846      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
13847             << "Cannot SLP vectorize list: vectorization was impossible"
13848             << " with available vectorization factors";
13849    });
13850  }
13851  return Changed;
13852}
13853
13854bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
13855  if (!I)
13856    return false;
13857
13858  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
13859    return false;
13860
13861  Value *P = I->getParent();
13862
13863  // Vectorize in current basic block only.
13864  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
13865  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
13866  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
13867    return false;
13868
13869  // First collect all possible candidates
13870  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
13871  Candidates.emplace_back(Op0, Op1);
13872
13873  auto *A = dyn_cast<BinaryOperator>(Op0);
13874  auto *B = dyn_cast<BinaryOperator>(Op1);
13875  // Try to skip B.
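  // E.g. (illustrative) for I = A + B with B = B0 + B1 having a single use,
  // also consider the pairs {A, B0} and {A, B1}, in case one of them forms a
  // more profitable root than {A, B}.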
13876  if (A && B && B->hasOneUse()) {
13877    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
13878    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
13879    if (B0 && B0->getParent() == P)
13880      Candidates.emplace_back(A, B0);
13881    if (B1 && B1->getParent() == P)
13882      Candidates.emplace_back(A, B1);
13883  }
13884  // Try to skip A.
13885  if (B && A && A->hasOneUse()) {
13886    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
13887    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
13888    if (A0 && A0->getParent() == P)
13889      Candidates.emplace_back(A0, B);
13890    if (A1 && A1->getParent() == P)
13891      Candidates.emplace_back(A1, B);
13892  }
13893
13894  if (Candidates.size() == 1)
13895    return tryToVectorizeList({Op0, Op1}, R);
13896
13897  // We have multiple options. Try to pick the single best.
13898  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
13899  if (!BestCandidate)
13900    return false;
13901  return tryToVectorizeList(
13902      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
13903}
13904
13905namespace {
13906
13907/// Model horizontal reductions.
13908///
13909/// A horizontal reduction is a tree of reduction instructions that has values
13910/// that can be put into a vector as its leaves. For example:
13911///
13912/// mul mul mul mul
13913///  \  /    \  /
13914///   +       +
13915///    \     /
13916///       +
13917/// This tree has "mul" as its leaf values and "+" as its reduction
13918/// instructions. A reduction can feed into a store or a binary operation
13919/// feeding a phi.
13920///    ...
13921///    \  /
13922///     +
13923///     |
13924///  phi +=
13925///
13926///  Or:
13927///    ...
13928///    \  /
13929///     +
13930///     |
13931///   *p =
13932///
13933class HorizontalReduction {
13934  using ReductionOpsType = SmallVector<Value *, 16>;
13935  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
13936  ReductionOpsListType ReductionOps;
13937  /// List of possibly reduced values.
13938  SmallVector<SmallVector<Value *>> ReducedVals;
13939  /// Maps reduced value to the corresponding reduction operation.
13940  DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
13941  // Use map vector to make stable output.
13942  MapVector<Instruction *, Value *> ExtraArgs;
13943  WeakTrackingVH ReductionRoot;
13944  /// The type of reduction operation.
13945  RecurKind RdxKind;
13946  /// Checks if the optimization of original scalar identity operations on
13947  /// matched horizontal reductions is enabled and allowed.
13948  bool IsSupportedHorRdxIdentityOp = false;
13949
13950  static bool isCmpSelMinMax(Instruction *I) {
13951    return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
13952           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
13953  }
13954
13955  // And/or are potentially poison-safe logical patterns like:
13956  // select x, y, false
13957  // select x, true, y
13958  static bool isBoolLogicOp(Instruction *I) {
13959    return isa<SelectInst>(I) &&
13960           (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
13961  }
13962
13963  /// Checks if instruction is associative and can be vectorized.
13964  static bool isVectorizable(RecurKind Kind, Instruction *I) {
13965    if (Kind == RecurKind::None)
13966      return false;
13967
13968    // Integer ops that map to select instructions or intrinsics are fine.
13969    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
13970        isBoolLogicOp(I))
13971      return true;
13972
13973    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
13974      // FP min/max are associative except for NaN and -0.0. We do not
13975      // have to rule out -0.0 here because the intrinsic semantics do not
13976      // specify a fixed result for it.
13977      return I->getFastMathFlags().noNaNs();
13978    }
13979
13980    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
13981      return true;
13982
13983    return I->isAssociative();
13984  }
13985
13986  static Value *getRdxOperand(Instruction *I, unsigned Index) {
13987    // Poison-safe 'or' takes the form: select X, true, Y
13988    // To make that work with the normal operand processing, we skip the
13989    // true value operand.
13990    // TODO: Change the code and data structures to handle this without a hack.
13991    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
13992      return I->getOperand(2);
13993    return I->getOperand(Index);
13994  }
13995
  /// Creates a reduction operation with the current opcode.
13997  static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
13998                         Value *RHS, const Twine &Name, bool UseSelect) {
13999    unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
14000    bool IsConstant = isConstant(LHS) && isConstant(RHS);
14001    switch (Kind) {
14002    case RecurKind::Or:
14003      if (UseSelect &&
14004          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
14005        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
14006      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
14007                                 Name);
14008    case RecurKind::And:
14009      if (UseSelect &&
14010          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
14011        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
14012      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
14013                                 Name);
14014    case RecurKind::Add:
14015    case RecurKind::Mul:
14016    case RecurKind::Xor:
14017    case RecurKind::FAdd:
14018    case RecurKind::FMul:
14019      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
14020                                 Name);
14021    case RecurKind::FMax:
14022      if (IsConstant)
14023        return ConstantFP::get(LHS->getType(),
14024                               maxnum(cast<ConstantFP>(LHS)->getValueAPF(),
14025                                      cast<ConstantFP>(RHS)->getValueAPF()));
14026      return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
14027    case RecurKind::FMin:
14028      if (IsConstant)
14029        return ConstantFP::get(LHS->getType(),
14030                               minnum(cast<ConstantFP>(LHS)->getValueAPF(),
14031                                      cast<ConstantFP>(RHS)->getValueAPF()));
14032      return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
14033    case RecurKind::FMaximum:
14034      if (IsConstant)
14035        return ConstantFP::get(LHS->getType(),
14036                               maximum(cast<ConstantFP>(LHS)->getValueAPF(),
                                       cast<ConstantFP>(RHS)->getValueAPF()));
14038      return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
14039    case RecurKind::FMinimum:
14040      if (IsConstant)
14041        return ConstantFP::get(LHS->getType(),
14042                               minimum(cast<ConstantFP>(LHS)->getValueAPF(),
                                       cast<ConstantFP>(RHS)->getValueAPF()));
14044      return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
14045    case RecurKind::SMax:
14046      if (IsConstant || UseSelect) {
14047        Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
14048        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
14049      }
14050      return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
14051    case RecurKind::SMin:
14052      if (IsConstant || UseSelect) {
14053        Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
14054        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
14055      }
14056      return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
14057    case RecurKind::UMax:
14058      if (IsConstant || UseSelect) {
14059        Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
14060        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
14061      }
14062      return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
14063    case RecurKind::UMin:
14064      if (IsConstant || UseSelect) {
14065        Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
14066        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
14067      }
14068      return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
14069    default:
14070      llvm_unreachable("Unknown reduction operation.");
14071    }
14072  }
14073
  /// Creates a reduction operation with the current opcode, applying the IR
  /// flags from \p ReductionOps and dropping nuw/nsw flags.
14076  static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
14077                         Value *RHS, const Twine &Name,
14078                         const ReductionOpsListType &ReductionOps) {
14079    bool UseSelect =
14080        ReductionOps.size() == 2 ||
14081        // Logical or/and.
14082        (ReductionOps.size() == 1 && any_of(ReductionOps.front(), [](Value *V) {
14083           return isa<SelectInst>(V);
14084         }));
14085    assert((!UseSelect || ReductionOps.size() != 2 ||
14086            isa<SelectInst>(ReductionOps[1][0])) &&
14087           "Expected cmp + select pairs for reduction");
14088    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
14089    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
14090      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
14091        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
14092                         /*IncludeWrapFlags=*/false);
14093        propagateIRFlags(Op, ReductionOps[1], nullptr,
14094                         /*IncludeWrapFlags=*/false);
14095        return Op;
14096      }
14097    }
14098    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
14099    return Op;
14100  }
14101
14102public:
14103  static RecurKind getRdxKind(Value *V) {
14104    auto *I = dyn_cast<Instruction>(V);
14105    if (!I)
14106      return RecurKind::None;
14107    if (match(I, m_Add(m_Value(), m_Value())))
14108      return RecurKind::Add;
14109    if (match(I, m_Mul(m_Value(), m_Value())))
14110      return RecurKind::Mul;
14111    if (match(I, m_And(m_Value(), m_Value())) ||
14112        match(I, m_LogicalAnd(m_Value(), m_Value())))
14113      return RecurKind::And;
14114    if (match(I, m_Or(m_Value(), m_Value())) ||
14115        match(I, m_LogicalOr(m_Value(), m_Value())))
14116      return RecurKind::Or;
14117    if (match(I, m_Xor(m_Value(), m_Value())))
14118      return RecurKind::Xor;
14119    if (match(I, m_FAdd(m_Value(), m_Value())))
14120      return RecurKind::FAdd;
14121    if (match(I, m_FMul(m_Value(), m_Value())))
14122      return RecurKind::FMul;
14123
14124    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
14125      return RecurKind::FMax;
14126    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
14127      return RecurKind::FMin;
14128
14129    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
14130      return RecurKind::FMaximum;
14131    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
14132      return RecurKind::FMinimum;
14133    // This matches either cmp+select or intrinsics. SLP is expected to handle
14134    // either form.
14135    // TODO: If we are canonicalizing to intrinsics, we can remove several
14136    //       special-case paths that deal with selects.
14137    if (match(I, m_SMax(m_Value(), m_Value())))
14138      return RecurKind::SMax;
14139    if (match(I, m_SMin(m_Value(), m_Value())))
14140      return RecurKind::SMin;
14141    if (match(I, m_UMax(m_Value(), m_Value())))
14142      return RecurKind::UMax;
14143    if (match(I, m_UMin(m_Value(), m_Value())))
14144      return RecurKind::UMin;
14145
14146    if (auto *Select = dyn_cast<SelectInst>(I)) {
14147      // Try harder: look for min/max pattern based on instructions producing
14148      // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      // During the intermediate stages of SLP, it's very common to have
      // a pattern like this (since optimizeGatherSequence is run only once
      // at the end):
14152      // %1 = extractelement <2 x i32> %a, i32 0
14153      // %2 = extractelement <2 x i32> %a, i32 1
14154      // %cond = icmp sgt i32 %1, %2
14155      // %3 = extractelement <2 x i32> %a, i32 0
14156      // %4 = extractelement <2 x i32> %a, i32 1
14157      // %select = select i1 %cond, i32 %3, i32 %4
14158      CmpInst::Predicate Pred;
14159      Instruction *L1;
14160      Instruction *L2;
14161
14162      Value *LHS = Select->getTrueValue();
14163      Value *RHS = Select->getFalseValue();
14164      Value *Cond = Select->getCondition();
14165
14166      // TODO: Support inverse predicates.
14167      if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
14168        if (!isa<ExtractElementInst>(RHS) ||
14169            !L2->isIdenticalTo(cast<Instruction>(RHS)))
14170          return RecurKind::None;
14171      } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
14172        if (!isa<ExtractElementInst>(LHS) ||
14173            !L1->isIdenticalTo(cast<Instruction>(LHS)))
14174          return RecurKind::None;
14175      } else {
14176        if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
14177          return RecurKind::None;
14178        if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
14179            !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
14180            !L2->isIdenticalTo(cast<Instruction>(RHS)))
14181          return RecurKind::None;
14182      }
14183
14184      switch (Pred) {
14185      default:
14186        return RecurKind::None;
14187      case CmpInst::ICMP_SGT:
14188      case CmpInst::ICMP_SGE:
14189        return RecurKind::SMax;
14190      case CmpInst::ICMP_SLT:
14191      case CmpInst::ICMP_SLE:
14192        return RecurKind::SMin;
14193      case CmpInst::ICMP_UGT:
14194      case CmpInst::ICMP_UGE:
14195        return RecurKind::UMax;
14196      case CmpInst::ICMP_ULT:
14197      case CmpInst::ICMP_ULE:
14198        return RecurKind::UMin;
14199      }
14200    }
14201    return RecurKind::None;
14202  }
14203
14204  /// Get the index of the first operand.
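  /// For a cmp+select min/max the condition is operand 0, so the reduced
  /// operands start at index 1 (e.g. in "select i1 %c, i32 %a, i32 %b" the
  /// reduced operands are %a and %b); for other reduction operations the
  /// index is 0.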
14205  static unsigned getFirstOperandIndex(Instruction *I) {
14206    return isCmpSelMinMax(I) ? 1 : 0;
14207  }
14208
14209private:
14210  /// Total number of operands in the reduction operation.
14211  static unsigned getNumberOfOperands(Instruction *I) {
14212    return isCmpSelMinMax(I) ? 3 : 2;
14213  }
14214
14215  /// Checks if the instruction is in basic block \p BB.
14216  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
14217  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
14218    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
14219      auto *Sel = cast<SelectInst>(I);
14220      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
14221      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
14222    }
14223    return I->getParent() == BB;
14224  }
14225
  /// Checks if the instruction \p I has the required number of uses for a
  /// reduction operation/reduced value.
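  /// As an illustration (value names hypothetical), in a chained smax
  /// reduction each intermediate select feeds both the compare and the select
  /// of the next step, hence the two expected uses:
  ///   %s1 = select i1 %c1, i32 %x, i32 %y
  ///   %c2 = icmp sgt i32 %s1, %z
  ///   %s2 = select i1 %c2, i32 %s1, i32 %z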
14227  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
14228    if (IsCmpSelMinMax) {
14229      // SelectInst must be used twice while the condition op must have single
14230      // use only.
14231      if (auto *Sel = dyn_cast<SelectInst>(I))
14232        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
14233      return I->hasNUses(2);
14234    }
14235
14236    // Arithmetic reduction operation must be used once only.
14237    return I->hasOneUse();
14238  }
14239
14240  /// Initializes the list of reduction operations.
14241  void initReductionOps(Instruction *I) {
14242    if (isCmpSelMinMax(I))
14243      ReductionOps.assign(2, ReductionOpsType());
14244    else
14245      ReductionOps.assign(1, ReductionOpsType());
14246  }
14247
14248  /// Add all reduction operations for the reduction instruction \p I.
14249  void addReductionOps(Instruction *I) {
14250    if (isCmpSelMinMax(I)) {
14251      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
14252      ReductionOps[1].emplace_back(I);
14253    } else {
14254      ReductionOps[0].emplace_back(I);
14255    }
14256  }
14257
14258  static bool isGoodForReduction(ArrayRef<Value *> Data) {
14259    int Sz = Data.size();
14260    auto *I = dyn_cast<Instruction>(Data.front());
14261    return Sz > 1 || isConstant(Data.front()) ||
14262           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
14263  }
14264
14265public:
14266  HorizontalReduction() = default;
14267
14268  /// Try to find a reduction tree.
14269  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
14270                                 ScalarEvolution &SE, const DataLayout &DL,
14271                                 const TargetLibraryInfo &TLI) {
14272    RdxKind = HorizontalReduction::getRdxKind(Root);
14273    if (!isVectorizable(RdxKind, Root))
14274      return false;
14275
14276    // Analyze "regular" integer/FP types for reductions - no target-specific
14277    // types or pointers.
14278    Type *Ty = Root->getType();
14279    if (!isValidElementType(Ty) || Ty->isPointerTy())
14280      return false;
14281
    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
14284    if (auto *Sel = dyn_cast<SelectInst>(Root))
14285      if (!Sel->getCondition()->hasOneUse())
14286        return false;
14287
14288    ReductionRoot = Root;
14289
14290    // Iterate through all the operands of the possible reduction tree and
14291    // gather all the reduced values, sorting them by their value id.
14292    BasicBlock *BB = Root->getParent();
14293    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
14294    SmallVector<Instruction *> Worklist(1, Root);
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values or as extra arguments,
    // which are not part of the reduction.
14298    auto CheckOperands = [&](Instruction *TreeN,
14299                             SmallVectorImpl<Value *> &ExtraArgs,
14300                             SmallVectorImpl<Value *> &PossibleReducedVals,
14301                             SmallVectorImpl<Instruction *> &ReductionOps) {
14302      for (int I = getFirstOperandIndex(TreeN),
14303               End = getNumberOfOperands(TreeN);
14304           I < End; ++I) {
14305        Value *EdgeVal = getRdxOperand(TreeN, I);
14306        ReducedValsToOps[EdgeVal].push_back(TreeN);
14307        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
14308        // Edge has wrong parent - mark as an extra argument.
14309        if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
14310            !hasSameParent(EdgeInst, BB)) {
14311          ExtraArgs.push_back(EdgeVal);
14312          continue;
14313        }
        // If the edge is not an instruction, uses an opcode different from
        // the main reduction opcode, or has too many uses, treat it as a
        // possible reduced value. Also, do not try to reduce constant values
        // if the operation is not foldable.
14318        if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
14319            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
14320            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
14321            !isVectorizable(RdxKind, EdgeInst) ||
14322            (R.isAnalyzedReductionRoot(EdgeInst) &&
14323             all_of(EdgeInst->operands(), Constant::classof))) {
14324          PossibleReducedVals.push_back(EdgeVal);
14325          continue;
14326        }
14327        ReductionOps.push_back(EdgeInst);
14328      }
14329    };
    // Try to regroup the reduced values so that it gets more profitable to
    // reduce them. Values are grouped by their value ids, instructions by
    // their opcode and/or alternate opcode, plus extra analysis is done for
    // loads (grouping them by the distance between pointers) and cmp
    // instructions (grouping them by the predicate).
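    // E.g. (illustrative) loads from the same underlying pointer at known
    // constant distances tend to share a subkey, as do compares with the same
    // predicate, so they end up in the same candidate group.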
14335    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
14336        PossibleReducedVals;
14337    initReductionOps(Root);
14338    DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
14339    SmallSet<size_t, 2> LoadKeyUsed;
14340    SmallPtrSet<Value *, 4> DoNotReverseVals;
14341
14342    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
14343      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
14344      if (LoadKeyUsed.contains(Key)) {
14345        auto LIt = LoadsMap.find(Ptr);
14346        if (LIt != LoadsMap.end()) {
14347          for (LoadInst *RLI : LIt->second) {
14348            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
14349                                LI->getType(), LI->getPointerOperand(), DL, SE,
14350                                /*StrictCheck=*/true))
14351              return hash_value(RLI->getPointerOperand());
14352          }
14353          for (LoadInst *RLI : LIt->second) {
14354            if (arePointersCompatible(RLI->getPointerOperand(),
14355                                      LI->getPointerOperand(), TLI)) {
14356              hash_code SubKey = hash_value(RLI->getPointerOperand());
14357              DoNotReverseVals.insert(RLI);
14358              return SubKey;
14359            }
14360          }
14361          if (LIt->second.size() > 2) {
14362            hash_code SubKey =
14363                hash_value(LIt->second.back()->getPointerOperand());
14364            DoNotReverseVals.insert(LIt->second.back());
14365            return SubKey;
14366          }
14367        }
14368      }
14369      LoadKeyUsed.insert(Key);
14370      LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
14371      return hash_value(LI->getPointerOperand());
14372    };
14373
14374    while (!Worklist.empty()) {
14375      Instruction *TreeN = Worklist.pop_back_val();
14376      SmallVector<Value *> Args;
14377      SmallVector<Value *> PossibleRedVals;
14378      SmallVector<Instruction *> PossibleReductionOps;
14379      CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
      // If there are too many extra args, mark the instruction itself as a
      // reduction value, not a reduction operation.
14382      if (Args.size() < 2) {
14383        addReductionOps(TreeN);
14384        // Add extra args.
14385        if (!Args.empty()) {
14386          assert(Args.size() == 1 && "Expected only single argument.");
14387          ExtraArgs[TreeN] = Args.front();
14388        }
14389        // Add reduction values. The values are sorted for better vectorization
14390        // results.
14391        for (Value *V : PossibleRedVals) {
14392          size_t Key, Idx;
14393          std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
14394                                                 /*AllowAlternate=*/false);
14395          ++PossibleReducedVals[Key][Idx]
14396                .insert(std::make_pair(V, 0))
14397                .first->second;
14398        }
14399        Worklist.append(PossibleReductionOps.rbegin(),
14400                        PossibleReductionOps.rend());
14401      } else {
14402        size_t Key, Idx;
14403        std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
14404                                               /*AllowAlternate=*/false);
14405        ++PossibleReducedVals[Key][Idx]
14406              .insert(std::make_pair(TreeN, 0))
14407              .first->second;
14408      }
14409    }
14410    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds so that the reduction
    // starts from the longest possible sequences of reduced values.
14413    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
14414      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
14415      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
14416      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
14417           It != E; ++It) {
14418        PossibleRedValsVect.emplace_back();
14419        auto RedValsVect = It->second.takeVector();
14420        stable_sort(RedValsVect, llvm::less_second());
14421        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
14422          PossibleRedValsVect.back().append(Data.second, Data.first);
14423      }
14424      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
14425        return P1.size() > P2.size();
14426      });
14427      int NewIdx = -1;
14428      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
14429        if (isGoodForReduction(Data) ||
14430            (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
14431             isa<LoadInst>(ReducedVals[NewIdx].front()) &&
14432             getUnderlyingObject(
14433                 cast<LoadInst>(Data.front())->getPointerOperand()) ==
14434                 getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front())
14435                                         ->getPointerOperand()))) {
14436          if (NewIdx < 0) {
14437            NewIdx = ReducedVals.size();
14438            ReducedVals.emplace_back();
14439          }
14440          if (DoNotReverseVals.contains(Data.front()))
14441            ReducedVals[NewIdx].append(Data.begin(), Data.end());
14442          else
14443            ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
14444        } else {
14445          ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
14446        }
14447      }
14448    }
    // Sort the groups of reduced values by size, so that the largest groups
    // of values with the same/alternate opcode and/or pointer operand are
    // reduced first.
14451    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
14452      return P1.size() > P2.size();
14453    });
14454    return true;
14455  }
14456
14457  /// Attempt to vectorize the tree found by matchAssociativeReduction.
14458  Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI,
14459                     const TargetLibraryInfo &TLI) {
14460    constexpr int ReductionLimit = 4;
14461    constexpr unsigned RegMaxNumber = 4;
14462    constexpr unsigned RedValsMaxNumber = 128;
14463    // If there are a sufficient number of reduction values, reduce
14464    // to a nearby power-of-2. We can safely generate oversized
14465    // vectors and rely on the backend to split them to legal sizes.
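    // E.g. (illustrative) 24 suitable reduced values would typically be
    // handled as a 16-wide reduction followed by an 8-wide one, subject to
    // the register-size and ReductionLimit checks below.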
14466    unsigned NumReducedVals =
14467        std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
14468                        [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
14469                          if (!isGoodForReduction(Vals))
14470                            return Num;
14471                          return Num + Vals.size();
14472                        });
14473    if (NumReducedVals < ReductionLimit &&
14474        (!AllowHorRdxIdenityOptimization ||
14475         all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
14476           return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
14477         }))) {
14478      for (ReductionOpsType &RdxOps : ReductionOps)
14479        for (Value *RdxOp : RdxOps)
14480          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
14481      return nullptr;
14482    }
14483
14484    IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
14485
    // Track the reduced values in case they are replaced by extractelement
    // instructions because of the vectorization.
14488    DenseMap<Value *, WeakTrackingVH> TrackedVals(
14489        ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
14490    BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
14491    SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
14492    ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
14493    // The same extra argument may be used several times, so log each attempt
14494    // to use it.
14495    for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
14496      assert(Pair.first && "DebugLoc must be set.");
14497      ExternallyUsedValues[Pair.second].push_back(Pair.first);
14498      TrackedVals.try_emplace(Pair.second, Pair.second);
14499    }
14500
14501    // The compare instruction of a min/max is the insertion point for new
14502    // instructions and may be replaced with a new compare instruction.
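    // For example (illustrative IR), given
    //   %cond = icmp sgt i32 %x, %y
    //   %min.max = select i1 %cond, i32 %x, i32 %y
    // new instructions are inserted at %cond rather than at the select.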
14503    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
14504      assert(isa<SelectInst>(RdxRootInst) &&
14505             "Expected min/max reduction to have select root instruction");
14506      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
14507      assert(isa<Instruction>(ScalarCond) &&
14508             "Expected min/max reduction to have compare condition");
14509      return cast<Instruction>(ScalarCond);
14510    };
14511
14512    // Return new VectorizedTree, based on previous value.
14513    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
14514      if (VectorizedTree) {
14515        // Update the final value in the reduction.
14516        Builder.SetCurrentDebugLocation(
14517            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
14518        if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
14519            (isGuaranteedNotToBePoison(Res) &&
14520             !isGuaranteedNotToBePoison(VectorizedTree))) {
14521          auto It = ReducedValsToOps.find(Res);
14522          if (It != ReducedValsToOps.end() &&
14523              any_of(It->getSecond(),
14524                     [](Instruction *I) { return isBoolLogicOp(I); }))
14525            std::swap(VectorizedTree, Res);
14526        }
14527
14528        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
14529                        ReductionOps);
14530      }
14531      // Initialize the final value in the reduction.
14532      return Res;
14533    };
14534    bool AnyBoolLogicOp =
14535        any_of(ReductionOps.back(), [](Value *V) {
14536          return isBoolLogicOp(cast<Instruction>(V));
14537        });
14538    // The reduction root is used as the insertion point for new instructions,
14539    // so set it as externally used to prevent it from being deleted.
14540    ExternallyUsedValues[ReductionRoot];
14541    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
14542                                      ReductionOps.front().size());
14543    for (ReductionOpsType &RdxOps : ReductionOps)
14544      for (Value *RdxOp : RdxOps) {
14545        if (!RdxOp)
14546          continue;
14547        IgnoreList.insert(RdxOp);
14548      }
14549    // Intersect the fast-math-flags from all reduction operations.
14550    FastMathFlags RdxFMF;
14551    RdxFMF.set();
14552    for (Value *U : IgnoreList)
14553      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
14554        RdxFMF &= FPMO->getFastMathFlags();
14555    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
14556
    // Need to track the reduced values, as they may be changed during the
    // vectorization of subvectors.
14559    for (ArrayRef<Value *> Candidates : ReducedVals)
14560      for (Value *V : Candidates)
14561        TrackedVals.try_emplace(V, V);
14562
14563    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of the values that were reduced in other trees as part of gather
    // nodes and thus require an extract if fully vectorized in other trees.
14566    SmallPtrSet<Value *, 4> RequiredExtract;
14567    Value *VectorizedTree = nullptr;
14568    bool CheckForReusedReductionOps = false;
14569    // Try to vectorize elements based on their type.
14570    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
14571      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
14572      InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
14573      SmallVector<Value *> Candidates;
14574      Candidates.reserve(2 * OrigReducedVals.size());
14575      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
14576      for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
14577        Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
        // Check if the reduction value was not overridden by the
        // extractelement instruction because of the vectorization and exclude
        // it, if it is not compatible with other values.
        // Also check if the instruction was folded to a constant/other value.
14582        auto *Inst = dyn_cast<Instruction>(RdxVal);
14583        if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
14584             (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
14585            (S.getOpcode() && !Inst))
14586          continue;
14587        Candidates.push_back(RdxVal);
14588        TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
14589      }
14590      bool ShuffledExtracts = false;
14591      // Try to handle shuffled extractelements.
14592      if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
14593          I + 1 < E) {
14594        InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
14595        if (NextS.getOpcode() == Instruction::ExtractElement &&
14596            !NextS.isAltShuffle()) {
14597          SmallVector<Value *> CommonCandidates(Candidates);
14598          for (Value *RV : ReducedVals[I + 1]) {
14599            Value *RdxVal = TrackedVals.find(RV)->second;
            // Check if the reduction value was not overridden by the
14601            // extractelement instruction because of the vectorization and
14602            // exclude it, if it is not compatible with other values.
14603            if (auto *Inst = dyn_cast<Instruction>(RdxVal))
14604              if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
14605                continue;
14606            CommonCandidates.push_back(RdxVal);
14607            TrackedToOrig.try_emplace(RdxVal, RV);
14608          }
14609          SmallVector<int> Mask;
14610          if (isFixedVectorShuffle(CommonCandidates, Mask)) {
14611            ++I;
14612            Candidates.swap(CommonCandidates);
14613            ShuffledExtracts = true;
14614          }
14615        }
14616      }
14617
14618      // Emit code for constant values.
14619      if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
14620          allConstant(Candidates)) {
14621        Value *Res = Candidates.front();
14622        ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
14623        for (Value *VC : ArrayRef(Candidates).drop_front()) {
14624          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
14625          ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
14626          if (auto *ResI = dyn_cast<Instruction>(Res))
14627            V.analyzedReductionRoot(ResI);
14628        }
14629        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
14630        continue;
14631      }
14632
14633      unsigned NumReducedVals = Candidates.size();
14634      if (NumReducedVals < ReductionLimit &&
14635          (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
14636           !isSplat(Candidates)))
14637        continue;
14638
14639      // Check if we support repeated scalar values processing (optimization of
14640      // original scalar identity operations on matched horizontal reductions).
14641      IsSupportedHorRdxIdentityOp =
14642          AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
14643          RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
14644      // Gather same values.
14645      MapVector<Value *, unsigned> SameValuesCounter;
14646      if (IsSupportedHorRdxIdentityOp)
14647        for (Value *V : Candidates)
14648          ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
      // Used to check if the reduced values are used the same number of
      // times. In this case the compiler may produce better code. E.g. if the
      // reduced values are aabbccdd (8 x values), then the first node of the
      // tree will have a node for 4 x abcd + shuffle <4 x abcd>,
      // <0, 0, 1, 1, 2, 2, 3, 3>.
      // Plus, the final reduction will be performed on <8 x aabbccdd>.
      // Instead, the compiler may build the <4 x abcd> tree immediately and
      // emit reduction(4 x abcd) * 2.
      // Currently this only handles add/fadd/xor; and/or/min/max do not
      // require this analysis, and other operations may require an extra
      // profitability estimation.
14659      bool SameScaleFactor = false;
14660      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
14661                              SameValuesCounter.size() != Candidates.size();
14662      if (OptReusedScalars) {
14663        SameScaleFactor =
14664            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
14665             RdxKind == RecurKind::Xor) &&
14666            all_of(drop_begin(SameValuesCounter),
14667                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
14668                     return P.second == SameValuesCounter.front().second;
14669                   });
14670        Candidates.resize(SameValuesCounter.size());
14671        transform(SameValuesCounter, Candidates.begin(),
14672                  [](const auto &P) { return P.first; });
14673        NumReducedVals = Candidates.size();
14674        // Have a reduction of the same element.
14675        if (NumReducedVals == 1) {
14676          Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
14677          unsigned Cnt = SameValuesCounter.lookup(OrigV);
14678          Value *RedVal =
14679              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
14680          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
14681          VectorizedVals.try_emplace(OrigV, Cnt);
14682          continue;
14683        }
14684      }
14685
14686      unsigned MaxVecRegSize = V.getMaxVecRegSize();
14687      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
14688      unsigned MaxElts =
14689          RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
14690
14691      unsigned ReduxWidth = std::min<unsigned>(
14692          llvm::bit_floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
14693      unsigned Start = 0;
14694      unsigned Pos = Start;
      // Restarts the vectorization attempt with a lower vectorization factor.
14696      unsigned PrevReduxWidth = ReduxWidth;
14697      bool CheckForReusedReductionOpsLocal = false;
14698      auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
14699                                  &CheckForReusedReductionOpsLocal,
14700                                  &PrevReduxWidth, &V,
14701                                  &IgnoreList](bool IgnoreVL = false) {
14702        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
14703        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, it is
          // worth trying again with fewer reduction ops.
14706          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
14707        }
14708        ++Pos;
14709        if (Pos < NumReducedVals - ReduxWidth + 1)
14710          return IsAnyRedOpGathered;
14711        Pos = Start;
14712        ReduxWidth /= 2;
14713        return IsAnyRedOpGathered;
14714      };
14715      bool AnyVectorized = false;
14716      while (Pos < NumReducedVals - ReduxWidth + 1 &&
14717             ReduxWidth >= ReductionLimit) {
14718        // Dependency in tree of the reduction ops - drop this attempt, try
14719        // later.
14720        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
14721            Start == 0) {
14722          CheckForReusedReductionOps = true;
14723          break;
14724        }
14725        PrevReduxWidth = ReduxWidth;
14726        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Already analyzed - skip.
14728        if (V.areAnalyzedReductionVals(VL)) {
14729          (void)AdjustReducedVals(/*IgnoreVL=*/true);
14730          continue;
14731        }
14732        // Early exit if any of the reduction values were deleted during
14733        // previous vectorization attempts.
14734        if (any_of(VL, [&V](Value *RedVal) {
14735              auto *RedValI = dyn_cast<Instruction>(RedVal);
14736              if (!RedValI)
14737                return false;
14738              return V.isDeleted(RedValI);
14739            }))
14740          break;
14741        V.buildTree(VL, IgnoreList);
14742        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
14743          if (!AdjustReducedVals())
14744            V.analyzedReductionVals(VL);
14745          continue;
14746        }
14747        if (V.isLoadCombineReductionCandidate(RdxKind)) {
14748          if (!AdjustReducedVals())
14749            V.analyzedReductionVals(VL);
14750          continue;
14751        }
14752        V.reorderTopToBottom();
14753        // No need to reorder the root node at all.
14754        V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep the other extracted reduction values, if they are used in the
        // vectorization trees.
14757        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
14758            ExternallyUsedValues);
14759        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
14760          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
14761            continue;
14762          for (Value *V : ReducedVals[Cnt])
14763            if (isa<Instruction>(V))
14764              LocalExternallyUsedValues[TrackedVals[V]];
14765        }
14766        if (!IsSupportedHorRdxIdentityOp) {
14767          // Number of uses of the candidates in the vector of values.
14768          assert(SameValuesCounter.empty() &&
14769                 "Reused values counter map is not empty");
14770          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
14771            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
14772              continue;
14773            Value *V = Candidates[Cnt];
14774            Value *OrigV = TrackedToOrig.find(V)->second;
14775            ++SameValuesCounter[OrigV];
14776          }
14777        }
14778        SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
14779        // Gather externally used values.
14780        SmallPtrSet<Value *, 4> Visited;
14781        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
14782          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
14783            continue;
14784          Value *RdxVal = Candidates[Cnt];
14785          if (!Visited.insert(RdxVal).second)
14786            continue;
14787          // Check if the scalar was vectorized as part of the vectorization
14788          // tree but not the top node.
14789          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
14790            LocalExternallyUsedValues[RdxVal];
14791            continue;
14792          }
14793          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
14794          unsigned NumOps =
14795              VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
14796          if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
14797            LocalExternallyUsedValues[RdxVal];
14798        }
14799        // Do not need the list of reused scalars in regular mode anymore.
14800        if (!IsSupportedHorRdxIdentityOp)
14801          SameValuesCounter.clear();
14802        for (Value *RdxVal : VL)
14803          if (RequiredExtract.contains(RdxVal))
14804            LocalExternallyUsedValues[RdxVal];
        // Update LocalExternallyUsedValues for the scalars replaced by
        // extractelement instructions.
14807        DenseMap<Value *, Value *> ReplacementToExternal;
14808        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
14809          ReplacementToExternal.try_emplace(Pair.second, Pair.first);
14810        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
14811          Value *Ext = Pair.first;
14812          auto RIt = ReplacementToExternal.find(Ext);
14813          while (RIt != ReplacementToExternal.end()) {
14814            Ext = RIt->second;
14815            RIt = ReplacementToExternal.find(Ext);
14816          }
14817          auto *It = ExternallyUsedValues.find(Ext);
14818          if (It == ExternallyUsedValues.end())
14819            continue;
14820          LocalExternallyUsedValues[Pair.second].append(It->second);
14821        }
14822        V.buildExternalUses(LocalExternallyUsedValues);
14823
14824        V.computeMinimumValueSizes();
14825
14826        // Estimate cost.
14827        InstructionCost TreeCost = V.getTreeCost(VL);
14828        InstructionCost ReductionCost =
14829            getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
14830        InstructionCost Cost = TreeCost + ReductionCost;
14831        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
14832                          << " for reduction\n");
14833        if (!Cost.isValid())
14834          return nullptr;
14835        if (Cost >= -SLPCostThreshold) {
14836          V.getORE()->emit([&]() {
14837            return OptimizationRemarkMissed(
14838                       SV_NAME, "HorSLPNotBeneficial",
14839                       ReducedValsToOps.find(VL[0])->second.front())
14840                   << "Vectorizing horizontal reduction is possible "
14841                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
14842                   << " and threshold "
14843                   << ore::NV("Threshold", -SLPCostThreshold);
14844          });
14845          if (!AdjustReducedVals())
14846            V.analyzedReductionVals(VL);
14847          continue;
14848        }
14849
14850        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
14851                          << Cost << ". (HorRdx)\n");
14852        V.getORE()->emit([&]() {
14853          return OptimizationRemark(
14854                     SV_NAME, "VectorizedHorizontalReduction",
14855                     ReducedValsToOps.find(VL[0])->second.front())
14856                 << "Vectorized horizontal reduction with cost "
14857                 << ore::NV("Cost", Cost) << " and with tree size "
14858                 << ore::NV("TreeSize", V.getTreeSize());
14859        });
14860
14861        Builder.setFastMathFlags(RdxFMF);
14862
14863        // Emit a reduction. If the root is a select (min/max idiom), the insert
14864        // point is the compare condition of that select.
14865        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
14866        Instruction *InsertPt = RdxRootInst;
14867        if (IsCmpSelMinMax)
14868          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
14869
14870        // Vectorize a tree.
14871        Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
14872                                                ReplacedExternals, InsertPt);
14873
14874        Builder.SetInsertPoint(InsertPt);
14875
14876        // To prevent poison from leaking across what used to be sequential,
14877        // safe, scalar boolean logic operations, the reduction operand must be
14878        // frozen.
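        // E.g. (illustrative) the scalar chain
        //   %t = select i1 %a, i1 %b, i1 false
        //   %r = select i1 %t, i1 %c, i1 false
        // never lets poison in %c reach %r when %t is false, but a plain i1
        // and-reduction over those values would, hence the freeze.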
14879        if ((isBoolLogicOp(RdxRootInst) ||
14880             (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
14881            !isGuaranteedNotToBePoison(VectorizedRoot))
14882          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
14883
14884        // Emit code to correctly handle reused reduced values, if required.
14885        if (OptReusedScalars && !SameScaleFactor) {
14886          VectorizedRoot =
14887              emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
14888                            SameValuesCounter, TrackedToOrig);
14889        }
14890
14891        Value *ReducedSubTree =
14892            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
14893        if (ReducedSubTree->getType() != VL.front()->getType()) {
14894          ReducedSubTree = Builder.CreateIntCast(
14895              ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
14896                KnownBits Known = computeKnownBits(
14897                    R, cast<Instruction>(ReductionOps.front().front())
14898                           ->getModule()
14899                           ->getDataLayout());
14900                return !Known.isNonNegative();
14901              }));
14902        }
14903
        // Improved analysis for add/fadd/xor reductions with the same scale
        // factor for all operands of the reduction. We can emit scalar ops
        // for them instead.
14907        if (OptReusedScalars && SameScaleFactor)
14908          ReducedSubTree = emitScaleForReusedOps(
14909              ReducedSubTree, Builder, SameValuesCounter.front().second);
14910
14911        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
14912        // Count vectorized reduced values to exclude them from final reduction.
14913        for (Value *RdxVal : VL) {
14914          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
14915          if (IsSupportedHorRdxIdentityOp) {
14916            VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
14917            continue;
14918          }
14919          ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
14920          if (!V.isVectorized(RdxVal))
14921            RequiredExtract.insert(RdxVal);
14922        }
14923        Pos += ReduxWidth;
14924        Start = Pos;
14925        ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
14926        AnyVectorized = true;
14927      }
14928      if (OptReusedScalars && !AnyVectorized) {
14929        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
14930          Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
14931          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
14932          Value *OrigV = TrackedToOrig.find(P.first)->second;
14933          VectorizedVals.try_emplace(OrigV, P.second);
14934        }
14935        continue;
14936      }
14937    }
14938    if (VectorizedTree) {
      // Reorder the operands of a bool logical op into the natural order to
      // avoid possible problems with poison propagation. If it is not possible
      // to reorder (both operands are originally RHS), emit an extra freeze
      // instruction for the LHS operand.
      // I.e., if we have original code like this:
      //   RedOp1 = select i1 ?, i1 LHS, i1 false
      //   RedOp2 = select i1 RHS, i1 ?, i1 false
      // then we swap LHS/RHS to create a new op that matches the poison
      // semantics of the original code.
      //
      // If we have original code like this and both values could be poison:
      //   RedOp1 = select i1 ?, i1 LHS, i1 false
      //   RedOp2 = select i1 ?, i1 RHS, i1 false
      // then we must freeze LHS in the new op.
14955      auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
14956                                                   Instruction *RedOp1,
14957                                                   Instruction *RedOp2,
14958                                                   bool InitStep) {
14959        if (!AnyBoolLogicOp)
14960          return;
14961        if (isBoolLogicOp(RedOp1) &&
14962            ((!InitStep && LHS == VectorizedTree) ||
14963             getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
14964          return;
14965        if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
14966                                      getRdxOperand(RedOp2, 0) == RHS ||
14967                                      isGuaranteedNotToBePoison(RHS))) {
14968          std::swap(LHS, RHS);
14969          return;
14970        }
14971        if (LHS != VectorizedTree)
14972          LHS = Builder.CreateFreeze(LHS);
14973      };
      // Finish the reduction.
      // Need to add the extra arguments and the possible reduction values
      // that were not vectorized.
      // Try to avoid dependencies between the scalar remainders after
      // reductions.
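      // The remainders are combined pairwise, halving the list on each round;
      // e.g., for an add reduction, [r0, r1, r2, r3, r4] becomes
      // [r0+r1, r2+r3, r4] and then [r0+r1+r2+r3, r4] before the final op.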
14979      auto FinalGen =
14980          [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
14981              bool InitStep) {
14982            unsigned Sz = InstVals.size();
14983            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
14984                                                                     Sz % 2);
14985            for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
14986              Instruction *RedOp = InstVals[I + 1].first;
14987              Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
14988              Value *RdxVal1 = InstVals[I].second;
14989              Value *StableRdxVal1 = RdxVal1;
14990              auto It1 = TrackedVals.find(RdxVal1);
14991              if (It1 != TrackedVals.end())
14992                StableRdxVal1 = It1->second;
14993              Value *RdxVal2 = InstVals[I + 1].second;
14994              Value *StableRdxVal2 = RdxVal2;
14995              auto It2 = TrackedVals.find(RdxVal2);
14996              if (It2 != TrackedVals.end())
14997                StableRdxVal2 = It2->second;
14998              // To prevent poison from leaking across what used to be
14999              // sequential, safe, scalar boolean logic operations, the
15000              // reduction operand must be frozen.
15001              FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
15002                                RedOp, InitStep);
15003              Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
15004                                         StableRdxVal2, "op.rdx", ReductionOps);
15005              ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
15006            }
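            // An odd trailing element has no partner in this round; carry it
            // over unchanged.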
15007            if (Sz % 2 == 1)
15008              ExtraReds[Sz / 2] = InstVals.back();
15009            return ExtraReds;
15010          };
15011      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
15012      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
15013                                   VectorizedTree);
15014      SmallPtrSet<Value *, 8> Visited;
15015      for (ArrayRef<Value *> Candidates : ReducedVals) {
15016        for (Value *RdxVal : Candidates) {
15017          if (!Visited.insert(RdxVal).second)
15018            continue;
15019          unsigned NumOps = VectorizedVals.lookup(RdxVal);
15020          for (Instruction *RedOp :
15021               ArrayRef(ReducedValsToOps.find(RdxVal)->second)
15022                   .drop_back(NumOps))
15023            ExtraReductions.emplace_back(RedOp, RdxVal);
15024        }
15025      }
15026      for (auto &Pair : ExternallyUsedValues) {
15027        // Add each externally used value to the final reduction.
15028        for (auto *I : Pair.second)
15029          ExtraReductions.emplace_back(I, Pair.first);
15030      }
15031      // Iterate through all not-vectorized reduction values/extra arguments.
15032      bool InitStep = true;
15033      while (ExtraReductions.size() > 1) {
15034        VectorizedTree = ExtraReductions.front().second;
15035        SmallVector<std::pair<Instruction *, Value *>> NewReds =
15036            FinalGen(ExtraReductions, InitStep);
15037        ExtraReductions.swap(NewReds);
15038        InitStep = false;
15039      }
15040      VectorizedTree = ExtraReductions.front().second;
15041
15042      ReductionRoot->replaceAllUsesWith(VectorizedTree);
15043
15044      // The original scalar reduction is expected to have no remaining
15045      // uses outside the reduction tree itself.  Assert that we got this
15046      // correct, replace internal uses with undef, and mark for eventual
15047      // deletion.
15048#ifndef NDEBUG
15049      SmallSet<Value *, 4> IgnoreSet;
15050      for (ArrayRef<Value *> RdxOps : ReductionOps)
15051        IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
15052#endif
15053      for (ArrayRef<Value *> RdxOps : ReductionOps) {
15054        for (Value *Ignore : RdxOps) {
15055          if (!Ignore)
15056            continue;
15057#ifndef NDEBUG
15058          for (auto *U : Ignore->users()) {
15059            assert(IgnoreSet.count(U) &&
                   "All users must be in the reduction ops list.");
15061          }
15062#endif
15063          if (!Ignore->use_empty()) {
15064            Value *Undef = UndefValue::get(Ignore->getType());
15065            Ignore->replaceAllUsesWith(Undef);
15066          }
15067          V.eraseInstruction(cast<Instruction>(Ignore));
15068        }
15069      }
15070    } else if (!CheckForReusedReductionOps) {
15071      for (ReductionOpsType &RdxOps : ReductionOps)
15072        for (Value *RdxOp : RdxOps)
15073          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
15074    }
15075    return VectorizedTree;
15076  }
15077
15078private:
15079  /// Calculate the cost of a reduction.
15080  InstructionCost getReductionCost(TargetTransformInfo *TTI,
15081                                   ArrayRef<Value *> ReducedVals,
15082                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
15083                                   FastMathFlags FMF) {
15084    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15085    Type *ScalarTy = ReducedVals.front()->getType();
15086    FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
15087    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
15090    bool AllConsts = allConstant(ReducedVals);
15091    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
15092      InstructionCost Cost = 0;
15093      // Scalar cost is repeated for N-1 elements.
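      // (Reducing N scalar values requires only N-1 scalar operations, so the
      // last remaining value does not contribute any cost.)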
15094      int Cnt = ReducedVals.size();
15095      for (Value *RdxVal : ReducedVals) {
15096        if (Cnt == 1)
15097          break;
15098        --Cnt;
15099        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
15100          Cost += GenCostFn();
15101          continue;
15102        }
15103        InstructionCost ScalarCost = 0;
15104        for (User *U : RdxVal->users()) {
15105          auto *RdxOp = cast<Instruction>(U);
15106          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
15107            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
15108            continue;
15109          }
15110          ScalarCost = InstructionCost::getInvalid();
15111          break;
15112        }
15113        if (ScalarCost.isValid())
15114          Cost += ScalarCost;
15115        else
15116          Cost += GenCostFn();
15117      }
15118      return Cost;
15119    };
15120    switch (RdxKind) {
15121    case RecurKind::Add:
15122    case RecurKind::Mul:
15123    case RecurKind::Or:
15124    case RecurKind::And:
15125    case RecurKind::Xor:
15126    case RecurKind::FAdd:
15127    case RecurKind::FMul: {
15128      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
15129      if (!AllConsts)
15130        VectorCost =
15131            TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
15132      ScalarCost = EvaluateScalarCost([&]() {
15133        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
15134      });
15135      break;
15136    }
15137    case RecurKind::FMax:
15138    case RecurKind::FMin:
15139    case RecurKind::FMaximum:
15140    case RecurKind::FMinimum:
15141    case RecurKind::SMax:
15142    case RecurKind::SMin:
15143    case RecurKind::UMax:
15144    case RecurKind::UMin: {
15145      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
15146      if (!AllConsts)
15147        VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
15148      ScalarCost = EvaluateScalarCost([&]() {
15149        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
15150        return TTI->getIntrinsicInstrCost(ICA, CostKind);
15151      });
15152      break;
15153    }
15154    default:
15155      llvm_unreachable("Expected arithmetic or min/max reduction operation");
15156    }
15157
15158    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
15159                      << " for reduction of " << shortBundleName(ReducedVals)
15160                      << " (It is a splitting reduction)\n");
15161    return VectorCost - ScalarCost;
15162  }
15163
15164  /// Emit a horizontal reduction of the vectorized value.
15165  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
15166                       unsigned ReduxWidth, const TargetTransformInfo *TTI) {
15167    assert(VectorizedValue && "Need to have a vectorized tree node");
15168    assert(isPowerOf2_32(ReduxWidth) &&
15169           "We only handle power-of-two reductions for now");
15170    assert(RdxKind != RecurKind::FMulAdd &&
15171           "A call to the llvm.fmuladd intrinsic is not handled yet");
15172
15173    ++NumVectorInstructions;
15174    return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
15175  }
15176
15177  /// Emits optimized code for unique scalar value reused \p Cnt times.
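  /// E.g., an add reduction of a value repeated \p Cnt times is emitted as a
  /// single multiplication by \p Cnt, and an xor of a value repeated an even
  /// number of times folds to zero.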
15178  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
15179                               unsigned Cnt) {
15180    assert(IsSupportedHorRdxIdentityOp &&
15181           "The optimization of matched scalar identity horizontal reductions "
15182           "must be supported.");
15183    switch (RdxKind) {
15184    case RecurKind::Add: {
15185      // res = mul vv, n
15186      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
                        << VectorizedValue << ". (HorRdx)\n");
15189      return Builder.CreateMul(VectorizedValue, Scale);
15190    }
15191    case RecurKind::Xor: {
      // res = n % 2 ? vv : 0
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
15194                        << ". (HorRdx)\n");
15195      if (Cnt % 2 == 0)
15196        return Constant::getNullValue(VectorizedValue->getType());
15197      return VectorizedValue;
15198    }
15199    case RecurKind::FAdd: {
      // res = fmul vv, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
15203                        << VectorizedValue << ". (HorRdx)\n");
15204      return Builder.CreateFMul(VectorizedValue, Scale);
15205    }
15206    case RecurKind::And:
15207    case RecurKind::Or:
15208    case RecurKind::SMax:
15209    case RecurKind::SMin:
15210    case RecurKind::UMax:
15211    case RecurKind::UMin:
15212    case RecurKind::FMax:
15213    case RecurKind::FMin:
15214    case RecurKind::FMaximum:
15215    case RecurKind::FMinimum:
15216      // res = vv
15217      return VectorizedValue;
15218    case RecurKind::Mul:
15219    case RecurKind::FMul:
15220    case RecurKind::FMulAdd:
15221    case RecurKind::IAnyOf:
15222    case RecurKind::FAnyOf:
15223    case RecurKind::None:
15224      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
15225    }
15226    return nullptr;
15227  }
15228
15229  /// Emits actual operation for the scalar identity values, found during
15230  /// horizontal reduction analysis.
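  /// E.g., for an add reduction where the matched scalars have repeat counts
  /// <1, 2, 3>, the vectorized value is multiplied lane-wise by the constant
  /// vector <1, 2, 3> before the final reduction.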
15231  Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
15232                       ArrayRef<Value *> VL,
15233                       const MapVector<Value *, unsigned> &SameValuesCounter,
15234                       const DenseMap<Value *, Value *> &TrackedToOrig) {
15235    assert(IsSupportedHorRdxIdentityOp &&
15236           "The optimization of matched scalar identity horizontal reductions "
15237           "must be supported.");
15238    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
15239    if (VTy->getElementType() != VL.front()->getType()) {
15240      VectorizedValue = Builder.CreateIntCast(
15241          VectorizedValue,
15242          FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
15243          any_of(VL, [&](Value *R) {
15244            KnownBits Known = computeKnownBits(
15245                R, cast<Instruction>(ReductionOps.front().front())
15246                       ->getModule()
15247                       ->getDataLayout());
15248            return !Known.isNonNegative();
15249          }));
15250    }
15251    switch (RdxKind) {
15252    case RecurKind::Add: {
15253      // root = mul prev_root, <1, 1, n, 1>
15254      SmallVector<Constant *> Vals;
15255      for (Value *V : VL) {
15256        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
15257        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
15258      }
15259      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
                        << VectorizedValue << ". (HorRdx)\n");
15262      return Builder.CreateMul(VectorizedValue, Scale);
15263    }
15264    case RecurKind::And:
15265    case RecurKind::Or:
15266      // No need for multiple or/and(s).
15267      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
15268                        << ". (HorRdx)\n");
15269      return VectorizedValue;
15270    case RecurKind::SMax:
15271    case RecurKind::SMin:
15272    case RecurKind::UMax:
15273    case RecurKind::UMin:
15274    case RecurKind::FMax:
15275    case RecurKind::FMin:
15276    case RecurKind::FMaximum:
15277    case RecurKind::FMinimum:
15278      // No need for multiple min/max(s) of the same value.
15279      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
15280                        << ". (HorRdx)\n");
15281      return VectorizedValue;
15282    case RecurKind::Xor: {
      // Replace values with an even number of repeats with 0, since
      // x xor x = 0.
      // E.g. root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf,
      // 6, 7>, if the 4th and 6th elements have an even number of repeats.
15287      SmallVector<int> Mask(
15288          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
15289          PoisonMaskElem);
15290      std::iota(Mask.begin(), Mask.end(), 0);
15291      bool NeedShuffle = false;
15292      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
15293        Value *V = VL[I];
15294        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
15295        if (Cnt % 2 == 0) {
15296          Mask[I] = VF;
15297          NeedShuffle = true;
15298        }
15299      }
15300      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
15301                                              : Mask) dbgs()
15302                                         << I << " ";
15303                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
15304      if (NeedShuffle)
15305        VectorizedValue = Builder.CreateShuffleVector(
15306            VectorizedValue,
15307            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
15308      return VectorizedValue;
15309    }
15310    case RecurKind::FAdd: {
15311      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
15312      SmallVector<Constant *> Vals;
15313      for (Value *V : VL) {
15314        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
15315        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
15316      }
15317      auto *Scale = ConstantVector::get(Vals);
15318      return Builder.CreateFMul(VectorizedValue, Scale);
15319    }
15320    case RecurKind::Mul:
15321    case RecurKind::FMul:
15322    case RecurKind::FMulAdd:
15323    case RecurKind::IAnyOf:
15324    case RecurKind::FAnyOf:
15325    case RecurKind::None:
15326      llvm_unreachable("Unexpected reduction kind for reused scalars.");
15327    }
15328    return nullptr;
15329  }
15330};
15331} // end anonymous namespace
15332
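/// \returns the total number of scalar elements in the homogeneous aggregate
/// (vector, array or struct) built by \p InsertInst, or std::nullopt if the
/// aggregate is not homogeneous or cannot be mapped to a vector.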
15333static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
15334  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
15335    return cast<FixedVectorType>(IE->getType())->getNumElements();
15336
15337  unsigned AggregateSize = 1;
15338  auto *IV = cast<InsertValueInst>(InsertInst);
15339  Type *CurrentType = IV->getType();
15340  do {
15341    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
15342      for (auto *Elt : ST->elements())
15343        if (Elt != ST->getElementType(0)) // check homogeneity
15344          return std::nullopt;
15345      AggregateSize *= ST->getNumElements();
15346      CurrentType = ST->getElementType(0);
15347    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
15348      AggregateSize *= AT->getNumElements();
15349      CurrentType = AT->getElementType();
15350    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
15351      AggregateSize *= VT->getNumElements();
15352      return AggregateSize;
15353    } else if (CurrentType->isSingleValueType()) {
15354      return AggregateSize;
15355    } else {
15356      return std::nullopt;
15357    }
15358  } while (true);
15359}
15360
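/// Walks a chain of insertelement/insertvalue instructions starting at
/// \p LastInsertInst and records, for each computed aggregate index, the
/// inserted operand in \p BuildVectorOpds and the corresponding insert
/// instruction in \p InsertElts.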
15361static void findBuildAggregate_rec(Instruction *LastInsertInst,
15362                                   TargetTransformInfo *TTI,
15363                                   SmallVectorImpl<Value *> &BuildVectorOpds,
15364                                   SmallVectorImpl<Value *> &InsertElts,
15365                                   unsigned OperandOffset) {
15366  do {
15367    Value *InsertedOperand = LastInsertInst->getOperand(1);
15368    std::optional<unsigned> OperandIndex =
15369        getInsertIndex(LastInsertInst, OperandOffset);
15370    if (!OperandIndex)
15371      return;
15372    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
15373      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
15374                             BuildVectorOpds, InsertElts, *OperandIndex);
15375
15376    } else {
15377      BuildVectorOpds[*OperandIndex] = InsertedOperand;
15378      InsertElts[*OperandIndex] = LastInsertInst;
15379    }
15380    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
15381  } while (LastInsertInst != nullptr &&
15382           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
15383           LastInsertInst->hasOneUse());
15384}
15385
15386/// Recognize construction of vectors like
15387///  %ra = insertelement <4 x float> poison, float %s0, i32 0
15388///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
15389///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
15390///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
15391///  starting from the last insertelement or insertvalue instruction.
15392///
15393/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
15394/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
15395/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
15396///
15397/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
15398///
15399/// \return true if it matches.
15400static bool findBuildAggregate(Instruction *LastInsertInst,
15401                               TargetTransformInfo *TTI,
15402                               SmallVectorImpl<Value *> &BuildVectorOpds,
15403                               SmallVectorImpl<Value *> &InsertElts) {
15404
15405  assert((isa<InsertElementInst>(LastInsertInst) ||
15406          isa<InsertValueInst>(LastInsertInst)) &&
15407         "Expected insertelement or insertvalue instruction!");
15408
15409  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
15410         "Expected empty result vectors!");
15411
15412  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
15413  if (!AggregateSize)
15414    return false;
15415  BuildVectorOpds.resize(*AggregateSize);
15416  InsertElts.resize(*AggregateSize);
15417
15418  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
15419  llvm::erase(BuildVectorOpds, nullptr);
15420  llvm::erase(InsertElts, nullptr);
15421  if (BuildVectorOpds.size() >= 2)
15422    return true;
15423
15424  return false;
15425}
15426
15427/// Try and get a reduction instruction from a phi node.
15428///
15429/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
15430/// if they come from either \p ParentBB or a containing loop latch.
15431///
15432/// \returns A candidate reduction value if possible, or \code nullptr \endcode
15433/// if not possible.
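/// A typical candidate is a phi forming a loop-carried reduction, e.g.
/// (illustrative IR only):
///   %sum = phi i32 [ 0, %entry ], [ %sum.next, %latch ]
///   %sum.next = add i32 %sum, %x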
15434static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
15435                                      BasicBlock *ParentBB, LoopInfo *LI) {
15436  // There are situations where the reduction value is not dominated by the
15437  // reduction phi. Vectorizing such cases has been reported to cause
15438  // miscompiles. See PR25787.
15439  auto DominatedReduxValue = [&](Value *R) {
15440    return isa<Instruction>(R) &&
15441           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
15442  };
15443
15444  Instruction *Rdx = nullptr;
15445
15446  // Return the incoming value if it comes from the same BB as the phi node.
15447  if (P->getIncomingBlock(0) == ParentBB) {
15448    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
15449  } else if (P->getIncomingBlock(1) == ParentBB) {
15450    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
15451  }
15452
15453  if (Rdx && DominatedReduxValue(Rdx))
15454    return Rdx;
15455
15456  // Otherwise, check whether we have a loop latch to look at.
15457  Loop *BBL = LI->getLoopFor(ParentBB);
15458  if (!BBL)
15459    return nullptr;
15460  BasicBlock *BBLatch = BBL->getLoopLatch();
15461  if (!BBLatch)
15462    return nullptr;
15463
15464  // There is a loop latch, return the incoming value if it comes from
15465  // that. This reduction pattern occasionally turns up.
15466  if (P->getIncomingBlock(0) == BBLatch) {
15467    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
15468  } else if (P->getIncomingBlock(1) == BBLatch) {
15469    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
15470  }
15471
15472  if (Rdx && DominatedReduxValue(Rdx))
15473    return Rdx;
15474
15475  return nullptr;
15476}
15477
15478static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
15479  if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
15480    return true;
15481  if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
15482    return true;
15483  if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
15484    return true;
15485  if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
15486    return true;
15487  if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
15488    return true;
15489  if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
15490    return true;
15491  if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
15492    return true;
15493  if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
15494    return true;
15495  if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
15496    return true;
15497  return false;
15498}
15499
15500/// We could have an initial reduction that is not an add.
15501///  r *= v1 + v2 + v3 + v4
15502/// In such a case start looking for a tree rooted in the first '+'.
/// \returns the new root if found, which may be nullptr if not an
/// instruction.
15504static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
15505                                                 Instruction *Root) {
15506  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
15507          isa<IntrinsicInst>(Root)) &&
15508         "Expected binop, select, or intrinsic for reduction matching");
15509  Value *LHS =
15510      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
15511  Value *RHS =
15512      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
15513  if (LHS == Phi)
15514    return dyn_cast<Instruction>(RHS);
15515  if (RHS == Phi)
15516    return dyn_cast<Instruction>(LHS);
15517  return nullptr;
15518}
15519
/// \returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction, returns nullptr.
15522static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
15523  Value *Op0 = nullptr;
15524  Value *Op1 = nullptr;
15525  if (!matchRdxBop(I, Op0, Op1))
15526    return nullptr;
15527  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
15528}
15529
/// \returns true if \p I is a candidate instruction for reduction
/// vectorization.
15531static bool isReductionCandidate(Instruction *I) {
15532  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
15533  Value *B0 = nullptr, *B1 = nullptr;
15534  bool IsBinop = matchRdxBop(I, B0, B1);
15535  return IsBinop || IsSelect;
15536}
15537
15538bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    TargetTransformInfo *TTI,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
15541  if (!ShouldVectorizeHor)
15542    return false;
15543  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
15544
15545  if (Root->getParent() != BB || isa<PHINode>(Root))
15546    return false;
15547
15548  // If we can find a secondary reduction root, use that instead.
15549  auto SelectRoot = [&]() {
15550    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
15551        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
15552      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
15553        return NewRoot;
15554    return Root;
15555  };
15556
  // Start the analysis from the Root instruction. If a horizontal reduction
  // is found, try to vectorize it. If it is not a horizontal reduction, or
  // vectorization is not possible or not effective, and the currently
  // analyzed instruction is a binary operation, try to vectorize the
  // operands, using pre-order DFS traversal order. If the operands were not
  // vectorized, repeat the same procedure, considering each operand as a
  // possible root of a horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or
  // all sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
  // If a horizontal reduction was not matched or vectorized, collect the
  // instructions for possible later vectorization attempts.
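  // For illustration, a chain such as
  //   %add1 = add i32 %a, %b
  //   %add2 = add i32 %add1, %c
  //   %add3 = add i32 %add2, %d
  // rooted at %add3 is a candidate horizontal add reduction over
  // {%a, %b, %c, %d}.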
15568  std::queue<std::pair<Instruction *, unsigned>> Stack;
15569  Stack.emplace(SelectRoot(), 0);
15570  SmallPtrSet<Value *, 8> VisitedInstrs;
15571  bool Res = false;
15572  auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
15573    if (R.isAnalyzedReductionRoot(Inst))
15574      return nullptr;
15575    if (!isReductionCandidate(Inst))
15576      return nullptr;
15577    HorizontalReduction HorRdx;
15578    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
15579      return nullptr;
15580    return HorRdx.tryToReduce(R, TTI, *TLI);
15581  };
15582  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
15583    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
15584      FutureSeed = getNonPhiOperand(Root, P);
15585      if (!FutureSeed)
15586        return false;
15587    }
15588    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
15589    // analysis is done separately.
15590    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
15591      PostponedInsts.push_back(FutureSeed);
15592    return true;
15593  };
15594
15595  while (!Stack.empty()) {
15596    Instruction *Inst;
15597    unsigned Level;
15598    std::tie(Inst, Level) = Stack.front();
15599    Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while the stack was populated before that happened.
15603    if (R.isDeleted(Inst))
15604      continue;
15605    if (Value *VectorizedV = TryToReduce(Inst)) {
15606      Res = true;
15607      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
15608        // Try to find another reduction.
15609        Stack.emplace(I, Level);
15610        continue;
15611      }
15612    } else {
15613      // We could not vectorize `Inst` so try to use it as a future seed.
15614      if (!TryAppendToPostponedInsts(Inst)) {
15615        assert(Stack.empty() && "Expected empty stack");
15616        break;
15617      }
15618    }
15619
15620    // Try to vectorize operands.
15621    // Continue analysis for the instruction from the same basic block only to
15622    // save compile time.
15623    if (++Level < RecursionMaxDepth)
15624      for (auto *Op : Inst->operand_values())
15625        if (VisitedInstrs.insert(Op).second)
15626          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands; this is done
            // separately.
15629            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
15630                !R.isDeleted(I) && I->getParent() == BB)
15631              Stack.emplace(I, Level);
15632  }
15633  return Res;
15634}
15635
15636bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
15637                                                 BasicBlock *BB, BoUpSLP &R,
15638                                                 TargetTransformInfo *TTI) {
15639  SmallVector<WeakTrackingVH> PostponedInsts;
15640  bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
15641  Res |= tryToVectorize(PostponedInsts, R);
15642  return Res;
15643}
15644
15645bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
15646                                       BoUpSLP &R) {
15647  bool Res = false;
15648  for (Value *V : Insts)
15649    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
15650      Res |= tryToVectorize(Inst, R);
15651  return Res;
15652}
15653
15654bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
15655                                                 BasicBlock *BB, BoUpSLP &R) {
15656  if (!R.canMapToVector(IVI->getType()))
15657    return false;
15658
15659  SmallVector<Value *, 16> BuildVectorOpds;
15660  SmallVector<Value *, 16> BuildVectorInsts;
15661  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
15662    return false;
15663
15664  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in a vector register.
15666  return tryToVectorizeList(BuildVectorOpds, R);
15667}
15668
15669bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
15670                                                   BasicBlock *BB, BoUpSLP &R) {
15671  SmallVector<Value *, 16> BuildVectorInsts;
15672  SmallVector<Value *, 16> BuildVectorOpds;
15673  SmallVector<int> Mask;
15674  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
15675      (llvm::all_of(
15676           BuildVectorOpds,
15677           [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) &&
15678       isFixedVectorShuffle(BuildVectorOpds, Mask)))
15679    return false;
15680
15681  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
15682  return tryToVectorizeList(BuildVectorInsts, R);
15683}
15684
15685template <typename T>
15686static bool tryToVectorizeSequence(
15687    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
15688    function_ref<bool(T *, T *)> AreCompatible,
15689    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
15690    bool MaxVFOnly, BoUpSLP &R) {
15691  bool Changed = false;
15692  // Sort by type, parent, operands.
15693  stable_sort(Incoming, Comparator);
15694
  // Try to vectorize elements based on their type.
15696  SmallVector<T *> Candidates;
15697  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
15698    // Look for the next elements with the same type, parent and operand
15699    // kinds.
15700    auto *SameTypeIt = IncIt;
15701    while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
15702      ++SameTypeIt;
15703
15704    // Try to vectorize them.
15705    unsigned NumElts = (SameTypeIt - IncIt);
15706    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
15707                      << NumElts << ")\n");
    // The vectorization is a 3-step attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes using
    //    the maximal register size first.
    // 2. Try to vectorize the remaining instructions with the same type, if
    //    possible. This may produce better results than vectorizing only
    //    instructions with the same/alternate opcodes.
    // 3. As a final attempt, try to vectorize all instructions with the
    //    same/alternate opcodes only; this may yield some extra final
    //    vectorization.
15717    if (NumElts > 1 &&
15718        TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
15720      Changed = true;
15721    } else {
      /// \returns the minimum number of elements that we will attempt to
      /// vectorize.
15724      auto GetMinNumElements = [&R](Value *V) {
15725        unsigned EltSize = R.getVectorElementSize(V);
15726        return std::max(2U, R.getMaxVecRegSize() / EltSize);
15727      };
15728      if (NumElts < GetMinNumElements(*IncIt) &&
15729          (Candidates.empty() ||
15730           Candidates.front()->getType() == (*IncIt)->getType())) {
15731        Candidates.append(IncIt, std::next(IncIt, NumElts));
15732      }
15733    }
15734    // Final attempt to vectorize instructions with the same types.
15735    if (Candidates.size() > 1 &&
15736        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
15737      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
15739        Changed = true;
15740      } else if (MaxVFOnly) {
15741        // Try to vectorize using small vectors.
15742        for (auto *It = Candidates.begin(), *End = Candidates.end();
15743             It != End;) {
15744          auto *SameTypeIt = It;
15745          while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
15746            ++SameTypeIt;
15747          unsigned NumElts = (SameTypeIt - It);
15748          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
15749                                                  /*MaxVFOnly=*/false))
15750            Changed = true;
15751          It = SameTypeIt;
15752        }
15753      }
15754      Candidates.clear();
15755    }
15756
15757    // Start over at the next instruction of a different type (or the end).
15758    IncIt = SameTypeIt;
15759  }
15760  return Changed;
15761}
15762
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps have the same/swapped predicates and
/// compatible corresponding operands. If IsCompatibility is false, the
/// function implements a strict weak ordering relation between the two cmp
/// instructions, returning true if the first instruction is "less" than the
/// second, i.e. its predicate is less than the predicate of the second, or
/// the operand IDs are less than the operand IDs of the second cmp
/// instruction.
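/// For example, %x = icmp slt i32 %a, %b and %y = icmp sgt i32 %c, %d share
/// the same base predicate once one of the compares is swapped, so they may
/// be treated as compatible if their corresponding operands are compatible
/// (illustrative values only).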
15770template <bool IsCompatibility>
15771static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
15772                       const DominatorTree &DT) {
15773  assert(isValidElementType(V->getType()) &&
15774         isValidElementType(V2->getType()) &&
15775         "Expected valid element types only.");
15776  if (V == V2)
15777    return IsCompatibility;
15778  auto *CI1 = cast<CmpInst>(V);
15779  auto *CI2 = cast<CmpInst>(V2);
15780  if (CI1->getOperand(0)->getType()->getTypeID() <
15781      CI2->getOperand(0)->getType()->getTypeID())
15782    return !IsCompatibility;
15783  if (CI1->getOperand(0)->getType()->getTypeID() >
15784      CI2->getOperand(0)->getType()->getTypeID())
15785    return false;
15786  CmpInst::Predicate Pred1 = CI1->getPredicate();
15787  CmpInst::Predicate Pred2 = CI2->getPredicate();
15788  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
15789  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
15790  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
15791  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
15792  if (BasePred1 < BasePred2)
15793    return !IsCompatibility;
15794  if (BasePred1 > BasePred2)
15795    return false;
15796  // Compare operands.
15797  bool CI1Preds = Pred1 == BasePred1;
15798  bool CI2Preds = Pred2 == BasePred1;
15799  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
15800    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
15801    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
15802    if (Op1 == Op2)
15803      continue;
15804    if (Op1->getValueID() < Op2->getValueID())
15805      return !IsCompatibility;
15806    if (Op1->getValueID() > Op2->getValueID())
15807      return false;
15808    if (auto *I1 = dyn_cast<Instruction>(Op1))
15809      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
15810        if (IsCompatibility) {
15811          if (I1->getParent() != I2->getParent())
15812            return false;
15813        } else {
15814          // Try to compare nodes with same parent.
15815          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
15816          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
15817          if (!NodeI1)
15818            return NodeI2 != nullptr;
15819          if (!NodeI2)
15820            return false;
15821          assert((NodeI1 == NodeI2) ==
15822                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
15823                 "Different nodes should have different DFS numbers");
15824          if (NodeI1 != NodeI2)
15825            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
15826        }
15827        InstructionsState S = getSameOpcode({I1, I2}, TLI);
15828        if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
15829          continue;
15830        if (IsCompatibility)
15831          return false;
15832        if (I1->getOpcode() != I2->getOpcode())
15833          return I1->getOpcode() < I2->getOpcode();
15834      }
15835  }
15836  return IsCompatibility;
15837}
15838
15839template <typename ItT>
15840bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
15841                                          BasicBlock *BB, BoUpSLP &R) {
15842  bool Changed = false;
15843  // Try to find reductions first.
15844  for (CmpInst *I : CmpInsts) {
15845    if (R.isDeleted(I))
15846      continue;
15847    for (Value *Op : I->operands())
15848      if (auto *RootOp = dyn_cast<Instruction>(Op))
15849        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
15850  }
15851  // Try to vectorize operands as vector bundles.
15852  for (CmpInst *I : CmpInsts) {
15853    if (R.isDeleted(I))
15854      continue;
15855    Changed |= tryToVectorize(I, R);
15856  }
15857  // Try to vectorize list of compares.
15858  // Sort by type, compare predicate, etc.
15859  auto CompareSorter = [&](Value *V, Value *V2) {
15860    if (V == V2)
15861      return false;
15862    return compareCmp<false>(V, V2, *TLI, *DT);
15863  };
15864
15865  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
15866    if (V1 == V2)
15867      return true;
15868    return compareCmp<true>(V1, V2, *TLI, *DT);
15869  };
15870
15871  SmallVector<Value *> Vals;
15872  for (Instruction *V : CmpInsts)
15873    if (!R.isDeleted(V) && isValidElementType(V->getType()))
15874      Vals.push_back(V);
15875  if (Vals.size() <= 1)
15876    return Changed;
15877  Changed |= tryToVectorizeSequence<Value>(
15878      Vals, CompareSorter, AreCompatibleCompares,
15879      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
15880        // Exclude possible reductions from other blocks.
15881        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
15882          return any_of(V->users(), [V](User *U) {
15883            auto *Select = dyn_cast<SelectInst>(U);
15884            return Select &&
15885                   Select->getParent() != cast<Instruction>(V)->getParent();
15886          });
15887        });
15888        if (ArePossiblyReducedInOtherBlock)
15889          return false;
15890        return tryToVectorizeList(Candidates, R, MaxVFOnly);
15891      },
15892      /*MaxVFOnly=*/true, R);
15893  return Changed;
15894}
15895
15896bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
15897                                         BasicBlock *BB, BoUpSLP &R) {
15898  assert(all_of(Instructions,
15899                [](auto *I) {
15900                  return isa<InsertElementInst, InsertValueInst>(I);
15901                }) &&
15902         "This function only accepts Insert instructions");
15903  bool OpsChanged = false;
15904  SmallVector<WeakTrackingVH> PostponedInsts;
  // Pass 1: try to vectorize reductions only.
15906  for (auto *I : reverse(Instructions)) {
15907    if (R.isDeleted(I))
15908      continue;
15909    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
15910  }
  // Pass 2: try to match and vectorize a buildvector sequence.
15912  for (auto *I : reverse(Instructions)) {
15913    if (R.isDeleted(I) || isa<CmpInst>(I))
15914      continue;
15915    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
15916      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
15917    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
15918      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
15919    }
15920  }
15921  // Now try to vectorize postponed instructions.
15922  OpsChanged |= tryToVectorize(PostponedInsts, R);
15923
15924  Instructions.clear();
15925  return OpsChanged;
15926}
15927
15928bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
15929  bool Changed = false;
15930  SmallVector<Value *, 4> Incoming;
15931  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. This helps to identify the chains that can be vectorized in a
  // better way.
15935  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
15936  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
15937    assert(isValidElementType(V1->getType()) &&
15938           isValidElementType(V2->getType()) &&
15939           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers; we don't care about other types.
15942    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
15943      return true;
15944    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
15945      return false;
15946    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
15947    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
15948    if (Opcodes1.size() < Opcodes2.size())
15949      return true;
15950    if (Opcodes1.size() > Opcodes2.size())
15951      return false;
15952    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
15953      // Undefs are compatible with any other value.
15954      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) {
15955        if (isa<Instruction>(Opcodes1[I]))
15956          return true;
15957        if (isa<Instruction>(Opcodes2[I]))
15958          return false;
15959        if (isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]))
15960          return true;
15961        if (isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]))
15962          return false;
15963        if (isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]))
15964          continue;
15965        return isa<UndefValue>(Opcodes2[I]);
15966      }
15967      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
15968        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
15969          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
15970          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
15971          if (!NodeI1)
15972            return NodeI2 != nullptr;
15973          if (!NodeI2)
15974            return false;
15975          assert((NodeI1 == NodeI2) ==
15976                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
15977                 "Different nodes should have different DFS numbers");
15978          if (NodeI1 != NodeI2)
15979            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
15980          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
15981          if (S.getOpcode() && !S.isAltShuffle())
15982            continue;
15983          return I1->getOpcode() < I2->getOpcode();
15984        }
15985      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
15986        return Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID();
15987      if (isa<Instruction>(Opcodes1[I]))
15988        return true;
15989      if (isa<Instruction>(Opcodes2[I]))
15990        return false;
15991      if (isa<Constant>(Opcodes1[I]))
15992        return true;
15993      if (isa<Constant>(Opcodes2[I]))
15994        return false;
15995      if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
15996        return true;
15997      if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
15998        return false;
15999    }
16000    return false;
16001  };
16002  auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
16003    if (V1 == V2)
16004      return true;
16005    if (V1->getType() != V2->getType())
16006      return false;
16007    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
16008    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
16009    if (Opcodes1.size() != Opcodes2.size())
16010      return false;
16011    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
16012      // Undefs are compatible with any other value.
16013      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
16014        continue;
16015      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
16016        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
16017          if (I1->getParent() != I2->getParent())
16018            return false;
16019          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
16020          if (S.getOpcode())
16021            continue;
16022          return false;
16023        }
16024      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
16025        continue;
16026      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
16027        return false;
16028    }
16029    return true;
16030  };
16031
16032  bool HaveVectorizedPhiNodes = false;
16033  do {
16034    // Collect the incoming values from the PHIs.
16035    Incoming.clear();
16036    for (Instruction &I : *BB) {
16037      PHINode *P = dyn_cast<PHINode>(&I);
16038      if (!P)
16039        break;
16040
16041      // No need to analyze deleted, vectorized and non-vectorizable
16042      // instructions.
16043      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
16044          isValidElementType(P->getType()))
16045        Incoming.push_back(P);
16046    }
16047
16048    if (Incoming.size() <= 1)
16049      break;
16050
16051    // Find the corresponding non-phi nodes for better matching when trying to
16052    // build the tree.
16053    for (Value *V : Incoming) {
16054      SmallVectorImpl<Value *> &Opcodes =
16055          PHIToOpcodes.try_emplace(V).first->getSecond();
16056      if (!Opcodes.empty())
16057        continue;
16058      SmallVector<Value *, 4> Nodes(1, V);
16059      SmallPtrSet<Value *, 4> Visited;
16060      while (!Nodes.empty()) {
16061        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
16062        if (!Visited.insert(PHI).second)
16063          continue;
16064        for (Value *V : PHI->incoming_values()) {
16065          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
16066            Nodes.push_back(PHI1);
16067            continue;
16068          }
16069          Opcodes.emplace_back(V);
16070        }
16071      }
16072    }
16073
16074    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
16075        Incoming, PHICompare, AreCompatiblePHIs,
16076        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
16077          return tryToVectorizeList(Candidates, R, MaxVFOnly);
16078        },
16079        /*MaxVFOnly=*/true, R);
16080    Changed |= HaveVectorizedPhiNodes;
16081    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
16082  } while (HaveVectorizedPhiNodes);
16083
16084  VisitedInstrs.clear();
16085
16086  InstSetVector PostProcessInserts;
16087  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
  // true, also vectorizes `PostProcessCmps`.
16090  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
16091    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
16092    if (VectorizeCmps) {
16093      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
16094      PostProcessCmps.clear();
16095    }
16096    PostProcessInserts.clear();
16097    return Changed;
16098  };
16099  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
16100  auto IsInPostProcessInstrs = [&](Instruction *I) {
16101    if (auto *Cmp = dyn_cast<CmpInst>(I))
16102      return PostProcessCmps.contains(Cmp);
16103    return isa<InsertElementInst, InsertValueInst>(I) &&
16104           PostProcessInserts.contains(I);
16105  };
  // Returns true if `I` is an instruction without users, like a terminator,
  // a store, or a function call with an ignored return value. Unused
  // instructions are detected based on the instruction type, except for
  // CallInst and InvokeInst, which may have a non-void result type.
16109  auto HasNoUsers = [](Instruction *I) {
16110    return I->use_empty() &&
16111           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
16112  };
16113  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable type. The number of elements is unknown
    // at compile time for scalable types.
16116    if (isa<ScalableVectorType>(It->getType()))
16117      continue;
16118
    // Skip instructions marked for deletion.
16120    if (R.isDeleted(&*It))
16121      continue;
    // We may go through BB multiple times so skip the ones we have already
    // checked.
16123    if (!VisitedInstrs.insert(&*It).second) {
16124      if (HasNoUsers(&*It) &&
16125          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
16128        Changed = true;
16129        It = BB->begin();
16130        E = BB->end();
16131      }
16132      continue;
16133    }
16134
16135    if (isa<DbgInfoIntrinsic>(It))
16136      continue;
16137
16138    // Try to vectorize reductions that use PHINodes.
16139    if (PHINode *P = dyn_cast<PHINode>(It)) {
16140      // Check that the PHI is a reduction PHI.
16141      if (P->getNumIncomingValues() == 2) {
16142        // Try to match and vectorize a horizontal reduction.
16143        Instruction *Root = getReductionInstr(DT, P, BB, LI);
16144        if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
16145          Changed = true;
16146          It = BB->begin();
16147          E = BB->end();
16148          continue;
16149        }
16150      }
16151      // Try to vectorize the incoming values of the PHI, to catch reductions
16152      // that feed into PHIs.
16153      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
16154        // Skip if the incoming block is the current BB for now. Also, bypass
16155        // unreachable IR for efficiency and to avoid crashing.
16156        // TODO: Collect the skipped incoming values and try to vectorize them
16157        // after processing BB.
16158        if (BB == P->getIncomingBlock(I) ||
16159            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
16160          continue;
16161
16162        // Postponed instructions should not be vectorized here, delay their
16163        // vectorization.
16164        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
16165            PI && !IsInPostProcessInstrs(PI))
16166          Changed |= vectorizeRootInstruction(nullptr, PI,
16167                                              P->getIncomingBlock(I), R, TTI);
16168      }
16169      continue;
16170    }
16171
16172    if (HasNoUsers(&*It)) {
16173      bool OpsChanged = false;
16174      auto *SI = dyn_cast<StoreInst>(It);
16175      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
16176      if (SI) {
16177        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize the chain feeding the store, if this is the only
        // store to the address in the block.
        // TODO: This is just a temporary solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
16184        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
16185                              SI->getValueOperand()->hasOneUse();
16186      }
16187      if (TryToVectorizeRoot) {
16188        for (auto *V : It->operand_values()) {
16189          // Postponed instructions should not be vectorized here, delay their
16190          // vectorization.
16191          if (auto *VI = dyn_cast<Instruction>(V);
16192              VI && !IsInPostProcessInstrs(VI))
16193            // Try to match and vectorize a horizontal reduction.
16194            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
16195        }
16196      }
16197      // Start vectorization of post-process list of instructions from the
16198      // top-tree instructions to try to vectorize as many instructions as
16199      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // Start over, since some instructions were deleted and the iterator
        // may have been invalidated.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}

bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold even one element of the
    // index type, we are done. We are trying to vectorize the index
    // computations, so the maximum number of elements is based on the size of
    // the index expression, rather than the size of the GEP itself (the
    // target's pointer size).
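    // For example (illustrative numbers only): with a 128-bit maximum vector
    // register and 32-bit index expressions, MaxElts below is 128 / 32 == 4,
    // so the GEP list is examined in groups of at most four candidates.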
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized (and thus
      // marked as deleted) after we initially collected them, or their index
      // may have been folded to a constant. Remove such candidates from the
      // set.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
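      // For instance (illustrative), the getelementptrs computing &a[i] and
      // &a[i + 1] have a constant SCEV difference, so one address is trivially
      // derivable from the other and both are removed from the candidate set;
      // of two candidates sharing the exact same index value, only the first
      // is kept.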
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
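      // In IR terms (illustrative, invented names), the bundle formed above
      // would contain the index computations
      //
      //   %i0 = sub i64 %a0, %b0
      //   %i1 = sub i64 %a1, %b1
      //
      // feeding independent getelementptrs into "g", and tryToVectorizeList()
      // attempts to turn them into a single vector subtraction.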
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort stores by value-operand type, pointer-operand type and value operand.
  // Value operands must be compatible (same opcode, same parent block);
  // otherwise it is definitely not profitable to try to vectorize them.
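  // The comparator below is only a grouping heuristic: it orders stores by the
  // type IDs of their value and pointer operands and then, for instruction
  // value operands, by the dominator-tree DFS-in number of the defining block,
  // so that stores likely to form a single vectorizable bundle become adjacent
  // after sorting.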
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        if (S.getOpcode())
          return false;
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

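  // Two stores may be placed in the same candidate group only if their value
  // and pointer operands have matching types and, for instruction value
  // operands, the instructions live in the same block and share a common
  // opcode according to getSameOpcode(). Undef value operands are treated as
  // compatible with anything, and two constant value operands are always
  // compatible with each other.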
  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse the stores to perform bottom-to-top analysis. This is important
    // when the same address is stored to several times: in that case we need
    // to follow the store order (reversed, to respect the memory
    // dependencies).
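    // For example (illustrative), when a block stores to the same pointer %p
    // twice, the later store is visited first here so that any vectorization
    // decision respects the order imposed by the memory dependence between the
    // two stores.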
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
