//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores.  Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited.  So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures.  It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers.  Instead, a
// "vector load" loads directly into a series of scalar registers.  In effect,
// extracting the elements of the vector is free.  It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code.  This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
// Overview of the algorithm and terminology in this pass:
//
//  - Break up each basic block into pseudo-BBs, composed of instructions which
//    are guaranteed to transfer control to their successors.
//  - Within a single pseudo-BB, find all loads, and group them into
//    "equivalence classes" according to getUnderlyingObject() and loaded
//    element size.  Do the same for stores.
//  - For each equivalence class, greedily build "chains".  Each chain has a
//    leader instruction, and every other member of the chain has a known
//    constant offset from the first instr in the chain.
//  - Break up chains so that they contain only contiguous accesses of legal
//    size with no intervening may-alias instrs.
//  - Convert each chain to vector instructions.
//
// The O(n^2) behavior of this pass comes from initially building the chains.
// In the worst case we have to compare each new instruction to all of those
// that came before. To limit this, we only calculate the offset to the leaders
// of the N most recently-used chains.
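//
// For example (illustrative IR, not taken from an actual test), the pass
// rewrites
//
//   %x0 = load i32, ptr %p, align 16
//   %p1 = getelementptr inbounds i8, ptr %p, i64 4
//   %x1 = load i32, ptr %p1, align 4
//
// into a single vector load plus (effectively free) extracts:
//
//   %v  = load <2 x i32>, ptr %p, align 16
//   %x0 = extractelement <2 x i32> %v, i32 0
//   %x1 = extractelement <2 x i32> %v, i32 1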

#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <numeric>
#include <optional>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

namespace {

// Equivalence class key, the initial tuple by which we group loads/stores.
// Loads/stores with different EqClassKeys are never merged.
//
// (We could in theory remove element-size from this tuple.  We'd just need
// to fix up the vector packing/unpacking code.)
using EqClassKey =
    std::tuple<const Value * /* result of getUnderlyingObject() */,
               unsigned /* AddrSpace */,
               unsigned /* Load/Store element size bits */,
               char /* IsLoad; char b/c bool can't be a DenseMap key */
               >;
[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                                               const EqClassKey &K) {
  const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
  return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
            << " of element size " << ElementSize << " bits in addrspace "
            << AddrSpace;
}

// A Chain is a set of instructions such that:
//  - All instructions have the same equivalence class, so in particular all are
//    loads, or all are stores.
//  - We know the address accessed by the i'th chain elem relative to the
//    chain's leader instruction, which is the first instr of the chain in BB
//    order.
//
// Chains have two canonical orderings:
//  - BB order, sorted by Instr->comesBefore.
//  - Offset order, sorted by OffsetFromLeader.
// This pass switches back and forth between these orders.
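//
// For example (illustrative): if the leader is `load i32, ptr %p+4` and the
// chain also contains `load i32, ptr %p`, the second load has
// OffsetFromLeader == -4, so it comes later in BB order but first in offset
// order.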
struct ChainElem {
  Instruction *Inst;
  APInt OffsetFromLeader;
};
using Chain = SmallVector<ChainElem, 1>;

void sortChainInBBOrder(Chain &C) {
  sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
}

void sortChainInOffsetOrder(Chain &C) {
  sort(C, [](const auto &A, const auto &B) {
    if (A.OffsetFromLeader != B.OffsetFromLeader)
      return A.OffsetFromLeader.slt(B.OffsetFromLeader);
    return A.Inst->comesBefore(B.Inst); // stable tiebreaker
  });
}

[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
  for (const auto &E : C) {
    dbgs() << "  " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
  }
}

using EquivalenceClassMap =
    MapVector<EqClassKey, SmallVector<Instruction *, 8>>;

// FIXME: Assuming stack alignment of 4 is always good enough
constexpr unsigned StackAdjustedAlignment = 4;

Instruction *propagateMetadata(Instruction *I, const Chain &C) {
  SmallVector<Value *, 8> Values;
  for (const ChainElem &E : C)
    Values.push_back(E.Inst);
  return propagateMetadata(I, Values);
}

bool isInvariantLoad(const Instruction *I) {
  const LoadInst *LI = dyn_cast<LoadInst>(I);
  return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
}

/// Reorders the instructions that I depends on (the instructions defining its
/// operands), to ensure they dominate I.
void reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!IM->comesBefore(I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
    Instruction *IM = &*(BBI++);
    if (!InstructionsToMove.count(IM))
      continue;
    IM->moveBefore(I);
  }
}

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

  // We could erase instrs right after vectorizing them, but that can mess up
  // our BB iterators, and also can make the equivalence class keys point to
  // freed memory.  This is fixable, but it's simpler just to wait until we're
  // done with the BB and erase all at once.
  SmallVector<Instruction *, 128> ToErase;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  static const unsigned MaxDepth = 3;

  /// Runs the vectorizer on a "pseudo basic block", which is a range of
  /// instructions [Begin, End) within one BB all of which have
  /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
  bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);

  /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
  /// in the same BB with the same value for getUnderlyingObject() etc.
  bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
                             ArrayRef<Instruction *> EqClass);

  /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
  /// where all instructions access a known, constant offset from the first
  /// instruction.
  bool runOnChain(Chain &C);

  /// Splits the chain into subchains of instructions which read/write a
  /// contiguous block of memory.  Discards any length-1 subchains (because
  /// there's nothing to vectorize in there).
  std::vector<Chain> splitChainByContiguity(Chain &C);

  /// Splits the chain into subchains where it's safe to hoist loads up to the
  /// beginning of the sub-chain and it's safe to sink stores down to the end
  /// of the sub-chain.  Discards any length-1 subchains.
  std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);

  /// Splits the chain into subchains that make legal, aligned accesses.
  /// Discards any length-1 subchains.
  std::vector<Chain> splitChainByAlignment(Chain &C);

  /// Converts the instrs in the chain into a single vectorized load or store.
  /// Adds the old scalar loads/stores to ToErase.
  bool vectorizeChain(Chain &C);

  /// Tries to compute the offset in bytes PtrB - PtrA.
  std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
                                         Instruction *ContextInst,
                                         unsigned Depth = 0);
  std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
                                                     Instruction *ContextInst,
                                                     unsigned Depth);
  std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
                                                Instruction *ContextInst,
                                                unsigned Depth);

  /// Gets the element type of the vector that the chain will load or store.
  /// This is nontrivial because the chain may contain elements of different
  /// types; e.g. it's legal to have a chain that contains both i32 and float.
  Type *getChainElemTy(const Chain &C);

  /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
  /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
  /// instructions.
  ///
  /// The map ChainElemOffsets must contain all of the elements in
  /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
  /// address.  It's ok if it contains additional entries.
  template <bool IsLoadChain>
  bool isSafeToMove(
      Instruction *ChainElem, Instruction *ChainBegin,
      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);

  /// Collects loads and stores grouped by "equivalence class", where:
  ///   - all elements in an eq class are a load or all are a store,
  ///   - they all load/store the same element size (it's OK to have e.g. i8 and
  ///     <4 x i8> in the same class, but not i32 and <4 x i8>), and
  ///   - they all have the same value for getUnderlyingObject().
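  ///
  /// For example (illustrative): an i8 load and a <4 x i8> load of the same
  /// underlying object in addrspace(3) land in the same class (element size 8
  /// bits), whereas an i32 load of that object lands in a different class.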
  EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
                                                BasicBlock::iterator End);

  /// Partitions Instrs into "chains" where every instruction has a known
  /// constant offset from the first instr in the chain.
  ///
  /// Postcondition: For all i, ret[i][0].OffsetFromLeader == 0, because the
  /// first instr in the chain is the leader, and an instr touches distance 0
  /// from itself.
  std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
};

class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}

bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  return Vectorizer(F, AA, AC, DT, SE, TTI).run();
}

PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);

  bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}

bool Vectorizer::run() {
  bool Changed = false;
  // Break up the BB if there are any instrs which aren't guaranteed to transfer
  // execution to their successor.
  //
  // Consider, for example:
  //
  //   def assert_arr_len(int n) { if (n < 2) exit(); }
  //
  //   load arr[0]
  //   call assert_arr_len(arr.length)
  //   load arr[1]
  //
  // Even though assert_arr_len does not read or write any memory, we can't
  // speculate the second load before the call.  More info at
  // https://github.com/llvm/llvm-project/issues/52950.
  for (BasicBlock *BB : post_order(&F)) {
    // BB must at least have a terminator.
    assert(!BB->empty());

    SmallVector<BasicBlock::iterator, 8> Barriers;
    Barriers.push_back(BB->begin());
    for (Instruction &I : *BB)
      if (!isGuaranteedToTransferExecutionToSuccessor(&I))
        Barriers.push_back(I.getIterator());
    Barriers.push_back(BB->end());

    for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
         ++It)
      Changed |= runOnPseudoBB(*It, *std::next(It));

    for (Instruction *I : ToErase) {
      auto *PtrOperand = getLoadStorePointerOperand(I);
      if (I->use_empty())
        I->eraseFromParent();
      RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
    }
    ToErase.clear();
  }

  return Changed;
}

bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
                               BasicBlock::iterator End) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
    if (End != Begin->getParent()->end())
      dbgs() << *End;
    else
      dbgs() << "<BB end>";
    dbgs() << ")\n";
  });

  bool Changed = false;
  for (const auto &[EqClassKey, EqClass] :
       collectEquivalenceClasses(Begin, End))
    Changed |= runOnEquivalenceClass(EqClassKey, EqClass);

  return Changed;
}

bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
                                       ArrayRef<Instruction *> EqClass) {
  bool Changed = false;

  LLVM_DEBUG({
    dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
           << " keyed on " << EqClassKey << ":\n";
    for (Instruction *I : EqClass)
      dbgs() << "  " << *I << "\n";
  });

  std::vector<Chain> Chains = gatherChains(EqClass);
  LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
                    << " nontrivial chains.\n";);
  for (Chain &C : Chains)
    Changed |= runOnChain(C);
  return Changed;
}

bool Vectorizer::runOnChain(Chain &C) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  // Split up the chain into increasingly smaller chains, until we can finally
  // vectorize the chains.
  //
  // (Don't be scared by the depth of the loop nest here.  These operations are
  // all at worst O(n lg n) in the number of instructions, and splitting chains
  // doesn't change the number of instrs.  So the whole loop nest is O(n lg n).)
  bool Changed = false;
  for (auto &C : splitChainByMayAliasInstrs(C))
    for (auto &C : splitChainByContiguity(C))
      for (auto &C : splitChainByAlignment(C))
        Changed |= vectorizeChain(C);
  return Changed;
}

std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
  if (C.empty())
    return {};

  sortChainInBBOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
    dumpChain(C);
  });

  // We know that elements in the chain with nonoverlapping offsets can't
  // alias, but AA may not be smart enough to figure this out.  Use a
  // hashtable.
  DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
  for (const auto &E : C)
    ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});

  // Loads get hoisted up to the first load in the chain.  Stores get sunk
  // down to the last store in the chain.  Our algorithm for loads is:
  //
  //  - Take the first element of the chain.  This is the start of a new chain.
  //  - Take the next element of `Chain` and check for may-alias instructions
  //    up to the start of NewChain.  If no may-alias instrs, add it to
  //    NewChain.  Otherwise, start a new NewChain.
  //
  // For stores it's the same except in the reverse direction.
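  //
  // For example (illustrative): given `load a[0]; store %v, %b; load a[1]`,
  // if the store may alias %a, the load chain is split between the two loads
  // and the resulting length-1 pieces are discarded.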
  //
  // We expect IsLoad to be an std::bool_constant.
  auto Impl = [&](auto IsLoad) {
    // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
    auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
      if constexpr (IsLoad())
        return std::make_pair(C.begin(), C.end());
      else
        return std::make_pair(C.rbegin(), C.rend());
    }(IsLoad);
    assert(ChainBegin != ChainEnd);

    std::vector<Chain> Chains;
    SmallVector<ChainElem, 1> NewChain;
    NewChain.push_back(*ChainBegin);
    for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
      if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
                               ChainOffsets)) {
        LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
                          << *ChainIt->Inst << " into " << *ChainBegin->Inst
                          << "\n");
        NewChain.push_back(*ChainIt);
      } else {
        LLVM_DEBUG(
            dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
                   << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
        if (NewChain.size() > 1) {
          LLVM_DEBUG({
            dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
            dumpChain(NewChain);
          });
          Chains.push_back(std::move(NewChain));
        }

        // Start a new chain.
        NewChain = SmallVector<ChainElem, 1>({*ChainIt});
      }
    }
    if (NewChain.size() > 1) {
      LLVM_DEBUG({
        dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
        dumpChain(NewChain);
      });
      Chains.push_back(std::move(NewChain));
    }
    return Chains;
  };

  if (isa<LoadInst>(C[0].Inst))
    return Impl(/*IsLoad=*/std::bool_constant<true>());

  assert(isa<StoreInst>(C[0].Inst));
  return Impl(/*IsLoad=*/std::bool_constant<false>());
}

std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
  if (C.empty())
    return {};

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByContiguity considering chain:\n";
    dumpChain(C);
  });

  std::vector<Chain> Ret;
  Ret.push_back({C.front()});

  for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
    // `Prev` accesses offsets [Prev.OffsetFromLeader, PrevReadEnd).
    auto &CurChain = Ret.back();
    const ChainElem &Prev = CurChain.back();
    unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
    assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
                              "collectEquivalenceClasses");
    APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;

    // Add this instruction to the end of the current chain, or start a new one.
    bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
    LLVM_DEBUG(dbgs() << "LSV: Instructions are "
                      << (AreContiguous ? "" : "not ") << "contiguous: "
                      << *Prev.Inst << " (ends at offset " << PrevReadEnd
                      << ") -> " << *It->Inst << " (starts at offset "
                      << It->OffsetFromLeader << ")\n");
    if (AreContiguous)
      CurChain.push_back(*It);
    else
      Ret.push_back({*It});
  }

  // Filter out length-1 chains, these are uninteresting.
  llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
  return Ret;
}

Type *Vectorizer::getChainElemTy(const Chain &C) {
  assert(!C.empty());
  // The rules are:
  //  - If there are any pointer types in the chain, use an integer type.
  //  - Prefer an integer type if it appears in the chain.
  //  - Otherwise, use the first type in the chain.
  //
  // The rule about pointer types is a simplification when we merge e.g.  a load
  // of a ptr and a double.  There's no direct conversion from a ptr to a
  // double; it requires a ptrtoint followed by a bitcast.
  //
  // It's unclear to me if the other rules have any practical effect, but we do
  // it to match this pass's previous behavior.
  if (any_of(C, [](const ChainElem &E) {
        return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
      })) {
    return Type::getIntNTy(
        F.getContext(),
        DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
  }

  for (const ChainElem &E : C)
    if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
      return T;
  return getLoadStoreType(C[0].Inst)->getScalarType();
}

std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
  // We use a simple greedy algorithm.
  //  - Given a chain of length N, find all prefixes that
  //    (a) are not longer than the max register length, and
  //    (b) are a power of 2.
  //  - Starting from the longest prefix, try to create a vector of that length.
  //  - If one of them works, great.  Repeat the algorithm on any remaining
  //    elements in the chain.
  //  - If none of them work, discard the first element and repeat on a chain
  //    of length N-1.
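  //
  // For example (illustrative): with 16-byte vector registers and a chain of
  // six contiguous i32 accesses, the longest candidate starting at element 0
  // covers four elements (16 bytes); if that vectorizes, the remaining two
  // elements are considered as their own candidate chain.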
  if (C.empty())
    return {};

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByAlignment considering chain:\n";
    dumpChain(C);
  });

  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
                             unsigned ChainSizeBytes, VectorType *VecTy) {
    return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
                                                 ChainSizeBytes, VecTy)
                       : TTI.getStoreVectorFactor(VF, LoadStoreSize,
                                                  ChainSizeBytes, VecTy);
  };

#ifndef NDEBUG
  for (const auto &E : C) {
    Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
    assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
           "Should have filtered out non-power-of-two elements in "
           "collectEquivalenceClasses.");
  }
#endif

  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;

  std::vector<Chain> Ret;
  for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
    // Find candidate chains of size not greater than the largest vector reg.
    // These chains are over the closed interval [CBegin, CEnd].
    SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
        CandidateChains;
    for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
      APInt Sz = C[CEnd].OffsetFromLeader +
                 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
                 C[CBegin].OffsetFromLeader;
      if (Sz.sgt(VecRegBytes))
        break;
      CandidateChains.push_back(
          {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
    }

    // Consider the longest chain first.
    for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
         It != End; ++It) {
      auto [CEnd, SizeBytes] = *It;
      LLVM_DEBUG(
          dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
                 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");

      Type *VecElemTy = getChainElemTy(C);
      // Note, VecElemTy is a power of 2, but might be less than one byte.  For
      // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
      // VecElemTy would be i4.
      unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);

      // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
      assert((8 * SizeBytes) % VecElemBits == 0);
      unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
      FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
      unsigned VF = 8 * VecRegBytes / VecElemBits;

      // Check that TTI is happy with this vectorization factor.
      unsigned TargetVF = getVectorFactor(VF, VecElemBits,
                                          VecElemBits * NumVecElems / 8, VecTy);
      if (TargetVF != VF && TargetVF < NumVecElems) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because TargetVF="
                   << TargetVF << " != VF=" << VF
                   << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
        continue;
      }

      // Is a load/store with this alignment allowed by TTI and at least as fast
      // as an unvectorized load/store?
      //
      // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
      auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
                               &F = F](Align Alignment) {
        if (Alignment.value() % SizeBytes == 0)
          return true;
        unsigned VectorizedSpeed = 0;
        bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
            F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
        if (!AllowsMisaligned) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " is misaligned, and therefore can't be vectorized.\n");
          return false;
        }

        unsigned ElementwiseSpeed = 0;
        (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
                                             Alignment, &ElementwiseSpeed);
        if (VectorizedSpeed < ElementwiseSpeed) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " has relative speed " << VectorizedSpeed
                     << ", which is lower than the elementwise speed of "
                     << ElementwiseSpeed
                     << ".  Therefore this access won't be vectorized.\n");
          return false;
        }
        return true;
      };

      // If we're loading/storing from an alloca, align it if possible.
      //
      // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
      // tells us this is beneficial.  This feels a bit odd, but it matches
      // existing tests.  This isn't *so* bad, because at most we align to 4
      // bytes (current value of StackAdjustedAlignment).
      //
      // FIXME: We will upgrade the alignment of the alloca even if it turns out
      // we can't vectorize for some other reason.
      Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
      bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
                            isa<AllocaInst>(PtrOperand->stripPointerCasts());
      Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
      Align PrefAlign = Align(StackAdjustedAlignment);
      if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
          IsAllowedAndFast(PrefAlign)) {
        Align NewAlign = getOrEnforceKnownAlignment(
            PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
        if (NewAlign >= Alignment) {
          LLVM_DEBUG(dbgs()
                     << "LSV: splitChainByAlignment upgrading alloca alignment "
                        "from "
                     << Alignment.value() << " to " << NewAlign.value()
                     << "\n");
          Alignment = NewAlign;
        }
      }

      if (!IsAllowedAndFast(Alignment)) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because its alignment is not AllowedAndFast: "
                   << Alignment.value() << "\n");
        continue;
      }

      if ((IsLoadChain &&
           !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
          (!IsLoadChain &&
           !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because !isLegalToVectorizeLoad/StoreChain.");
        continue;
      }

      // Hooray, we can vectorize this chain!
      Chain &NewChain = Ret.emplace_back();
      for (unsigned I = CBegin; I <= CEnd; ++I)
        NewChain.push_back(C[I]);
      CBegin = CEnd; // Skip over the instructions we've added to the chain.
      break;
    }
  }
  return Ret;
}

bool Vectorizer::vectorizeChain(Chain &C) {
  if (C.size() < 2)
    return false;

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  Type *VecElemTy = getChainElemTy(C);
  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned ChainBytes = std::accumulate(
      C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
        return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
      });
  assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
  // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
  // than 1 byte (e.g. VecTy == <32 x i1>).
  Type *VecTy = FixedVectorType::get(
      VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));

  Align Alignment = getLoadStoreAlignment(C[0].Inst);
  // If this is a load/store of an alloca, we might have upgraded the alloca's
  // alignment earlier.  Get the new alignment.
  if (AS == DL.getAllocaAddrSpace()) {
    Alignment = std::max(
        Alignment,
        getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
                                   MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
  }

  // All elements of the chain must have the same scalar-type size.
#ifndef NDEBUG
  for (const ChainElem &E : C)
    assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
           DL.getTypeStoreSize(VecElemTy));
#endif

  Instruction *VecInst;
  if (IsLoadChain) {
    // Loads get hoisted to the location of the first load in the chain.  We may
    // also need to hoist the (transitive) operands of the loads.
    Builder.SetInsertPoint(
        std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) {
          return A.Inst->comesBefore(B.Inst);
        })->Inst);

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    VecInst = Builder.CreateAlignedLoad(VecTy,
                                        getLoadStorePointerOperand(C[0].Inst),
                                        Alignment);

    unsigned VecIdx = 0;
    for (const ChainElem &E : C) {
      Instruction *I = E.Inst;
      Value *V;
      Type *T = getLoadStoreType(I);
      if (auto *VT = dyn_cast<FixedVectorType>(T)) {
        auto Mask = llvm::to_vector<8>(
            llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
        V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
        VecIdx += VT->getNumElements();
      } else {
        V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
                                         I->getName());
        ++VecIdx;
      }
      if (V->getType() != I->getType())
        V = Builder.CreateBitOrPointerCast(V, I->getType());
      I->replaceAllUsesWith(V);
    }

    // Finally, we need to reorder the instrs in the BB so that the (transitive)
    // operands of VecInst appear before it.  To see why, suppose we have
    // vectorized the following code:
    //
    //   ptr1  = gep a, 1
    //   load1 = load i32 ptr1
    //   ptr0  = gep a, 0
    //   load0 = load i32 ptr0
    //
    // We will put the vectorized load at the location of the earliest load in
    // the BB, i.e. load1.  We get:
    //
    //   ptr1  = gep a, 1
    //   loadv = load <2 x i32> ptr0
    //   load0 = extractelement loadv, 0
    //   load1 = extractelement loadv, 1
    //   ptr0 = gep a, 0
    //
    // Notice that loadv uses ptr0, which is defined *after* it!
    reorder(VecInst);
  } else {
    // Stores get sunk to the location of the last store in the chain.
    Builder.SetInsertPoint(
        std::max_element(C.begin(), C.end(), [](auto &A, auto &B) {
          return A.Inst->comesBefore(B.Inst);
        })->Inst);

    // Build the vector to store.
    Value *Vec = PoisonValue::get(VecTy);
    unsigned VecIdx = 0;
    auto InsertElem = [&](Value *V) {
      if (V->getType() != VecElemTy)
        V = Builder.CreateBitOrPointerCast(V, VecElemTy);
      Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
    };
    for (const ChainElem &E : C) {
      auto I = cast<StoreInst>(E.Inst);
      if (FixedVectorType *VT =
              dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
        for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
          InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
                                                  Builder.getInt32(J)));
        }
      } else {
        InsertElem(I->getValueOperand());
      }
    }

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    VecInst = Builder.CreateAlignedStore(
        Vec,
        getLoadStorePointerOperand(C[0].Inst),
        Alignment);
  }

  propagateMetadata(VecInst, C);

  for (const ChainElem &E : C)
    ToErase.push_back(E.Inst);

  ++NumVectorInstructions;
  NumScalarsVectorized += C.size();
  return true;
}

template <bool IsLoadChain>
bool Vectorizer::isSafeToMove(
    Instruction *ChainElem, Instruction *ChainBegin,
    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
  LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
                    << *ChainBegin << ")\n");

  assert(isa<LoadInst>(ChainElem) == IsLoadChain);
  if (ChainElem == ChainBegin)
    return true;

  // Invariant loads can always be reordered; by definition they are not
  // clobbered by stores.
  if (isInvariantLoad(ChainElem))
    return true;

  auto BBIt = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainElem);
    else
      return BasicBlock::iterator(ChainElem);
  }());
  auto BBItEnd = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainBegin);
    else
      return BasicBlock::iterator(ChainBegin);
  }());

  const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
  const unsigned ChainElemSize =
      DL.getTypeStoreSize(getLoadStoreType(ChainElem));

  for (; BBIt != BBItEnd; ++BBIt) {
    Instruction *I = &*BBIt;

    if (!I->mayReadOrWriteMemory())
      continue;

    // Loads can be reordered with other loads.
    if (IsLoadChain && isa<LoadInst>(I))
      continue;

    // Stores can be sunk below invariant loads.
    if (!IsLoadChain && isInvariantLoad(I))
      continue;

    // If I is in the chain, we can tell whether it aliases ChainElem by
    // checking what offset I accesses.  This may be better than AA is able to
    // do.
    //
    // We should really only have duplicate offsets for stores (the duplicate
    // loads should be CSE'ed), but in case we have a duplicate load, we'll
    // split the chain so we don't have to handle this case specially.
    if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
      // I and ChainElem overlap if:
      //   - I and ChainElem have the same offset, OR
      //   - I's offset is less than ChainElem's, but I touches past the
      //     beginning of ChainElem, OR
      //   - ChainElem's offset is less than I's, but ChainElem touches past the
      //     beginning of I.
      const APInt &IOffset = OffsetIt->second;
      unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
      if (IOffset == ChainElemOffset ||
          (IOffset.sle(ChainElemOffset) &&
           (IOffset + IElemSize).sgt(ChainElemOffset)) ||
          (ChainElemOffset.sle(IOffset) &&
           (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
        LLVM_DEBUG({
          // Double check that AA also sees this alias.  If not, we probably
          // have a bug.
          ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
          assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
          dbgs() << "LSV: Found alias in chain: " << *I << "\n";
        });
        return false; // We found an aliasing instruction; bail.
      }

      continue; // We're confident there's no alias.
    }

    LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
    ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
    if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
      LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
                        << "  Aliasing instruction:\n"
                        << "    " << *I << '\n'
                        << "  Aliased instruction and pointer:\n"
                        << "    " << *ChainElem << '\n'
                        << "    " << *getLoadStorePointerOperand(ChainElem)
                        << '\n');

      return false;
    }
  }
  return true;
}

static bool checkNoWrapFlags(Instruction *I, bool Signed) {
  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
  return (Signed && BinOpI->hasNoSignedWrap()) ||
         (!Signed && BinOpI->hasNoUnsignedWrap());
}

static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {
  LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
                    << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
                    << MatchingOpIdxA << ", AddOpB=" << *AddOpB
                    << ", MatchingOpIdxB=" << MatchingOpIdxB
                    << ", Signed=" << Signed << "\n");
  // If both OpA and OpB are adds with NSW/NUW and one of their operands is the
  // same, we can guarantee that the transformation is safe if we can prove that
  // OpA won't overflow when IdxDiff is added to the other operand of OpA.
  // For example:
  //  %tmp7 = add nsw i32 %tmp2, %v0
  //  %tmp8 = sext i32 %tmp7 to i64
  //  ...
  //  %tmp11 = add nsw i32 %v0, 1
  //  %tmp12 = add nsw i32 %tmp2, %tmp11
  //  %tmp13 = sext i32 %tmp12 to i64
  //
  //  Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
  //  It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
  //  1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
  assert(AddOpA->getOpcode() == Instruction::Add &&
         AddOpB->getOpcode() == Instruction::Add &&
         checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
  if (AddOpA->getOperand(MatchingOpIdxA) ==
      AddOpB->getOperand(MatchingOpIdxB)) {
    Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
    Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}

std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
                    << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
                    << " Depth=" << Depth << "\n");
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return std::nullopt;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return std::nullopt;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return std::nullopt;

  uint64_t Stride = GTIA.getSequentialElementStride(DL);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return std::nullopt;

  bool Signed = isa<SExtInst>(OpA);

  // At this point ValA could be a function parameter, i.e. not an instruction.
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return std::nullopt;

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
  if (IdxDiffSCEV == SE.getCouldNotCompute())
    return std::nullopt;

  ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
  if (!IdxDiffRange.isSingleElement())
    return std::nullopt;
  APInt IdxDiff = *IdxDiffRange.getSingleElement();

  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
                    << "\n");

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is an operand which
    // is the same in those two instructions.  Below we account for possible
    // orders of the operands of these add instructions.
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  //
  // Assuming IdxDiff is positive: if all set bits of IdxDiff or any higher
  // order bit other than the sign bit are known to be zero in ValA, we can add
  // IdxDiff to it while guaranteeing no overflow of any sort.
  //
  // If IdxDiff is negative, do the same, but swap the roles of ValA and OpB.
  if (!Safe) {
    // When computing known bits, use the GEPs as context instructions, since
    // they likely are in the same BB as the load/store.
    KnownBits Known(BitWidth);
    computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
                     ContextInst, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
      return std::nullopt;
    Safe = true;
  }

  if (Safe)
    return IdxDiff * Stride;
  return std::nullopt;
}

std::optional<APInt> Vectorizer::getConstantOffsetSelects(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  if (Depth++ == MaxDepth)
    return std::nullopt;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      if (SelectA->getCondition() != SelectB->getCondition())
        return std::nullopt;
      LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
                        << ", PtrB=" << *PtrB << ", ContextInst="
                        << *ContextInst << ", Depth=" << Depth << "\n");
      std::optional<APInt> TrueDiff = getConstantOffset(
          SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
      if (!TrueDiff.has_value())
        return std::nullopt;
      std::optional<APInt> FalseDiff =
          getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
                            ContextInst, Depth);
      if (TrueDiff == FalseDiff)
        return TrueDiff;
    }
  }
  return std::nullopt;
}

EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                      BasicBlock::iterator End) {
  EquivalenceClassMap Ret;

  auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
    if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
      // The selects themselves are distinct instructions even if they share
      // the same condition and evaluate to consecutive pointers for the true
      // and false values of the condition.  Therefore using the selects
      // themselves for grouping instructions would put consecutive accesses
      // into different lists; they wouldn't even be checked for being
      // consecutive, and wouldn't be vectorized.
      return Sel->getCondition();
    }
    return ObjPtr;
  };

  for (Instruction &I : make_range(Begin, End)) {
    auto *LI = dyn_cast<LoadInst>(&I);
    auto *SI = dyn_cast<StoreInst>(&I);
    if (!LI && !SI)
      continue;

    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
      continue;

    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
        (SI && !TTI.isLegalToVectorizeStore(SI)))
      continue;

    Type *Ty = getLoadStoreType(&I);
    if (!VectorType::isValidElementType(Ty->getScalarType()))
      continue;

    // Skip weird non-byte sizes. They probably aren't worth the effort of
    // handling correctly.
    unsigned TySize = DL.getTypeSizeInBits(Ty);
    if ((TySize % 8) != 0)
      continue;

    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
    // functions currently use an integer type for the vectorized load/store,
    // and do not support casting between the integer type and a vector of
    // pointers (e.g. i64 to <2 x i16*>).
    if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
      continue;

    Value *Ptr = getLoadStorePointerOperand(&I);
    unsigned AS = Ptr->getType()->getPointerAddressSpace();
    unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

    unsigned VF = VecRegSize / TySize;
    VectorType *VecTy = dyn_cast<VectorType>(Ty);

    // Only handle power-of-two sized elements.
    if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
        (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
      continue;

    // No point in looking at these if they're too big to vectorize.
    if (TySize > VecRegSize / 2 ||
        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
      continue;

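    // Bucket key: (underlying object, address space, element bit width,
    // load-vs-store).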
    Ret[{getUnderlyingObject(Ptr), AS,
         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
         /*IsLoad=*/LI != nullptr}]
        .push_back(&I);
  }

  return Ret;
}

std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
  if (Instrs.empty())
    return {};

  unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
  unsigned ASPtrBits = DL.getIndexSizeInBits(AS);

#ifndef NDEBUG
  // Check that Instrs is in BB order and all have the same addr space.
  for (size_t I = 1; I < Instrs.size(); ++I) {
    assert(Instrs[I - 1]->comesBefore(Instrs[I]));
    assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
  }
#endif

  // Machinery to build an MRU-hashtable of Chains.
  //
  // (Ideally this could be done with MapVector, but as currently implemented,
  // moving an element to the front of a MapVector is O(n).)
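  //
  // Instead, we hash (leader, chain) nodes by their leader instruction in a
  // DenseSet and thread them onto an intrusive list kept in most-recently-used
  // order.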
  struct InstrListElem : ilist_node<InstrListElem>,
                         std::pair<Instruction *, Chain> {
    explicit InstrListElem(Instruction *I)
        : std::pair<Instruction *, Chain>(I, {}) {}
  };
  struct InstrListElemDenseMapInfo {
    using PtrInfo = DenseMapInfo<InstrListElem *>;
    using IInfo = DenseMapInfo<Instruction *>;
    static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
    static InstrListElem *getTombstoneKey() {
      return PtrInfo::getTombstoneKey();
    }
    static unsigned getHashValue(const InstrListElem *E) {
      return IInfo::getHashValue(E->first);
    }
    static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
      if (A == getEmptyKey() || B == getEmptyKey())
        return A == getEmptyKey() && B == getEmptyKey();
      if (A == getTombstoneKey() || B == getTombstoneKey())
        return A == getTombstoneKey() && B == getTombstoneKey();
      return IInfo::isEqual(A->first, B->first);
    }
  };
  SpecificBumpPtrAllocator<InstrListElem> Allocator;
  simple_ilist<InstrListElem> MRU;
  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;

  // Compare each instruction in `Instrs` to the leaders of the N most
  // recently-used chains.  This limits the O(n^2) behavior of this pass while
  // also allowing us to build arbitrarily long chains.
  for (Instruction *I : Instrs) {
    constexpr int MaxChainsToTry = 64;

    bool MatchFound = false;
    auto ChainIter = MRU.begin();
    for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
         ++J, ++ChainIter) {
      std::optional<APInt> Offset = getConstantOffset(
          getLoadStorePointerOperand(ChainIter->first),
          getLoadStorePointerOperand(I),
          /*ContextInst=*/
          (ChainIter->first->comesBefore(I) ? I : ChainIter->first));
      if (Offset.has_value()) {
        // `Offset` might not have the expected number of bits, if e.g. AS has a
        // different number of bits than opaque pointers.
        ChainIter->second.push_back(ChainElem{I, Offset.value()});
        // Move ChainIter to the front of the MRU list.
        MRU.remove(*ChainIter);
        MRU.push_front(*ChainIter);
        MatchFound = true;
        break;
      }
    }

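    // No leader of a recently-used chain is a known constant distance from I,
    // so I becomes the leader of a new chain, at offset 0.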
    if (!MatchFound) {
      APInt ZeroOffset(ASPtrBits, 0);
      InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
      E->second.push_back(ChainElem{I, ZeroOffset});
      MRU.push_front(*E);
      Chains.insert(E);
    }
  }

  std::vector<Chain> Ret;
  Ret.reserve(Chains.size());
  // Iterate over MRU rather than Chains so the order is deterministic.
  for (auto &E : MRU)
    if (E.second.size() > 1)
      Ret.push_back(std::move(E.second));
  return Ret;
}

std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
                                                   Instruction *ContextInst,
                                                   unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
                    << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
                    << ", Depth=" << Depth << "\n");
  // We'll ultimately return a value of this bit width, even if computations
  // happen in a different width.
  unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffsetA(OrigBitWidth, 0);
  APInt OffsetB(OrigBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return std::nullopt;

  // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
  // should properly handle a possible overflow and the value should fit into
  // the smallest data type used in the cast/gep chain.
  assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
         OffsetB.getSignificantBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
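  // If stripping the constant offsets exposed the same base pointer, the
  // distance is just the difference of the accumulated offsets.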
  if (PtrA == PtrB)
    return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);

  // Try to compute B - A.
  const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (DistScev != SE.getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
    ConstantRange DistRange = SE.getSignedRange(DistScev);
    if (DistRange.isSingleElement()) {
      // Handle index width (the width of Dist) != pointer width (the width of
      // the Offset*s at this point).
      APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
      return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
    }
  }
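  // SCEV couldn't prove a constant distance, so fall back to analyzing the
  // structure of the addressing expressions themselves.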
  std::optional<APInt> Diff =
      getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth);
  if (Diff.has_value())
    return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
        .sextOrTrunc(OrigBitWidth);
  return std::nullopt;
}
