//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores. Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited. So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures. It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
// "vector load" loads directly into a series of scalar registers. In effect,
// extracting the elements of the vector is free. It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code. This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
// Overview of the algorithm and terminology in this pass:
//
// - Break up each basic block into pseudo-BBs, composed of instructions which
//   are guaranteed to transfer control to their successors.
// - Within a single pseudo-BB, find all loads, and group them into
//   "equivalence classes" according to getUnderlyingObject() and loaded
//   element size. Do the same for stores.
// - For each equivalence class, greedily build "chains". Each chain has a
//   leader instruction, and every other member of the chain has a known
//   constant offset from the first instr in the chain.
// - Break up chains so that they contain only contiguous accesses of legal
//   size with no intervening may-alias instrs.
// - Convert each chain to vector instructions.
//
// The O(n^2) behavior of this pass comes from initially building the chains.
// In the worst case we have to compare each new instruction to all of those
// that came before. To limit this, we only calculate the offset to the leaders
// of the N most recently-used chains.
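//
// For example (illustrative pseudo-IR only; names are made up), four adjacent
// i32 loads such as
//
//   %l0 = load i32, ptr %p0   ; %p0 ... %p3 are consecutive i32 addresses
//   %l1 = load i32, ptr %p1
//   %l2 = load i32, ptr %p2
//   %l3 = load i32, ptr %p3
//
// become, roughly, a single vector load plus extracts:
//
//   %v  = load <4 x i32>, ptr %p0
//   %l0 = extractelement <4 x i32> %v, i32 0
//   %l1 = extractelement <4 x i32> %v, i32 1
//   %l2 = extractelement <4 x i32> %v, i32 2
//   %l3 = extractelement <4 x i32> %v, i32 3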
59 60#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" 61#include "llvm/ADT/APInt.h" 62#include "llvm/ADT/ArrayRef.h" 63#include "llvm/ADT/DenseMap.h" 64#include "llvm/ADT/MapVector.h" 65#include "llvm/ADT/PostOrderIterator.h" 66#include "llvm/ADT/STLExtras.h" 67#include "llvm/ADT/Sequence.h" 68#include "llvm/ADT/SmallPtrSet.h" 69#include "llvm/ADT/SmallVector.h" 70#include "llvm/ADT/Statistic.h" 71#include "llvm/ADT/iterator_range.h" 72#include "llvm/Analysis/AliasAnalysis.h" 73#include "llvm/Analysis/AssumptionCache.h" 74#include "llvm/Analysis/MemoryLocation.h" 75#include "llvm/Analysis/ScalarEvolution.h" 76#include "llvm/Analysis/TargetTransformInfo.h" 77#include "llvm/Analysis/ValueTracking.h" 78#include "llvm/Analysis/VectorUtils.h" 79#include "llvm/IR/Attributes.h" 80#include "llvm/IR/BasicBlock.h" 81#include "llvm/IR/ConstantRange.h" 82#include "llvm/IR/Constants.h" 83#include "llvm/IR/DataLayout.h" 84#include "llvm/IR/DerivedTypes.h" 85#include "llvm/IR/Dominators.h" 86#include "llvm/IR/Function.h" 87#include "llvm/IR/GetElementPtrTypeIterator.h" 88#include "llvm/IR/IRBuilder.h" 89#include "llvm/IR/InstrTypes.h" 90#include "llvm/IR/Instruction.h" 91#include "llvm/IR/Instructions.h" 92#include "llvm/IR/LLVMContext.h" 93#include "llvm/IR/Module.h" 94#include "llvm/IR/Type.h" 95#include "llvm/IR/Value.h" 96#include "llvm/InitializePasses.h" 97#include "llvm/Pass.h" 98#include "llvm/Support/Alignment.h" 99#include "llvm/Support/Casting.h" 100#include "llvm/Support/Debug.h" 101#include "llvm/Support/KnownBits.h" 102#include "llvm/Support/MathExtras.h" 103#include "llvm/Support/ModRef.h" 104#include "llvm/Support/raw_ostream.h" 105#include "llvm/Transforms/Utils/Local.h" 106#include <algorithm> 107#include <cassert> 108#include <cstdint> 109#include <cstdlib> 110#include <iterator> 111#include <numeric> 112#include <optional> 113#include <tuple> 114#include <type_traits> 115#include <utility> 116#include <vector> 117 118using namespace llvm; 119 120#define DEBUG_TYPE "load-store-vectorizer" 121 122STATISTIC(NumVectorInstructions, "Number of vector accesses generated"); 123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized"); 124 125namespace { 126 127// Equivalence class key, the initial tuple by which we group loads/stores. 128// Loads/stores with different EqClassKeys are never merged. 129// 130// (We could in theory remove element-size from the this tuple. We'd just need 131// to fix up the vector packing/unpacking code.) 132using EqClassKey = 133 std::tuple<const Value * /* result of getUnderlyingObject() */, 134 unsigned /* AddrSpace */, 135 unsigned /* Load/Store element size bits */, 136 char /* IsLoad; char b/c bool can't be a DenseMap key */ 137 >; 138[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 139 const EqClassKey &K) { 140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K; 141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject 142 << " of element size " << ElementSize << " bits in addrspace " 143 << AddrSpace; 144} 145 146// A Chain is a set of instructions such that: 147// - All instructions have the same equivalence class, so in particular all are 148// loads, or all are stores. 149// - We know the address accessed by the i'th chain elem relative to the 150// chain's leader instruction, which is the first instr of the chain in BB 151// order. 152// 153// Chains have two canonical orderings: 154// - BB order, sorted by Instr->comesBefore. 
155// - Offset order, sorted by OffsetFromLeader. 156// This pass switches back and forth between these orders. 157struct ChainElem { 158 Instruction *Inst; 159 APInt OffsetFromLeader; 160}; 161using Chain = SmallVector<ChainElem, 1>; 162 163void sortChainInBBOrder(Chain &C) { 164 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); }); 165} 166 167void sortChainInOffsetOrder(Chain &C) { 168 sort(C, [](const auto &A, const auto &B) { 169 if (A.OffsetFromLeader != B.OffsetFromLeader) 170 return A.OffsetFromLeader.slt(B.OffsetFromLeader); 171 return A.Inst->comesBefore(B.Inst); // stable tiebreaker 172 }); 173} 174 175[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) { 176 for (const auto &E : C) { 177 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n"; 178 } 179} 180 181using EquivalenceClassMap = 182 MapVector<EqClassKey, SmallVector<Instruction *, 8>>; 183 184// FIXME: Assuming stack alignment of 4 is always good enough 185constexpr unsigned StackAdjustedAlignment = 4; 186 187Instruction *propagateMetadata(Instruction *I, const Chain &C) { 188 SmallVector<Value *, 8> Values; 189 for (const ChainElem &E : C) 190 Values.push_back(E.Inst); 191 return propagateMetadata(I, Values); 192} 193 194bool isInvariantLoad(const Instruction *I) { 195 const LoadInst *LI = dyn_cast<LoadInst>(I); 196 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load); 197} 198 199/// Reorders the instructions that I depends on (the instructions defining its 200/// operands), to ensure they dominate I. 201void reorder(Instruction *I) { 202 SmallPtrSet<Instruction *, 16> InstructionsToMove; 203 SmallVector<Instruction *, 16> Worklist; 204 205 Worklist.push_back(I); 206 while (!Worklist.empty()) { 207 Instruction *IW = Worklist.pop_back_val(); 208 int NumOperands = IW->getNumOperands(); 209 for (int i = 0; i < NumOperands; i++) { 210 Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i)); 211 if (!IM || IM->getOpcode() == Instruction::PHI) 212 continue; 213 214 // If IM is in another BB, no need to move it, because this pass only 215 // vectorizes instructions within one BB. 216 if (IM->getParent() != I->getParent()) 217 continue; 218 219 if (!IM->comesBefore(I)) { 220 InstructionsToMove.insert(IM); 221 Worklist.push_back(IM); 222 } 223 } 224 } 225 226 // All instructions to move should follow I. Start from I, not from begin(). 227 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) { 228 Instruction *IM = &*(BBI++); 229 if (!InstructionsToMove.count(IM)) 230 continue; 231 IM->moveBefore(I); 232 } 233} 234 235class Vectorizer { 236 Function &F; 237 AliasAnalysis &AA; 238 AssumptionCache &AC; 239 DominatorTree &DT; 240 ScalarEvolution &SE; 241 TargetTransformInfo &TTI; 242 const DataLayout &DL; 243 IRBuilder<> Builder; 244 245 // We could erase instrs right after vectorizing them, but that can mess up 246 // our BB iterators, and also can make the equivalence class keys point to 247 // freed memory. This is fixable, but it's simpler just to wait until we're 248 // done with the BB and erase all at once. 
249 SmallVector<Instruction *, 128> ToErase; 250 251public: 252 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC, 253 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI) 254 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI), 255 DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {} 256 257 bool run(); 258 259private: 260 static const unsigned MaxDepth = 3; 261 262 /// Runs the vectorizer on a "pseudo basic block", which is a range of 263 /// instructions [Begin, End) within one BB all of which have 264 /// isGuaranteedToTransferExecutionToSuccessor(I) == true. 265 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End); 266 267 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores 268 /// in the same BB with the same value for getUnderlyingObject() etc. 269 bool runOnEquivalenceClass(const EqClassKey &EqClassKey, 270 ArrayRef<Instruction *> EqClass); 271 272 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class 273 /// where all instructions access a known, constant offset from the first 274 /// instruction. 275 bool runOnChain(Chain &C); 276 277 /// Splits the chain into subchains of instructions which read/write a 278 /// contiguous block of memory. Discards any length-1 subchains (because 279 /// there's nothing to vectorize in there). 280 std::vector<Chain> splitChainByContiguity(Chain &C); 281 282 /// Splits the chain into subchains where it's safe to hoist loads up to the 283 /// beginning of the sub-chain and it's safe to sink loads up to the end of 284 /// the sub-chain. Discards any length-1 subchains. 285 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C); 286 287 /// Splits the chain into subchains that make legal, aligned accesses. 288 /// Discards any length-1 subchains. 289 std::vector<Chain> splitChainByAlignment(Chain &C); 290 291 /// Converts the instrs in the chain into a single vectorized load or store. 292 /// Adds the old scalar loads/stores to ToErase. 293 bool vectorizeChain(Chain &C); 294 295 /// Tries to compute the offset in bytes PtrB - PtrA. 296 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB, 297 Instruction *ContextInst, 298 unsigned Depth = 0); 299 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB, 300 Instruction *ContextInst, 301 unsigned Depth); 302 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB, 303 Instruction *ContextInst, 304 unsigned Depth); 305 306 /// Gets the element type of the vector that the chain will load or store. 307 /// This is nontrivial because the chain may contain elements of different 308 /// types; e.g. it's legal to have a chain that contains both i32 and float. 309 Type *getChainElemTy(const Chain &C); 310 311 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if 312 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias 313 /// instructions. 314 /// 315 /// The map ChainElemOffsets must contain all of the elements in 316 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base 317 /// address. It's ok if it contains additional entries. 
318 template <bool IsLoadChain> 319 bool isSafeToMove( 320 Instruction *ChainElem, Instruction *ChainBegin, 321 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets); 322 323 /// Collects loads and stores grouped by "equivalence class", where: 324 /// - all elements in an eq class are a load or all are a store, 325 /// - they all load/store the same element size (it's OK to have e.g. i8 and 326 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and 327 /// - they all have the same value for getUnderlyingObject(). 328 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin, 329 BasicBlock::iterator End); 330 331 /// Partitions Instrs into "chains" where every instruction has a known 332 /// constant offset from the first instr in the chain. 333 /// 334 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr 335 /// in the chain is the leader, and an instr touches distance 0 from itself. 336 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs); 337}; 338 339class LoadStoreVectorizerLegacyPass : public FunctionPass { 340public: 341 static char ID; 342 343 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) { 344 initializeLoadStoreVectorizerLegacyPassPass( 345 *PassRegistry::getPassRegistry()); 346 } 347 348 bool runOnFunction(Function &F) override; 349 350 StringRef getPassName() const override { 351 return "GPU Load and Store Vectorizer"; 352 } 353 354 void getAnalysisUsage(AnalysisUsage &AU) const override { 355 AU.addRequired<AAResultsWrapperPass>(); 356 AU.addRequired<AssumptionCacheTracker>(); 357 AU.addRequired<ScalarEvolutionWrapperPass>(); 358 AU.addRequired<DominatorTreeWrapperPass>(); 359 AU.addRequired<TargetTransformInfoWrapperPass>(); 360 AU.setPreservesCFG(); 361 } 362}; 363 364} // end anonymous namespace 365 366char LoadStoreVectorizerLegacyPass::ID = 0; 367 368INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, 369 "Vectorize load and Store instructions", false, false) 370INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) 371INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker); 372INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 373INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 374INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 375INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 376INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, 377 "Vectorize load and store instructions", false, false) 378 379Pass *llvm::createLoadStoreVectorizerPass() { 380 return new LoadStoreVectorizerLegacyPass(); 381} 382 383bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) { 384 // Don't vectorize when the attribute NoImplicitFloat is used. 385 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat)) 386 return false; 387 388 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); 389 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 390 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 391 TargetTransformInfo &TTI = 392 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 393 394 AssumptionCache &AC = 395 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 396 397 return Vectorizer(F, AA, AC, DT, SE, TTI).run(); 398} 399 400PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, 401 FunctionAnalysisManager &AM) { 402 // Don't vectorize when the attribute NoImplicitFloat is used. 
403 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 404 return PreservedAnalyses::all(); 405 406 AliasAnalysis &AA = AM.getResult<AAManager>(F); 407 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); 408 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 409 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); 410 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); 411 412 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run(); 413 PreservedAnalyses PA; 414 PA.preserveSet<CFGAnalyses>(); 415 return Changed ? PA : PreservedAnalyses::all(); 416} 417 418bool Vectorizer::run() { 419 bool Changed = false; 420 // Break up the BB if there are any instrs which aren't guaranteed to transfer 421 // execution to their successor. 422 // 423 // Consider, for example: 424 // 425 // def assert_arr_len(int n) { if (n < 2) exit(); } 426 // 427 // load arr[0] 428 // call assert_array_len(arr.length) 429 // load arr[1] 430 // 431 // Even though assert_arr_len does not read or write any memory, we can't 432 // speculate the second load before the call. More info at 433 // https://github.com/llvm/llvm-project/issues/52950. 434 for (BasicBlock *BB : post_order(&F)) { 435 // BB must at least have a terminator. 436 assert(!BB->empty()); 437 438 SmallVector<BasicBlock::iterator, 8> Barriers; 439 Barriers.push_back(BB->begin()); 440 for (Instruction &I : *BB) 441 if (!isGuaranteedToTransferExecutionToSuccessor(&I)) 442 Barriers.push_back(I.getIterator()); 443 Barriers.push_back(BB->end()); 444 445 for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End; 446 ++It) 447 Changed |= runOnPseudoBB(*It, *std::next(It)); 448 449 for (Instruction *I : ToErase) { 450 auto *PtrOperand = getLoadStorePointerOperand(I); 451 if (I->use_empty()) 452 I->eraseFromParent(); 453 RecursivelyDeleteTriviallyDeadInstructions(PtrOperand); 454 } 455 ToErase.clear(); 456 } 457 458 return Changed; 459} 460 461bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin, 462 BasicBlock::iterator End) { 463 LLVM_DEBUG({ 464 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... "; 465 if (End != Begin->getParent()->end()) 466 dbgs() << *End; 467 else 468 dbgs() << "<BB end>"; 469 dbgs() << ")\n"; 470 }); 471 472 bool Changed = false; 473 for (const auto &[EqClassKey, EqClass] : 474 collectEquivalenceClasses(Begin, End)) 475 Changed |= runOnEquivalenceClass(EqClassKey, EqClass); 476 477 return Changed; 478} 479 480bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey, 481 ArrayRef<Instruction *> EqClass) { 482 bool Changed = false; 483 484 LLVM_DEBUG({ 485 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size() 486 << " keyed on " << EqClassKey << ":\n"; 487 for (Instruction *I : EqClass) 488 dbgs() << " " << *I << "\n"; 489 }); 490 491 std::vector<Chain> Chains = gatherChains(EqClass); 492 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size() 493 << " nontrivial chains.\n";); 494 for (Chain &C : Chains) 495 Changed |= runOnChain(C); 496 return Changed; 497} 498 499bool Vectorizer::runOnChain(Chain &C) { 500 LLVM_DEBUG({ 501 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n"; 502 dumpChain(C); 503 }); 504 505 // Split up the chain into increasingly smaller chains, until we can finally 506 // vectorize the chains. 507 // 508 // (Don't be scared by the depth of the loop nest here. These operations are 509 // all at worst O(n lg n) in the number of instructions, and splitting chains 510 // doesn't change the number of instrs. 
// So the whole loop nest is O(n lg n).)
  bool Changed = false;
  for (auto &C : splitChainByMayAliasInstrs(C))
    for (auto &C : splitChainByContiguity(C))
      for (auto &C : splitChainByAlignment(C))
        Changed |= vectorizeChain(C);
  return Changed;
}

std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
  if (C.empty())
    return {};

  sortChainInBBOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
    dumpChain(C);
  });

  // We know that elements in the chain with non-overlapping offsets can't
  // alias, but AA may not be smart enough to figure this out. Use a
  // hashtable.
  DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
  for (const auto &E : C)
    ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});

  // Loads get hoisted up to the first load in the chain. Stores get sunk
  // down to the last store in the chain. Our algorithm for loads is:
  //
  //  - Take the first element of the chain. This is the start of a new chain.
  //  - Take the next element of `Chain` and check for may-alias instructions
  //    up to the start of NewChain. If no may-alias instrs, add it to
  //    NewChain. Otherwise, start a new NewChain.
  //
  // For stores it's the same except in the reverse direction.
  //
  // We expect IsLoad to be an std::bool_constant.
  auto Impl = [&](auto IsLoad) {
    // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
    auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
      if constexpr (IsLoad())
        return std::make_pair(C.begin(), C.end());
      else
        return std::make_pair(C.rbegin(), C.rend());
    }(IsLoad);
    assert(ChainBegin != ChainEnd);

    std::vector<Chain> Chains;
    SmallVector<ChainElem, 1> NewChain;
    NewChain.push_back(*ChainBegin);
    for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
      if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
                               ChainOffsets)) {
        LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
                          << *ChainIt->Inst << " into " << *ChainBegin->Inst
                          << "\n");
        NewChain.push_back(*ChainIt);
      } else {
        LLVM_DEBUG(
            dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
                   << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
        if (NewChain.size() > 1) {
          LLVM_DEBUG({
            dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
            dumpChain(NewChain);
          });
          Chains.push_back(std::move(NewChain));
        }

        // Start a new chain.
581 NewChain = SmallVector<ChainElem, 1>({*ChainIt}); 582 } 583 } 584 if (NewChain.size() > 1) { 585 LLVM_DEBUG({ 586 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n"; 587 dumpChain(NewChain); 588 }); 589 Chains.push_back(std::move(NewChain)); 590 } 591 return Chains; 592 }; 593 594 if (isa<LoadInst>(C[0].Inst)) 595 return Impl(/*IsLoad=*/std::bool_constant<true>()); 596 597 assert(isa<StoreInst>(C[0].Inst)); 598 return Impl(/*IsLoad=*/std::bool_constant<false>()); 599} 600 601std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) { 602 if (C.empty()) 603 return {}; 604 605 sortChainInOffsetOrder(C); 606 607 LLVM_DEBUG({ 608 dbgs() << "LSV: splitChainByContiguity considering chain:\n"; 609 dumpChain(C); 610 }); 611 612 std::vector<Chain> Ret; 613 Ret.push_back({C.front()}); 614 615 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) { 616 // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd). 617 auto &CurChain = Ret.back(); 618 const ChainElem &Prev = CurChain.back(); 619 unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst)); 620 assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by " 621 "collectEquivalenceClass"); 622 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8; 623 624 // Add this instruction to the end of the current chain, or start a new one. 625 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd; 626 LLVM_DEBUG(dbgs() << "LSV: Instructions are " 627 << (AreContiguous ? "" : "not ") << "contiguous: " 628 << *Prev.Inst << " (ends at offset " << PrevReadEnd 629 << ") -> " << *It->Inst << " (starts at offset " 630 << It->OffsetFromLeader << ")\n"); 631 if (AreContiguous) 632 CurChain.push_back(*It); 633 else 634 Ret.push_back({*It}); 635 } 636 637 // Filter out length-1 chains, these are uninteresting. 638 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; }); 639 return Ret; 640} 641 642Type *Vectorizer::getChainElemTy(const Chain &C) { 643 assert(!C.empty()); 644 // The rules are: 645 // - If there are any pointer types in the chain, use an integer type. 646 // - Prefer an integer type if it appears in the chain. 647 // - Otherwise, use the first type in the chain. 648 // 649 // The rule about pointer types is a simplification when we merge e.g. a load 650 // of a ptr and a double. There's no direct conversion from a ptr to a 651 // double; it requires a ptrtoint followed by a bitcast. 652 // 653 // It's unclear to me if the other rules have any practical effect, but we do 654 // it to match this pass's previous behavior. 655 if (any_of(C, [](const ChainElem &E) { 656 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy(); 657 })) { 658 return Type::getIntNTy( 659 F.getContext(), 660 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType())); 661 } 662 663 for (const ChainElem &E : C) 664 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy()) 665 return T; 666 return getLoadStoreType(C[0].Inst)->getScalarType(); 667} 668 669std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) { 670 // We use a simple greedy algorithm. 671 // - Given a chain of length N, find all prefixes that 672 // (a) are not longer than the max register length, and 673 // (b) are a power of 2. 674 // - Starting from the longest prefix, try to create a vector of that length. 675 // - If one of them works, great. Repeat the algorithm on any remaining 676 // elements in the chain. 
677 // - If none of them work, discard the first element and repeat on a chain 678 // of length N-1. 679 if (C.empty()) 680 return {}; 681 682 sortChainInOffsetOrder(C); 683 684 LLVM_DEBUG({ 685 dbgs() << "LSV: splitChainByAlignment considering chain:\n"; 686 dumpChain(C); 687 }); 688 689 bool IsLoadChain = isa<LoadInst>(C[0].Inst); 690 auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize, 691 unsigned ChainSizeBytes, VectorType *VecTy) { 692 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize, 693 ChainSizeBytes, VecTy) 694 : TTI.getStoreVectorFactor(VF, LoadStoreSize, 695 ChainSizeBytes, VecTy); 696 }; 697 698#ifndef NDEBUG 699 for (const auto &E : C) { 700 Type *Ty = getLoadStoreType(E.Inst)->getScalarType(); 701 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) && 702 "Should have filtered out non-power-of-two elements in " 703 "collectEquivalenceClasses."); 704 } 705#endif 706 707 unsigned AS = getLoadStoreAddressSpace(C[0].Inst); 708 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8; 709 710 std::vector<Chain> Ret; 711 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) { 712 // Find candidate chains of size not greater than the largest vector reg. 713 // These chains are over the closed interval [CBegin, CEnd]. 714 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8> 715 CandidateChains; 716 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) { 717 APInt Sz = C[CEnd].OffsetFromLeader + 718 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) - 719 C[CBegin].OffsetFromLeader; 720 if (Sz.sgt(VecRegBytes)) 721 break; 722 CandidateChains.push_back( 723 {CEnd, static_cast<unsigned>(Sz.getLimitedValue())}); 724 } 725 726 // Consider the longest chain first. 727 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend(); 728 It != End; ++It) { 729 auto [CEnd, SizeBytes] = *It; 730 LLVM_DEBUG( 731 dbgs() << "LSV: splitChainByAlignment considering candidate chain [" 732 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n"); 733 734 Type *VecElemTy = getChainElemTy(C); 735 // Note, VecElemTy is a power of 2, but might be less than one byte. For 736 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case 737 // VecElemTy would be i4. 738 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy); 739 740 // SizeBytes and VecElemBits are powers of 2, so they divide evenly. 741 assert((8 * SizeBytes) % VecElemBits == 0); 742 unsigned NumVecElems = 8 * SizeBytes / VecElemBits; 743 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems); 744 unsigned VF = 8 * VecRegBytes / VecElemBits; 745 746 // Check that TTI is happy with this vectorization factor. 747 unsigned TargetVF = getVectorFactor(VF, VecElemBits, 748 VecElemBits * NumVecElems / 8, VecTy); 749 if (TargetVF != VF && TargetVF < NumVecElems) { 750 LLVM_DEBUG( 751 dbgs() << "LSV: splitChainByAlignment discarding candidate chain " 752 "because TargetVF=" 753 << TargetVF << " != VF=" << VF 754 << " and TargetVF < NumVecElems=" << NumVecElems << "\n"); 755 continue; 756 } 757 758 // Is a load/store with this alignment allowed by TTI and at least as fast 759 // as an unvectorized load/store? 760 // 761 // TTI and F are passed as explicit captures to WAR an MSVC misparse (??). 
762 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI, 763 &F = F](Align Alignment) { 764 if (Alignment.value() % SizeBytes == 0) 765 return true; 766 unsigned VectorizedSpeed = 0; 767 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses( 768 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed); 769 if (!AllowsMisaligned) { 770 LLVM_DEBUG(dbgs() 771 << "LSV: Access of " << SizeBytes << "B in addrspace " 772 << AS << " with alignment " << Alignment.value() 773 << " is misaligned, and therefore can't be vectorized.\n"); 774 return false; 775 } 776 777 unsigned ElementwiseSpeed = 0; 778 (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS, 779 Alignment, &ElementwiseSpeed); 780 if (VectorizedSpeed < ElementwiseSpeed) { 781 LLVM_DEBUG(dbgs() 782 << "LSV: Access of " << SizeBytes << "B in addrspace " 783 << AS << " with alignment " << Alignment.value() 784 << " has relative speed " << VectorizedSpeed 785 << ", which is lower than the elementwise speed of " 786 << ElementwiseSpeed 787 << ". Therefore this access won't be vectorized.\n"); 788 return false; 789 } 790 return true; 791 }; 792 793 // If we're loading/storing from an alloca, align it if possible. 794 // 795 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI 796 // tells us this is beneficial. This feels a bit odd, but it matches 797 // existing tests. This isn't *so* bad, because at most we align to 4 798 // bytes (current value of StackAdjustedAlignment). 799 // 800 // FIXME: We will upgrade the alignment of the alloca even if it turns out 801 // we can't vectorize for some other reason. 802 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst); 803 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() && 804 isa<AllocaInst>(PtrOperand->stripPointerCasts()); 805 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst); 806 Align PrefAlign = Align(StackAdjustedAlignment); 807 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 && 808 IsAllowedAndFast(PrefAlign)) { 809 Align NewAlign = getOrEnforceKnownAlignment( 810 PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT); 811 if (NewAlign >= Alignment) { 812 LLVM_DEBUG(dbgs() 813 << "LSV: splitByChain upgrading alloca alignment from " 814 << Alignment.value() << " to " << NewAlign.value() 815 << "\n"); 816 Alignment = NewAlign; 817 } 818 } 819 820 if (!IsAllowedAndFast(Alignment)) { 821 LLVM_DEBUG( 822 dbgs() << "LSV: splitChainByAlignment discarding candidate chain " 823 "because its alignment is not AllowedAndFast: " 824 << Alignment.value() << "\n"); 825 continue; 826 } 827 828 if ((IsLoadChain && 829 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) || 830 (!IsLoadChain && 831 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) { 832 LLVM_DEBUG( 833 dbgs() << "LSV: splitChainByAlignment discarding candidate chain " 834 "because !isLegalToVectorizeLoad/StoreChain."); 835 continue; 836 } 837 838 // Hooray, we can vectorize this chain! 839 Chain &NewChain = Ret.emplace_back(); 840 for (unsigned I = CBegin; I <= CEnd; ++I) 841 NewChain.push_back(C[I]); 842 CBegin = CEnd; // Skip over the instructions we've added to the chain. 
843 break; 844 } 845 } 846 return Ret; 847} 848 849bool Vectorizer::vectorizeChain(Chain &C) { 850 if (C.size() < 2) 851 return false; 852 853 sortChainInOffsetOrder(C); 854 855 LLVM_DEBUG({ 856 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n"; 857 dumpChain(C); 858 }); 859 860 Type *VecElemTy = getChainElemTy(C); 861 bool IsLoadChain = isa<LoadInst>(C[0].Inst); 862 unsigned AS = getLoadStoreAddressSpace(C[0].Inst); 863 unsigned ChainBytes = std::accumulate( 864 C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) { 865 return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst)); 866 }); 867 assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0); 868 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller 869 // than 1 byte (e.g. VecTy == <32 x i1>). 870 Type *VecTy = FixedVectorType::get( 871 VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy)); 872 873 Align Alignment = getLoadStoreAlignment(C[0].Inst); 874 // If this is a load/store of an alloca, we might have upgraded the alloca's 875 // alignment earlier. Get the new alignment. 876 if (AS == DL.getAllocaAddrSpace()) { 877 Alignment = std::max( 878 Alignment, 879 getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst), 880 MaybeAlign(), DL, C[0].Inst, nullptr, &DT)); 881 } 882 883 // All elements of the chain must have the same scalar-type size. 884#ifndef NDEBUG 885 for (const ChainElem &E : C) 886 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) == 887 DL.getTypeStoreSize(VecElemTy)); 888#endif 889 890 Instruction *VecInst; 891 if (IsLoadChain) { 892 // Loads get hoisted to the location of the first load in the chain. We may 893 // also need to hoist the (transitive) operands of the loads. 894 Builder.SetInsertPoint( 895 std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) { 896 return A.Inst->comesBefore(B.Inst); 897 })->Inst); 898 899 // Chain is in offset order, so C[0] is the instr with the lowest offset, 900 // i.e. the root of the vector. 901 VecInst = Builder.CreateAlignedLoad(VecTy, 902 getLoadStorePointerOperand(C[0].Inst), 903 Alignment); 904 905 unsigned VecIdx = 0; 906 for (const ChainElem &E : C) { 907 Instruction *I = E.Inst; 908 Value *V; 909 Type *T = getLoadStoreType(I); 910 if (auto *VT = dyn_cast<FixedVectorType>(T)) { 911 auto Mask = llvm::to_vector<8>( 912 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements())); 913 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName()); 914 VecIdx += VT->getNumElements(); 915 } else { 916 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx), 917 I->getName()); 918 ++VecIdx; 919 } 920 if (V->getType() != I->getType()) 921 V = Builder.CreateBitOrPointerCast(V, I->getType()); 922 I->replaceAllUsesWith(V); 923 } 924 925 // Finally, we need to reorder the instrs in the BB so that the (transitive) 926 // operands of VecInst appear before it. To see why, suppose we have 927 // vectorized the following code: 928 // 929 // ptr1 = gep a, 1 930 // load1 = load i32 ptr1 931 // ptr0 = gep a, 0 932 // load0 = load i32 ptr0 933 // 934 // We will put the vectorized load at the location of the earliest load in 935 // the BB, i.e. load1. We get: 936 // 937 // ptr1 = gep a, 1 938 // loadv = load <2 x i32> ptr0 939 // load0 = extractelement loadv, 0 940 // load1 = extractelement loadv, 1 941 // ptr0 = gep a, 0 942 // 943 // Notice that loadv uses ptr0, which is defined *after* it! 
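  // reorder(VecInst) fixes this by moving ptr0 (and, transitively, any other
  // out-of-order operands of loadv) to just before loadv, so the snippet
  // above becomes:
  //
  //   ptr1 = gep a, 1
  //   ptr0 = gep a, 0
  //   loadv = load <2 x i32> ptr0
  //   load0 = extractelement loadv, 0
  //   load1 = extractelement loadv, 1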
944 reorder(VecInst); 945 } else { 946 // Stores get sunk to the location of the last store in the chain. 947 Builder.SetInsertPoint( 948 std::max_element(C.begin(), C.end(), [](auto &A, auto &B) { 949 return A.Inst->comesBefore(B.Inst); 950 })->Inst); 951 952 // Build the vector to store. 953 Value *Vec = PoisonValue::get(VecTy); 954 unsigned VecIdx = 0; 955 auto InsertElem = [&](Value *V) { 956 if (V->getType() != VecElemTy) 957 V = Builder.CreateBitOrPointerCast(V, VecElemTy); 958 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++)); 959 }; 960 for (const ChainElem &E : C) { 961 auto I = cast<StoreInst>(E.Inst); 962 if (FixedVectorType *VT = 963 dyn_cast<FixedVectorType>(getLoadStoreType(I))) { 964 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) { 965 InsertElem(Builder.CreateExtractElement(I->getValueOperand(), 966 Builder.getInt32(J))); 967 } 968 } else { 969 InsertElem(I->getValueOperand()); 970 } 971 } 972 973 // Chain is in offset order, so C[0] is the instr with the lowest offset, 974 // i.e. the root of the vector. 975 VecInst = Builder.CreateAlignedStore( 976 Vec, 977 getLoadStorePointerOperand(C[0].Inst), 978 Alignment); 979 } 980 981 propagateMetadata(VecInst, C); 982 983 for (const ChainElem &E : C) 984 ToErase.push_back(E.Inst); 985 986 ++NumVectorInstructions; 987 NumScalarsVectorized += C.size(); 988 return true; 989} 990 991template <bool IsLoadChain> 992bool Vectorizer::isSafeToMove( 993 Instruction *ChainElem, Instruction *ChainBegin, 994 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) { 995 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> " 996 << *ChainBegin << ")\n"); 997 998 assert(isa<LoadInst>(ChainElem) == IsLoadChain); 999 if (ChainElem == ChainBegin) 1000 return true; 1001 1002 // Invariant loads can always be reordered; by definition they are not 1003 // clobbered by stores. 1004 if (isInvariantLoad(ChainElem)) 1005 return true; 1006 1007 auto BBIt = std::next([&] { 1008 if constexpr (IsLoadChain) 1009 return BasicBlock::reverse_iterator(ChainElem); 1010 else 1011 return BasicBlock::iterator(ChainElem); 1012 }()); 1013 auto BBItEnd = std::next([&] { 1014 if constexpr (IsLoadChain) 1015 return BasicBlock::reverse_iterator(ChainBegin); 1016 else 1017 return BasicBlock::iterator(ChainBegin); 1018 }()); 1019 1020 const APInt &ChainElemOffset = ChainOffsets.at(ChainElem); 1021 const unsigned ChainElemSize = 1022 DL.getTypeStoreSize(getLoadStoreType(ChainElem)); 1023 1024 for (; BBIt != BBItEnd; ++BBIt) { 1025 Instruction *I = &*BBIt; 1026 1027 if (!I->mayReadOrWriteMemory()) 1028 continue; 1029 1030 // Loads can be reordered with other loads. 1031 if (IsLoadChain && isa<LoadInst>(I)) 1032 continue; 1033 1034 // Stores can be sunk below invariant loads. 1035 if (!IsLoadChain && isInvariantLoad(I)) 1036 continue; 1037 1038 // If I is in the chain, we can tell whether it aliases ChainIt by checking 1039 // what offset ChainIt accesses. This may be better than AA is able to do. 1040 // 1041 // We should really only have duplicate offsets for stores (the duplicate 1042 // loads should be CSE'ed), but in case we have a duplicate load, we'll 1043 // split the chain so we don't have to handle this case specially. 
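    //
    // For example, a chain element that accesses 4 bytes at offset 4 and one
    // that accesses 4 bytes at offset 6 cover the byte ranges [4, 8) and
    // [6, 10); those ranges overlap, so we must treat the two as aliasing
    // even though their offsets differ.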
1044 if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) { 1045 // I and ChainElem overlap if: 1046 // - I and ChainElem have the same offset, OR 1047 // - I's offset is less than ChainElem's, but I touches past the 1048 // beginning of ChainElem, OR 1049 // - ChainElem's offset is less than I's, but ChainElem touches past the 1050 // beginning of I. 1051 const APInt &IOffset = OffsetIt->second; 1052 unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I)); 1053 if (IOffset == ChainElemOffset || 1054 (IOffset.sle(ChainElemOffset) && 1055 (IOffset + IElemSize).sgt(ChainElemOffset)) || 1056 (ChainElemOffset.sle(IOffset) && 1057 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) { 1058 LLVM_DEBUG({ 1059 // Double check that AA also sees this alias. If not, we probably 1060 // have a bug. 1061 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem)); 1062 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)); 1063 dbgs() << "LSV: Found alias in chain: " << *I << "\n"; 1064 }); 1065 return false; // We found an aliasing instruction; bail. 1066 } 1067 1068 continue; // We're confident there's no alias. 1069 } 1070 1071 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n"); 1072 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem)); 1073 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) { 1074 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n" 1075 << " Aliasing instruction:\n" 1076 << " " << *I << '\n' 1077 << " Aliased instruction and pointer:\n" 1078 << " " << *ChainElem << '\n' 1079 << " " << *getLoadStorePointerOperand(ChainElem) 1080 << '\n'); 1081 1082 return false; 1083 } 1084 } 1085 return true; 1086} 1087 1088static bool checkNoWrapFlags(Instruction *I, bool Signed) { 1089 BinaryOperator *BinOpI = cast<BinaryOperator>(I); 1090 return (Signed && BinOpI->hasNoSignedWrap()) || 1091 (!Signed && BinOpI->hasNoUnsignedWrap()); 1092} 1093 1094static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, 1095 unsigned MatchingOpIdxA, Instruction *AddOpB, 1096 unsigned MatchingOpIdxB, bool Signed) { 1097 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff 1098 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA=" 1099 << MatchingOpIdxA << ", AddOpB=" << *AddOpB 1100 << ", MatchingOpIdxB=" << MatchingOpIdxB 1101 << ", Signed=" << Signed << "\n"); 1102 // If both OpA and OpB are adds with NSW/NUW and with one of the operands 1103 // being the same, we can guarantee that the transformation is safe if we can 1104 // prove that OpA won't overflow when Ret added to the other operand of OpA. 1105 // For example: 1106 // %tmp7 = add nsw i32 %tmp2, %v0 1107 // %tmp8 = sext i32 %tmp7 to i64 1108 // ... 1109 // %tmp11 = add nsw i32 %v0, 1 1110 // %tmp12 = add nsw i32 %tmp2, %tmp11 1111 // %tmp13 = sext i32 %tmp12 to i64 1112 // 1113 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2. 1114 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds 1115 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag. 1116 assert(AddOpA->getOpcode() == Instruction::Add && 1117 AddOpB->getOpcode() == Instruction::Add && 1118 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed)); 1119 if (AddOpA->getOperand(MatchingOpIdxA) == 1120 AddOpB->getOperand(MatchingOpIdxB)) { 1121 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1); 1122 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 
        0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and `x +nsw/nuw y`, where
    // IdxDiff == -c.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}

std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
                    << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
                    << " Depth=" << Depth << "\n");
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return std::nullopt;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return std::nullopt;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return std::nullopt;

  uint64_t Stride = GTIA.getSequentialElementStride(DL);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return std::nullopt;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e.
not an instruction 1205 Value *ValA = OpA->getOperand(0); 1206 OpB = dyn_cast<Instruction>(OpB->getOperand(0)); 1207 if (!OpB || ValA->getType() != OpB->getType()) 1208 return std::nullopt; 1209 1210 const SCEV *OffsetSCEVA = SE.getSCEV(ValA); 1211 const SCEV *OffsetSCEVB = SE.getSCEV(OpB); 1212 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA); 1213 if (IdxDiffSCEV == SE.getCouldNotCompute()) 1214 return std::nullopt; 1215 1216 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV); 1217 if (!IdxDiffRange.isSingleElement()) 1218 return std::nullopt; 1219 APInt IdxDiff = *IdxDiffRange.getSingleElement(); 1220 1221 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff 1222 << "\n"); 1223 1224 // Now we need to prove that adding IdxDiff to ValA won't overflow. 1225 bool Safe = false; 1226 1227 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to 1228 // ValA, we're okay. 1229 if (OpB->getOpcode() == Instruction::Add && 1230 isa<ConstantInt>(OpB->getOperand(1)) && 1231 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) && 1232 checkNoWrapFlags(OpB, Signed)) 1233 Safe = true; 1234 1235 // Second attempt: check if we have eligible add NSW/NUW instruction 1236 // sequences. 1237 OpA = dyn_cast<Instruction>(ValA); 1238 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add && 1239 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) && 1240 checkNoWrapFlags(OpB, Signed)) { 1241 // In the checks below a matching operand in OpA and OpB is an operand which 1242 // is the same in those two instructions. Below we account for possible 1243 // orders of the operands of these add instructions. 1244 for (unsigned MatchingOpIdxA : {0, 1}) 1245 for (unsigned MatchingOpIdxB : {0, 1}) 1246 if (!Safe) 1247 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB, 1248 MatchingOpIdxB, Signed); 1249 } 1250 1251 unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); 1252 1253 // Third attempt: 1254 // 1255 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher 1256 // order bit other than the sign bit are known to be zero in ValA, we can add 1257 // Diff to it while guaranteeing no overflow of any sort. 1258 // 1259 // If IdxDiff is negative, do the same, but swap ValA and ValB. 1260 if (!Safe) { 1261 // When computing known bits, use the GEPs as context instructions, since 1262 // they likely are in the same BB as the load/store. 1263 KnownBits Known(BitWidth); 1264 computeKnownBits((IdxDiff.sge(0) ? 
        ValA : OpB), Known, DL, 0, &AC,
                     ContextInst, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
      return std::nullopt;
    Safe = true;
  }

  if (Safe)
    return IdxDiff * Stride;
  return std::nullopt;
}

std::optional<APInt> Vectorizer::getConstantOffsetSelects(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  if (Depth++ == MaxDepth)
    return std::nullopt;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      if (SelectA->getCondition() != SelectB->getCondition())
        return std::nullopt;
      LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
                        << ", PtrB=" << *PtrB << ", ContextInst="
                        << *ContextInst << ", Depth=" << Depth << "\n");
      std::optional<APInt> TrueDiff = getConstantOffset(
          SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
      if (!TrueDiff.has_value())
        return std::nullopt;
      std::optional<APInt> FalseDiff =
          getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
                            ContextInst, Depth);
      if (TrueDiff == FalseDiff)
        return TrueDiff;
    }
  }
  return std::nullopt;
}

EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                      BasicBlock::iterator End) {
  EquivalenceClassMap Ret;

  auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
    if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
      // The selects themselves are distinct instructions even if they share
      // the same condition and evaluate to consecutive pointers for the true
      // and false values of the condition. Therefore, using the selects
      // themselves for grouping instructions would put consecutive accesses
      // into different lists; they wouldn't even be checked for being
      // consecutive, and wouldn't be vectorized.
      return Sel->getCondition();
    }
    return ObjPtr;
  };

  for (Instruction &I : make_range(Begin, End)) {
    auto *LI = dyn_cast<LoadInst>(&I);
    auto *SI = dyn_cast<StoreInst>(&I);
    if (!LI && !SI)
      continue;

    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
      continue;

    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
        (SI && !TTI.isLegalToVectorizeStore(SI)))
      continue;

    Type *Ty = getLoadStoreType(&I);
    if (!VectorType::isValidElementType(Ty->getScalarType()))
      continue;

    // Skip weird non-byte sizes. They probably aren't worth the effort of
    // handling correctly.
    unsigned TySize = DL.getTypeSizeInBits(Ty);
    if ((TySize % 8) != 0)
      continue;

    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
    // functions are currently using an integer type for the vectorized
    // load/store, and do not support casting between the integer type and a
    // vector of pointers (e.g.
i64 to <2 x i16*>) 1351 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) 1352 continue; 1353 1354 Value *Ptr = getLoadStorePointerOperand(&I); 1355 unsigned AS = Ptr->getType()->getPointerAddressSpace(); 1356 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); 1357 1358 unsigned VF = VecRegSize / TySize; 1359 VectorType *VecTy = dyn_cast<VectorType>(Ty); 1360 1361 // Only handle power-of-two sized elements. 1362 if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) || 1363 (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType())))) 1364 continue; 1365 1366 // No point in looking at these if they're too big to vectorize. 1367 if (TySize > VecRegSize / 2 || 1368 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) 1369 continue; 1370 1371 Ret[{getUnderlyingObject(Ptr), AS, 1372 DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()), 1373 /*IsLoad=*/LI != nullptr}] 1374 .push_back(&I); 1375 } 1376 1377 return Ret; 1378} 1379 1380std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) { 1381 if (Instrs.empty()) 1382 return {}; 1383 1384 unsigned AS = getLoadStoreAddressSpace(Instrs[0]); 1385 unsigned ASPtrBits = DL.getIndexSizeInBits(AS); 1386 1387#ifndef NDEBUG 1388 // Check that Instrs is in BB order and all have the same addr space. 1389 for (size_t I = 1; I < Instrs.size(); ++I) { 1390 assert(Instrs[I - 1]->comesBefore(Instrs[I])); 1391 assert(getLoadStoreAddressSpace(Instrs[I]) == AS); 1392 } 1393#endif 1394 1395 // Machinery to build an MRU-hashtable of Chains. 1396 // 1397 // (Ideally this could be done with MapVector, but as currently implemented, 1398 // moving an element to the front of a MapVector is O(n).) 1399 struct InstrListElem : ilist_node<InstrListElem>, 1400 std::pair<Instruction *, Chain> { 1401 explicit InstrListElem(Instruction *I) 1402 : std::pair<Instruction *, Chain>(I, {}) {} 1403 }; 1404 struct InstrListElemDenseMapInfo { 1405 using PtrInfo = DenseMapInfo<InstrListElem *>; 1406 using IInfo = DenseMapInfo<Instruction *>; 1407 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); } 1408 static InstrListElem *getTombstoneKey() { 1409 return PtrInfo::getTombstoneKey(); 1410 } 1411 static unsigned getHashValue(const InstrListElem *E) { 1412 return IInfo::getHashValue(E->first); 1413 } 1414 static bool isEqual(const InstrListElem *A, const InstrListElem *B) { 1415 if (A == getEmptyKey() || B == getEmptyKey()) 1416 return A == getEmptyKey() && B == getEmptyKey(); 1417 if (A == getTombstoneKey() || B == getTombstoneKey()) 1418 return A == getTombstoneKey() && B == getTombstoneKey(); 1419 return IInfo::isEqual(A->first, B->first); 1420 } 1421 }; 1422 SpecificBumpPtrAllocator<InstrListElem> Allocator; 1423 simple_ilist<InstrListElem> MRU; 1424 DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains; 1425 1426 // Compare each instruction in `instrs` to leader of the N most recently-used 1427 // chains. This limits the O(n^2) behavior of this pass while also allowing 1428 // us to build arbitrarily long chains. 1429 for (Instruction *I : Instrs) { 1430 constexpr int MaxChainsToTry = 64; 1431 1432 bool MatchFound = false; 1433 auto ChainIter = MRU.begin(); 1434 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end(); 1435 ++J, ++ChainIter) { 1436 std::optional<APInt> Offset = getConstantOffset( 1437 getLoadStorePointerOperand(ChainIter->first), 1438 getLoadStorePointerOperand(I), 1439 /*ContextInst=*/ 1440 (ChainIter->first->comesBefore(I) ? 
I : ChainIter->first)); 1441 if (Offset.has_value()) { 1442 // `Offset` might not have the expected number of bits, if e.g. AS has a 1443 // different number of bits than opaque pointers. 1444 ChainIter->second.push_back(ChainElem{I, Offset.value()}); 1445 // Move ChainIter to the front of the MRU list. 1446 MRU.remove(*ChainIter); 1447 MRU.push_front(*ChainIter); 1448 MatchFound = true; 1449 break; 1450 } 1451 } 1452 1453 if (!MatchFound) { 1454 APInt ZeroOffset(ASPtrBits, 0); 1455 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I); 1456 E->second.push_back(ChainElem{I, ZeroOffset}); 1457 MRU.push_front(*E); 1458 Chains.insert(E); 1459 } 1460 } 1461 1462 std::vector<Chain> Ret; 1463 Ret.reserve(Chains.size()); 1464 // Iterate over MRU rather than Chains so the order is deterministic. 1465 for (auto &E : MRU) 1466 if (E.second.size() > 1) 1467 Ret.push_back(std::move(E.second)); 1468 return Ret; 1469} 1470 1471std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB, 1472 Instruction *ContextInst, 1473 unsigned Depth) { 1474 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA 1475 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst 1476 << ", Depth=" << Depth << "\n"); 1477 // We'll ultimately return a value of this bit width, even if computations 1478 // happen in a different width. 1479 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType()); 1480 APInt OffsetA(OrigBitWidth, 0); 1481 APInt OffsetB(OrigBitWidth, 0); 1482 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); 1483 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); 1484 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); 1485 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) 1486 return std::nullopt; 1487 1488 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets 1489 // should properly handle a possible overflow and the value should fit into 1490 // the smallest data type used in the cast/gep chain. 1491 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth && 1492 OffsetB.getSignificantBits() <= NewPtrBitWidth); 1493 1494 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); 1495 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); 1496 if (PtrA == PtrB) 1497 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth); 1498 1499 // Try to compute B - A. 1500 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA)); 1501 if (DistScev != SE.getCouldNotCompute()) { 1502 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n"); 1503 ConstantRange DistRange = SE.getSignedRange(DistScev); 1504 if (DistRange.isSingleElement()) { 1505 // Handle index width (the width of Dist) != pointer width (the width of 1506 // the Offset*s at this point). 1507 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth); 1508 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth); 1509 } 1510 } 1511 std::optional<APInt> Diff = 1512 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth); 1513 if (Diff.has_value()) 1514 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth())) 1515 .sextOrTrunc(OrigBitWidth); 1516 return std::nullopt; 1517} 1518