LoopStrengthReduce.cpp revision 263508
//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This transformation analyzes and transforms the induction variables (and
// computations derived from them) into forms suitable for efficient execution
// on the target.
//
// This pass performs a strength reduction on array references inside loops that
// have as one or more of their components the loop induction variable, it
// rewrites expressions to take advantage of scaled-index addressing modes
// available on the target, and it performs a variety of other optimizations
// related to loop induction variables.
//
// Terminology note: this code has a lot of handling for "post-increment" or
// "post-inc" users. This is not talking about post-increment addressing modes;
// it is instead talking about code like this:
//
//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
//   ...
//   %i.next = add %i, 1
//   %c = icmp eq %i.next, %n
//
// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
// it's useful to think about these as the same register, with some uses using
// the value of the register before the add and some using it after. In this
// example, the icmp is a post-increment user, since it uses %i.next, which is
// the value of the induction variable after the increment. The other common
// case of post-increment users is users outside the loop.
//
// TODO: More sophistication in the way Formulae are generated and filtered.
//
// TODO: Handle multiple loops at a time.
//
// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
//       of a GlobalValue?
//
// TODO: When truncation is free, truncate ICmp users' operands to make it a
//       smaller encoding (on x86 at least).
//
// TODO: When a negated register is used by an add (such as in a list of
//       multiple base registers, or as the increment expression in an addrec),
//       we may not actually need both reg and (-1 * reg) in registers; the
//       negation can be implemented by using a sub instead of an add. The
//       lack of support for taking this into consideration when making
//       register pressure decisions is partly worked around by the "Special"
//       use kind.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "loop-reduce"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;

/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
/// bail out. This threshold is far beyond the number of users that LSR can
/// conceivably solve, so it should not affect generated code, but catches the
/// worst cases before LSR burns too much compile time and stack space.
static const unsigned MaxIVUsers = 200;

// Temporary flag to cleanup congruent phis after LSR phi expansion.
// It's currently disabled until we can determine whether it's truly useful or
// not. The flag should be removed after the v3.0 release.
// This is now needed for ivchains.
static cl::opt<bool> EnablePhiElim(
  "enable-lsr-phielim", cl::Hidden, cl::init(true),
  cl::desc("Enable LSR phi elimination"));

#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
  "stress-ivchain", cl::Hidden, cl::init(false),
  cl::desc("Stress test LSR IV chains"));
#else
static bool StressIVChain = false;
#endif

namespace {

/// RegSortData - This class holds data which is used to order reuse candidates.
class RegSortData {
public:
  /// UsedByIndices - This represents the set of LSRUse indices which reference
  /// a particular register.
  SmallBitVector UsedByIndices;

  RegSortData() {}

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

void RegSortData::print(raw_ostream &OS) const {
  OS << "[NumUses=" << UsedByIndices.count() << ']';
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void RegSortData::dump() const {
  print(errs()); errs() << '\n';
}
#endif
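
// For illustration (hypothetical values): if the register {0,+,4}<%L> appears
// in formulae for the uses with LSRUse indices 0 and 2, its
// RegSortData::UsedByIndices bit vector has bits 0 and 2 set, and print()
// would emit "[NumUses=2]". The indices refer to positions in LSRInstance's
// Uses vector below.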

namespace {

/// RegUseTracker - Map register candidates to information about how they are
/// used.
class RegUseTracker {
  typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;

  RegUsesTy RegUsesMap;
  SmallVector<const SCEV *, 16> RegSequence;

public:
  void CountRegister(const SCEV *Reg, size_t LUIdx);
  void DropRegister(const SCEV *Reg, size_t LUIdx);
  void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);

  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;

  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;

  void clear();

  typedef SmallVectorImpl<const SCEV *>::iterator iterator;
  typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
  iterator begin() { return RegSequence.begin(); }
  iterator end() { return RegSequence.end(); }
  const_iterator begin() const { return RegSequence.begin(); }
  const_iterator end() const { return RegSequence.end(); }
};

}

void
RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
  std::pair<RegUsesTy::iterator, bool> Pair =
    RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
  RegSortData &RSD = Pair.first->second;
  if (Pair.second)
    RegSequence.push_back(Reg);
  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
  RSD.UsedByIndices.set(LUIdx);
}

void
RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
  RegUsesTy::iterator It = RegUsesMap.find(Reg);
  assert(It != RegUsesMap.end());
  RegSortData &RSD = It->second;
  assert(RSD.UsedByIndices.size() > LUIdx);
  RSD.UsedByIndices.reset(LUIdx);
}

void
RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
  assert(LUIdx <= LastLUIdx);

  // Update RegUses. The data structure is not optimized for this purpose;
  // we must iterate through it and update each of the bit vectors.
  for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
       I != E; ++I) {
    SmallBitVector &UsedByIndices = I->second.UsedByIndices;
    if (LUIdx < UsedByIndices.size())
      UsedByIndices[LUIdx] =
        LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
    UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
  }
}

bool
RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
  if (I == RegUsesMap.end())
    return false;
  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
  int i = UsedByIndices.find_first();
  if (i == -1) return false;
  if ((size_t)i != LUIdx) return true;
  return UsedByIndices.find_next(i) != -1;
}

const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
  assert(I != RegUsesMap.end() && "Unknown register!");
  return I->second.UsedByIndices;
}

void RegUseTracker::clear() {
  RegUsesMap.clear();
  RegSequence.clear();
}
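
// A sketch of SwapAndDropUse in action, with hypothetical indices: suppose a
// register is used by LSRUses {0, 1, 3} and use 1 is being deleted by swapping
// it with the last use, index 3. SwapAndDropUse(1, 3) copies bit 3 into bit 1
// and truncates the vector to 3 bits, leaving {0, 1}: the old use 3 now
// answers to index 1, matching the swap performed on the uses themselves.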

namespace {

/// Formula - This class holds information that describes a formula for
/// computing a value satisfying a use. It may include broken-out immediates
/// and scaled registers.
struct Formula {
  /// Global base address used for complex addressing.
  GlobalValue *BaseGV;

  /// Base offset for complex addressing.
  int64_t BaseOffset;

  /// Whether any complex addressing has a base register.
  bool HasBaseReg;

  /// The scale of any complex addressing.
  int64_t Scale;

  /// BaseRegs - The list of "base" registers for this use. When this is
  /// non-empty, HasBaseReg should be set to true.
  SmallVector<const SCEV *, 4> BaseRegs;

  /// ScaledReg - The 'scaled' register for this use. This should be non-null
  /// when Scale is not zero.
  const SCEV *ScaledReg;

  /// UnfoldedOffset - An additional constant offset which is added near the
  /// use. This requires a temporary register, but the offset itself can
  /// live in an add immediate field rather than a register.
  int64_t UnfoldedOffset;

  Formula()
    : BaseGV(0), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(0),
      UnfoldedOffset(0) {}

  void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);

  unsigned getNumRegs() const;
  Type *getType() const;

  void DeleteBaseReg(const SCEV *&S);

  bool referencesReg(const SCEV *S) const;
  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
                                  const RegUseTracker &RegUses) const;

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

/// DoInitialMatch - Recursion helper for InitialMatch.
static void DoInitialMatch(const SCEV *S, Loop *L,
                           SmallVectorImpl<const SCEV *> &Good,
                           SmallVectorImpl<const SCEV *> &Bad,
                           ScalarEvolution &SE) {
  // Collect expressions which properly dominate the loop header.
  if (SE.properlyDominates(S, L->getHeader())) {
    Good.push_back(S);
    return;
  }

  // Look at add operands.
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
         I != E; ++I)
      DoInitialMatch(*I, L, Good, Bad, SE);
    return;
  }

  // Look at addrec operands.
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
    if (!AR->getStart()->isZero()) {
      DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
      DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
                                      AR->getStepRecurrence(SE),
                                      // FIXME: AR->getNoWrapFlags()
                                      AR->getLoop(), SCEV::FlagAnyWrap),
                     L, Good, Bad, SE);
      return;
    }

  // Handle a multiplication by -1 (negation) if it didn't fold.
  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
    if (Mul->getOperand(0)->isAllOnesValue()) {
      SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
      const SCEV *NewMul = SE.getMulExpr(Ops);

      SmallVector<const SCEV *, 4> MyGood;
      SmallVector<const SCEV *, 4> MyBad;
      DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
      const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
        SE.getEffectiveSCEVType(NewMul->getType())));
      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
           E = MyGood.end(); I != E; ++I)
        Good.push_back(SE.getMulExpr(NegOne, *I));
      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
           E = MyBad.end(); I != E; ++I)
        Bad.push_back(SE.getMulExpr(NegOne, *I));
      return;
    }

  // Ok, we can't do anything interesting. Just stuff the whole thing into a
  // register and hope for the best.
  Bad.push_back(S);
}
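
// For example (hypothetical expressions): given S = (%a + {0,+,4}<%L>) where
// %a is defined outside the loop, DoInitialMatch sends %a to Good (it
// properly dominates the header) and {0,+,4}<%L> to Bad, so InitialMatch
// below ends up with two base registers: one loop-invariant, one
// loop-variant.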

/// InitialMatch - Incorporate loop-variant parts of S into this Formula,
/// attempting to keep all loop-invariant and loop-computable values in a
/// single base register.
void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
  SmallVector<const SCEV *, 4> Good;
  SmallVector<const SCEV *, 4> Bad;
  DoInitialMatch(S, L, Good, Bad, SE);
  if (!Good.empty()) {
    const SCEV *Sum = SE.getAddExpr(Good);
    if (!Sum->isZero())
      BaseRegs.push_back(Sum);
    HasBaseReg = true;
  }
  if (!Bad.empty()) {
    const SCEV *Sum = SE.getAddExpr(Bad);
    if (!Sum->isZero())
      BaseRegs.push_back(Sum);
    HasBaseReg = true;
  }
}

/// getNumRegs - Return the total number of register operands used by this
/// formula. This does not include register uses implied by non-constant
/// addrec strides.
unsigned Formula::getNumRegs() const {
  return !!ScaledReg + BaseRegs.size();
}

/// getType - Return the type of this formula, if it has one, or null
/// otherwise. This type is meaningless except for the bit size.
Type *Formula::getType() const {
  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
         ScaledReg ? ScaledReg->getType() :
         BaseGV ? BaseGV->getType() :
         0;
}

/// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
void Formula::DeleteBaseReg(const SCEV *&S) {
  if (&S != &BaseRegs.back())
    std::swap(S, BaseRegs.back());
  BaseRegs.pop_back();
}

/// referencesReg - Test if this formula references the given register.
bool Formula::referencesReg(const SCEV *S) const {
  return S == ScaledReg ||
         std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
}

/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
/// which are used by uses other than the use with the given index.
bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
                                         const RegUseTracker &RegUses) const {
  if (ScaledReg)
    if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
      return true;
  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
       E = BaseRegs.end(); I != E; ++I)
    if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
      return true;
  return false;
}

void Formula::print(raw_ostream &OS) const {
  bool First = true;
  if (BaseGV) {
    if (!First) OS << " + "; else First = false;
    WriteAsOperand(OS, BaseGV, /*PrintType=*/false);
  }
  if (BaseOffset != 0) {
    if (!First) OS << " + "; else First = false;
    OS << BaseOffset;
  }
  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
       E = BaseRegs.end(); I != E; ++I) {
    if (!First) OS << " + "; else First = false;
    OS << "reg(" << **I << ')';
  }
  if (HasBaseReg && BaseRegs.empty()) {
    if (!First) OS << " + "; else First = false;
    OS << "**error: HasBaseReg**";
  } else if (!HasBaseReg && !BaseRegs.empty()) {
    if (!First) OS << " + "; else First = false;
    OS << "**error: !HasBaseReg**";
  }
  if (Scale != 0) {
    if (!First) OS << " + "; else First = false;
    OS << Scale << "*reg(";
    if (ScaledReg)
      OS << *ScaledReg;
    else
      OS << "<unknown>";
    OS << ')';
  }
  if (UnfoldedOffset != 0) {
    if (!First) OS << " + "; else First = false;
    OS << "imm(" << UnfoldedOffset << ')';
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Formula::dump() const {
  print(errs()); errs() << '\n';
}
#endif
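
// As a reading aid, a formula printed by Formula::print might look like
//   @gv + 16 + reg(%base) + 4*reg({0,+,1}<%L>) + imm(8)
// (hypothetical operands): BaseGV=@gv, BaseOffset=16, one base register, a
// scaled register with Scale=4, and UnfoldedOffset=8; getNumRegs() is 2.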

/// isAddRecSExtable - Return true if the given addrec can be sign-extended
/// without changing its value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
  Type *WideTy =
    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
}

/// isAddSExtable - Return true if the given add can be sign-extended
/// without changing its value.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
  Type *WideTy =
    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
}

/// isMulSExtable - Return true if the given mul can be sign-extended
/// without changing its value.
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
  Type *WideTy =
    IntegerType::get(SE.getContext(),
                     SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
}
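
// The widened types above are chosen so overflow becomes visible: one extra
// bit for an add, the sum of the operand widths for a mul. For illustration,
// sign-extending {0,+,1}<nsw><%L> from i32 to i33 still folds to a
// SCEVAddRecExpr, so isAddRecSExtable returns true; if ScalarEvolution had to
// keep the sext as a wrapper, the expression might wrap and is rejected.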

/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
/// and if the remainder is known to be zero, or null otherwise. If
/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
/// to Y, ignoring that the multiplication may overflow, which is useful when
/// the result will be used in a context where the most significant bits are
/// ignored.
static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
                                ScalarEvolution &SE,
                                bool IgnoreSignificantBits = false) {
  // Handle the trivial case, which works for any SCEV type.
  if (LHS == RHS)
    return SE.getConstant(LHS->getType(), 1);

  // Handle a few RHS special cases.
  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
  if (RC) {
    const APInt &RA = RC->getValue()->getValue();
    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
    // some folding.
    if (RA.isAllOnesValue())
      return SE.getMulExpr(LHS, RC);
    // Handle x /s 1 as x.
    if (RA == 1)
      return LHS;
  }

  // Check for a division of a constant by a constant.
  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
    if (!RC)
      return 0;
    const APInt &LA = C->getValue()->getValue();
    const APInt &RA = RC->getValue()->getValue();
    if (LA.srem(RA) != 0)
      return 0;
    return SE.getConstant(LA.sdiv(RA));
  }

  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
    if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
                                      IgnoreSignificantBits);
      if (!Step) return 0;
      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
                                       IgnoreSignificantBits);
      if (!Start) return 0;
      // FlagNW is independent of the start value, step direction, and is
      // preserved with smaller magnitude steps.
      // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
      return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
    }
    return 0;
  }

  // Distribute the sdiv over add operands, if the add doesn't overflow.
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
    if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
      SmallVector<const SCEV *, 8> Ops;
      for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
           I != E; ++I) {
        const SCEV *Op = getExactSDiv(*I, RHS, SE,
                                      IgnoreSignificantBits);
        if (!Op) return 0;
        Ops.push_back(Op);
      }
      return SE.getAddExpr(Ops);
    }
    return 0;
  }

  // Check for a multiply operand that we can pull RHS out of.
  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
    if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
      SmallVector<const SCEV *, 4> Ops;
      bool Found = false;
      for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
           I != E; ++I) {
        const SCEV *S = *I;
        if (!Found)
          if (const SCEV *Q = getExactSDiv(S, RHS, SE,
                                           IgnoreSignificantBits)) {
            S = Q;
            Found = true;
          }
        Ops.push_back(S);
      }
      return Found ? SE.getMulExpr(Ops) : 0;
    }
    return 0;
  }

  // Otherwise we don't know.
  return 0;
}

/// ExtractImmediate - If S involves the addition of a constant integer value,
/// return that integer value, and mutate S to point to a new SCEV with that
/// value excluded.
static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
    if (C->getValue()->getValue().getMinSignedBits() <= 64) {
      S = SE.getConstant(C->getType(), 0);
      return C->getValue()->getSExtValue();
    }
  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
    int64_t Result = ExtractImmediate(NewOps.front(), SE);
    if (Result != 0)
      S = SE.getAddExpr(NewOps);
    return Result;
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
    int64_t Result = ExtractImmediate(NewOps.front(), SE);
    if (Result != 0)
      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                           SCEV::FlagAnyWrap);
    return Result;
  }
  return 0;
}

/// ExtractSymbol - If S involves the addition of a GlobalValue address,
/// return that symbol, and mutate S to point to a new SCEV with that
/// value excluded.
static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
    if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
      S = SE.getConstant(GV->getType(), 0);
      return GV;
    }
  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
    GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
    if (Result)
      S = SE.getAddExpr(NewOps);
    return Result;
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
    GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
    if (Result)
      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                           SCEV::FlagAnyWrap);
    return Result;
  }
  return 0;
}
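
// Both extractors mutate S in place. A sketch with hypothetical values:
//   const SCEV *S = ...;                    // (42 + @gv + {0,+,8}<%L>)
//   int64_t Imm = ExtractImmediate(S, SE);  // Imm == 42
//   GlobalValue *GV = ExtractSymbol(S, SE); // GV == @gv
//   // S now refers to {0,+,8}<%L>, just the loop-variant part.
// Constants sort first and unknowns last in SCEV's canonical add order,
// which is why the code peels NewOps.front() and NewOps.back() respectively.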

/// isAddressUse - Returns true if the specified instruction is using the
/// specified value as an address.
static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
  bool isAddress = isa<LoadInst>(Inst);
  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
    if (SI->getOperand(1) == OperandVal)
      isAddress = true;
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // Addressing modes can also be folded into prefetches and a variety
    // of intrinsics.
    switch (II->getIntrinsicID()) {
      default: break;
      case Intrinsic::prefetch:
      case Intrinsic::x86_sse_storeu_ps:
      case Intrinsic::x86_sse2_storeu_pd:
      case Intrinsic::x86_sse2_storeu_dq:
      case Intrinsic::x86_sse2_storel_dq:
        if (II->getArgOperand(0) == OperandVal)
          isAddress = true;
        break;
    }
  }
  return isAddress;
}

/// getAccessType - Return the type of the memory being accessed.
static Type *getAccessType(const Instruction *Inst) {
  Type *AccessTy = Inst->getType();
  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
    AccessTy = SI->getOperand(0)->getType();
  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // Addressing modes can also be folded into prefetches and a variety
    // of intrinsics.
    switch (II->getIntrinsicID()) {
    default: break;
    case Intrinsic::x86_sse_storeu_ps:
    case Intrinsic::x86_sse2_storeu_pd:
    case Intrinsic::x86_sse2_storeu_dq:
    case Intrinsic::x86_sse2_storel_dq:
      AccessTy = II->getArgOperand(0)->getType();
      break;
    }
  }

  // All pointers have the same requirements, so canonicalize them to an
  // arbitrary pointer type to minimize variation.
  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
    AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
                                PTy->getAddressSpace());

  return AccessTy;
}

/// isExistingPhi - Return true if this AddRec is already a phi in its loop.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
  for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
    if (SE.isSCEVable(PN->getType()) &&
        (SE.getEffectiveSCEVType(PN->getType()) ==
         SE.getEffectiveSCEVType(AR->getType())) &&
        SE.getSCEV(PN) == AR)
      return true;
  }
  return false;
}
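
// A note on getAccessType's canonicalization above, with hypothetical
// accesses: a store of i32 through an i32* and a load of float through a
// float* (both in address space 0) each report the access type i1
// addrspace(0)*, so formulae for them can share state; only the address
// space is preserved, and the i1 element type is an arbitrary placeholder.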

/// Check if expanding this expression is likely to incur significant cost. This
/// is tricky because SCEV doesn't track which expressions are actually computed
/// by the current IR.
///
/// We currently allow expansion of IV increments that involve adds,
/// multiplication by constants, and AddRecs from existing phis.
///
/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
/// obvious multiple of the UDivExpr.
static bool isHighCostExpansion(const SCEV *S,
                                SmallPtrSet<const SCEV*, 8> &Processed,
                                ScalarEvolution &SE) {
  // Zero/One operand expressions
  switch (S->getSCEVType()) {
  case scUnknown:
  case scConstant:
    return false;
  case scTruncate:
    return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
                               Processed, SE);
  case scZeroExtend:
    return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
                               Processed, SE);
  case scSignExtend:
    return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
                               Processed, SE);
  }

  if (!Processed.insert(S))
    return false;

  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
         I != E; ++I) {
      if (isHighCostExpansion(*I, Processed, SE))
        return true;
    }
    return false;
  }

  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
    if (Mul->getNumOperands() == 2) {
      // Multiplication by a constant is ok
      if (isa<SCEVConstant>(Mul->getOperand(0)))
        return isHighCostExpansion(Mul->getOperand(1), Processed, SE);

      // If we have the value of one operand, check if an existing
      // multiplication already generates this expression.
      if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
        Value *UVal = U->getValue();
        for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end();
             UI != UE; ++UI) {
          // If U is a constant, it may be used by a ConstantExpr.
          Instruction *User = dyn_cast<Instruction>(*UI);
          if (User && User->getOpcode() == Instruction::Mul
              && SE.isSCEVable(User->getType())) {
            return SE.getSCEV(User) == Mul;
          }
        }
      }
    }
  }

  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    if (isExistingPhi(AR, SE))
      return false;
  }

  // For now, consider any other type of expression (div/mul/min/max) high cost.
  return true;
}

/// DeleteTriviallyDeadInstructions - If any of the instructions in the
/// specified set are trivially dead, delete them and see if this makes any of
/// their operands subsequently dead.
static bool
DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
  bool Changed = false;

  while (!DeadInsts.empty()) {
    Value *V = DeadInsts.pop_back_val();
    Instruction *I = dyn_cast_or_null<Instruction>(V);

    if (I == 0 || !isInstructionTriviallyDead(I))
      continue;

    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
        *OI = 0;
        if (U->use_empty())
          DeadInsts.push_back(U);
      }

    I->eraseFromParent();
    Changed = true;
  }

  return Changed;
}
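
// A sketch of the cascade above, with hypothetical IR: if DeadInsts holds
// %add in
//   %mul = mul i64 %i, 8
//   %add = add i64 %mul, 16   ; no other users
// then %add is erased, which drops the only use of %mul, so %mul is pushed
// onto the worklist and erased on a later iteration.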

namespace {
class LSRUse;
}
// Check if it is legal to fold 2 base registers.
static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
                             const Formula &F);
// Get the cost of the scaling factor used in F for LU.
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
                                     const LSRUse &LU, const Formula &F);

namespace {

/// Cost - This class is used to measure and compare candidate formulae.
class Cost {
  /// TODO: Some of these could be merged. Also, a lexical ordering
  /// isn't always optimal.
  unsigned NumRegs;
  unsigned AddRecCost;
  unsigned NumIVMuls;
  unsigned NumBaseAdds;
  unsigned ImmCost;
  unsigned SetupCost;
  unsigned ScaleCost;

public:
  Cost()
    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
      SetupCost(0), ScaleCost(0) {}

  bool operator<(const Cost &Other) const;

  void Loose();

#ifndef NDEBUG
  // Once any of the metrics loses, they must all remain losers.
  bool isValid() {
    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
             | ImmCost | SetupCost | ScaleCost) != ~0u)
        || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
             & ImmCost & SetupCost & ScaleCost) == ~0u);
  }
#endif

  bool isLoser() {
    assert(isValid() && "invalid cost");
    return NumRegs == ~0u;
  }

  void RateFormula(const TargetTransformInfo &TTI,
                   const Formula &F,
                   SmallPtrSet<const SCEV *, 16> &Regs,
                   const DenseSet<const SCEV *> &VisitedRegs,
                   const Loop *L,
                   const SmallVectorImpl<int64_t> &Offsets,
                   ScalarEvolution &SE, DominatorTree &DT,
                   const LSRUse &LU,
                   SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);

  void print(raw_ostream &OS) const;
  void dump() const;

private:
  void RateRegister(const SCEV *Reg,
                    SmallPtrSet<const SCEV *, 16> &Regs,
                    const Loop *L,
                    ScalarEvolution &SE, DominatorTree &DT);
  void RatePrimaryRegister(const SCEV *Reg,
                           SmallPtrSet<const SCEV *, 16> &Regs,
                           const Loop *L,
                           ScalarEvolution &SE, DominatorTree &DT,
                           SmallPtrSet<const SCEV *, 16> *LoserRegs);
};

}

/// RateRegister - Tally up interesting quantities from the given register.
void Cost::RateRegister(const SCEV *Reg,
                        SmallPtrSet<const SCEV *, 16> &Regs,
                        const Loop *L,
                        ScalarEvolution &SE, DominatorTree &DT) {
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
    // If this is an addrec for another loop, don't second-guess its addrec phi
    // nodes. LSR isn't currently smart enough to reason about more than one
    // loop at a time. LSR has already run on inner loops, will not run on outer
    // loops, and cannot be expected to change sibling loops.
    if (AR->getLoop() != L) {
      // If the AddRec exists, consider it register-free and leave it alone.
      if (isExistingPhi(AR, SE))
        return;

      // Otherwise, do not consider this formula at all.
      Loose();
      return;
    }
    AddRecCost += 1; /// TODO: This should be a function of the stride.

    // Add the step value register, if it needs one.
    // TODO: The non-affine case isn't precisely modeled here.
    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
      if (!Regs.count(AR->getOperand(1))) {
        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
        if (isLoser())
          return;
      }
    }
  }
  ++NumRegs;

  // Rough heuristic; favor registers which don't require extra setup
  // instructions in the preheader.
  if (!isa<SCEVUnknown>(Reg) &&
      !isa<SCEVConstant>(Reg) &&
      !(isa<SCEVAddRecExpr>(Reg) &&
        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
    ++SetupCost;

  NumIVMuls += isa<SCEVMulExpr>(Reg) &&
               SE.hasComputableLoopEvolution(Reg, L);
}
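
// For illustration (hypothetical register): rating {%start,+,%step}<%L>,
// where %step is not a constant, bumps AddRecCost by 1 and recursively rates
// %step as a register; because the start is a plain SCEVUnknown, no SetupCost
// is charged, whereas a register like (4 * %n) would charge SetupCost since
// it needs a preheader instruction to materialize.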

/// RatePrimaryRegister - Record this register in the set. If we haven't seen it
/// before, rate it. Optional LoserRegs provides a way to declare any formula
/// that refers to one of those regs an instant loser.
void Cost::RatePrimaryRegister(const SCEV *Reg,
                               SmallPtrSet<const SCEV *, 16> &Regs,
                               const Loop *L,
                               ScalarEvolution &SE, DominatorTree &DT,
                               SmallPtrSet<const SCEV *, 16> *LoserRegs) {
  if (LoserRegs && LoserRegs->count(Reg)) {
    Loose();
    return;
  }
  if (Regs.insert(Reg)) {
    RateRegister(Reg, Regs, L, SE, DT);
    if (LoserRegs && isLoser())
      LoserRegs->insert(Reg);
  }
}

void Cost::RateFormula(const TargetTransformInfo &TTI,
                       const Formula &F,
                       SmallPtrSet<const SCEV *, 16> &Regs,
                       const DenseSet<const SCEV *> &VisitedRegs,
                       const Loop *L,
                       const SmallVectorImpl<int64_t> &Offsets,
                       ScalarEvolution &SE, DominatorTree &DT,
                       const LSRUse &LU,
                       SmallPtrSet<const SCEV *, 16> *LoserRegs) {
  // Tally up the registers.
  if (const SCEV *ScaledReg = F.ScaledReg) {
    if (VisitedRegs.count(ScaledReg)) {
      Loose();
      return;
    }
    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
    if (isLoser())
      return;
  }
  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
       E = F.BaseRegs.end(); I != E; ++I) {
    const SCEV *BaseReg = *I;
    if (VisitedRegs.count(BaseReg)) {
      Loose();
      return;
    }
    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
    if (isLoser())
      return;
  }

  // Determine how many (unfolded) adds we'll need inside the loop.
  size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
  if (NumBaseParts > 1)
    // Do not count the base and a possible second register if the target
    // allows 2 registers to be folded.
    NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F));

  // Accumulate non-free scaling amounts.
  ScaleCost += getScalingFactorCost(TTI, LU, F);

  // Tally up the non-zero immediates.
  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
       E = Offsets.end(); I != E; ++I) {
    int64_t Offset = (uint64_t)*I + F.BaseOffset;
    if (F.BaseGV)
      ImmCost += 64; // Handle symbolic values conservatively.
                     // TODO: This should probably be the pointer size.
    else if (Offset != 0)
      ImmCost += APInt(64, Offset, true).getMinSignedBits();
  }
  assert(isValid() && "invalid cost");
}

/// Loose - Set this cost to a losing value.
void Cost::Loose() {
  NumRegs = ~0u;
  AddRecCost = ~0u;
  NumIVMuls = ~0u;
  NumBaseAdds = ~0u;
  ImmCost = ~0u;
  SetupCost = ~0u;
  ScaleCost = ~0u;
}

/// operator< - Choose the lower cost.
bool Cost::operator<(const Cost &Other) const {
  if (NumRegs != Other.NumRegs)
    return NumRegs < Other.NumRegs;
  if (AddRecCost != Other.AddRecCost)
    return AddRecCost < Other.AddRecCost;
  if (NumIVMuls != Other.NumIVMuls)
    return NumIVMuls < Other.NumIVMuls;
  if (NumBaseAdds != Other.NumBaseAdds)
    return NumBaseAdds < Other.NumBaseAdds;
  if (ScaleCost != Other.ScaleCost)
    return ScaleCost < Other.ScaleCost;
  if (ImmCost != Other.ImmCost)
    return ImmCost < Other.ImmCost;
  if (SetupCost != Other.SetupCost)
    return SetupCost < Other.SetupCost;
  return false;
}
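
// The comparison above is strictly lexicographic. For example (hypothetical
// numbers), a candidate costing {NumRegs=3, ImmCost=40} beats one costing
// {NumRegs=4, ImmCost=0}: register count dominates every later metric, which
// is what the class comment's "a lexical ordering isn't always optimal"
// caveat refers to.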
"" : "s"); 1009 if (ScaleCost != 0) 1010 OS << ", plus " << ScaleCost << " scale cost"; 1011 if (ImmCost != 0) 1012 OS << ", plus " << ImmCost << " imm cost"; 1013 if (SetupCost != 0) 1014 OS << ", plus " << SetupCost << " setup cost"; 1015} 1016 1017#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1018void Cost::dump() const { 1019 print(errs()); errs() << '\n'; 1020} 1021#endif 1022 1023namespace { 1024 1025/// LSRFixup - An operand value in an instruction which is to be replaced 1026/// with some equivalent, possibly strength-reduced, replacement. 1027struct LSRFixup { 1028 /// UserInst - The instruction which will be updated. 1029 Instruction *UserInst; 1030 1031 /// OperandValToReplace - The operand of the instruction which will 1032 /// be replaced. The operand may be used more than once; every instance 1033 /// will be replaced. 1034 Value *OperandValToReplace; 1035 1036 /// PostIncLoops - If this user is to use the post-incremented value of an 1037 /// induction variable, this variable is non-null and holds the loop 1038 /// associated with the induction variable. 1039 PostIncLoopSet PostIncLoops; 1040 1041 /// LUIdx - The index of the LSRUse describing the expression which 1042 /// this fixup needs, minus an offset (below). 1043 size_t LUIdx; 1044 1045 /// Offset - A constant offset to be added to the LSRUse expression. 1046 /// This allows multiple fixups to share the same LSRUse with different 1047 /// offsets, for example in an unrolled loop. 1048 int64_t Offset; 1049 1050 bool isUseFullyOutsideLoop(const Loop *L) const; 1051 1052 LSRFixup(); 1053 1054 void print(raw_ostream &OS) const; 1055 void dump() const; 1056}; 1057 1058} 1059 1060LSRFixup::LSRFixup() 1061 : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {} 1062 1063/// isUseFullyOutsideLoop - Test whether this fixup always uses its 1064/// value outside of the given loop. 1065bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { 1066 // PHI nodes use their value in their incoming blocks. 1067 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { 1068 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) 1069 if (PN->getIncomingValue(i) == OperandValToReplace && 1070 L->contains(PN->getIncomingBlock(i))) 1071 return false; 1072 return true; 1073 } 1074 1075 return !L->contains(UserInst); 1076} 1077 1078void LSRFixup::print(raw_ostream &OS) const { 1079 OS << "UserInst="; 1080 // Store is common and interesting enough to be worth special-casing. 

void LSRFixup::print(raw_ostream &OS) const {
  OS << "UserInst=";
  // Store is common and interesting enough to be worth special-casing.
  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
    OS << "store ";
    WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
  } else if (UserInst->getType()->isVoidTy())
    OS << UserInst->getOpcodeName();
  else
    WriteAsOperand(OS, UserInst, /*PrintType=*/false);

  OS << ", OperandValToReplace=";
  WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);

  for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
       E = PostIncLoops.end(); I != E; ++I) {
    OS << ", PostIncLoop=";
    WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);
  }

  if (LUIdx != ~size_t(0))
    OS << ", LUIdx=" << LUIdx;

  if (Offset != 0)
    OS << ", Offset=" << Offset;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRFixup::dump() const {
  print(errs()); errs() << '\n';
}
#endif

namespace {

/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
struct UniquifierDenseMapInfo {
  static SmallVector<const SCEV *, 4> getEmptyKey() {
    SmallVector<const SCEV *, 4> V;
    V.push_back(reinterpret_cast<const SCEV *>(-1));
    return V;
  }

  static SmallVector<const SCEV *, 4> getTombstoneKey() {
    SmallVector<const SCEV *, 4> V;
    V.push_back(reinterpret_cast<const SCEV *>(-2));
    return V;
  }

  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
    unsigned Result = 0;
    for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(),
         E = V.end(); I != E; ++I)
      Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I);
    return Result;
  }

  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
                      const SmallVector<const SCEV *, 4> &RHS) {
    return LHS == RHS;
  }
};
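
// Because getHashValue XORs the per-element hashes, it is insensitive to
// element order: for any two SCEV pointers A and B, the keys {A, B} and
// {B, A} hash identically. That is harmless here since callers sort each key
// before lookup, as seen in LSRUse::InsertFormula below.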

/// LSRUse - This class holds the state that LSR keeps for each use in
/// IVUsers, as well as uses invented by LSR itself. It includes information
/// about what kinds of things can be folded into the user, information about
/// the user itself, and information about how the use may be satisfied.
/// TODO: Represent multiple users of the same expression in common?
class LSRUse {
  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;

public:
  /// KindType - An enum for a kind of use, indicating what types of
  /// scaled and immediate operands it might support.
  enum KindType {
    Basic,   ///< A normal use, with no folding.
    Special, ///< A special case of basic, allowing -1 scales.
    Address, ///< An address use; folding according to TargetLowering
    ICmpZero ///< An equality icmp with both operands folded into one.
    // TODO: Add a generic icmp too?
  };

  KindType Kind;
  Type *AccessTy;

  SmallVector<int64_t, 8> Offsets;
  int64_t MinOffset;
  int64_t MaxOffset;

  /// AllFixupsOutsideLoop - This records whether all of the fixups using this
  /// LSRUse are outside of the loop, in which case some special-case heuristics
  /// may be used.
  bool AllFixupsOutsideLoop;

  /// RigidFormula is set to true to guarantee that this use will be associated
  /// with a single formula--the one that initially matched. Some SCEV
  /// expressions cannot be expanded. This allows LSR to consider the registers
  /// used by those expressions without the need to expand them later after
  /// changing the formula.
  bool RigidFormula;

  /// WidestFixupType - This records the widest use type for any fixup using
  /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
  /// max fixup widths to be equivalent, because the narrower one may be relying
  /// on the implicit truncation to truncate away bogus bits.
  Type *WidestFixupType;

  /// Formulae - A list of ways to build a value that can satisfy this user.
  /// After the list is populated, one of these is selected heuristically and
  /// used to formulate a replacement for OperandValToReplace in UserInst.
  SmallVector<Formula, 12> Formulae;

  /// Regs - The set of register candidates used by all formulae in this LSRUse.
  SmallPtrSet<const SCEV *, 4> Regs;

  LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
                                MinOffset(INT64_MAX),
                                MaxOffset(INT64_MIN),
                                AllFixupsOutsideLoop(true),
                                RigidFormula(false),
                                WidestFixupType(0) {}

  bool HasFormulaWithSameRegs(const Formula &F) const;
  bool InsertFormula(const Formula &F);
  void DeleteFormula(Formula &F);
  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

/// HasFormulaWithSameRegs - Test whether this use has a formula which has the
/// same registers as the given formula.
bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
  // Unstable sort by host order ok, because this is only used for uniquifying.
  std::sort(Key.begin(), Key.end());
  return Uniquifier.count(Key);
}

/// InsertFormula - If the given formula has not yet been inserted, add it to
/// the list, and return true. Return false otherwise.
bool LSRUse::InsertFormula(const Formula &F) {
  if (!Formulae.empty() && RigidFormula)
    return false;

  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
  // Unstable sort by host order ok, because this is only used for uniquifying.
  std::sort(Key.begin(), Key.end());

  if (!Uniquifier.insert(Key).second)
    return false;

  // Using a register to hold the value of 0 is not profitable.
  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
         "Zero allocated in a scaled register!");
#ifndef NDEBUG
  for (SmallVectorImpl<const SCEV *>::const_iterator I =
       F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
    assert(!(*I)->isZero() && "Zero allocated in a base register!");
#endif

  // Add the formula to the list.
  Formulae.push_back(F);

  // Record registers now being used by this use.
  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());

  return true;
}

/// DeleteFormula - Remove the given formula from this use's list.
void LSRUse::DeleteFormula(Formula &F) {
  if (&F != &Formulae.back())
    std::swap(F, Formulae.back());
  Formulae.pop_back();
}
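
// DeleteFormula is the usual swap-with-back-and-pop idiom: O(1), but it
// reorders Formulae. E.g. deleting entry 1 of {F0, F1, F2} leaves {F0, F2}.
// Callers that filter the list this way are expected to call RecomputeRegs
// afterward, defined next, to drop registers no surviving formula uses.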

/// RecomputeRegs - Recompute the Regs field, and update RegUses.
void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
  // Now that we've filtered out some formulae, recompute the Regs set.
  SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
  Regs.clear();
  for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
       E = Formulae.end(); I != E; ++I) {
    const Formula &F = *I;
    if (F.ScaledReg) Regs.insert(F.ScaledReg);
    Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
  }

  // Update the RegTracker.
  for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
       E = OldRegs.end(); I != E; ++I)
    if (!Regs.count(*I))
      RegUses.DropRegister(*I, LUIdx);
}

void LSRUse::print(raw_ostream &OS) const {
  OS << "LSR Use: Kind=";
  switch (Kind) {
  case Basic:    OS << "Basic"; break;
  case Special:  OS << "Special"; break;
  case ICmpZero: OS << "ICmpZero"; break;
  case Address:
    OS << "Address of ";
    if (AccessTy->isPointerTy())
      OS << "pointer"; // the full pointer type could be really verbose
    else
      OS << *AccessTy;
  }

  OS << ", Offsets={";
  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
       E = Offsets.end(); I != E; ++I) {
    OS << *I;
    if (llvm::next(I) != E)
      OS << ',';
  }
  OS << '}';

  if (AllFixupsOutsideLoop)
    OS << ", all-fixups-outside-loop";

  if (WidestFixupType)
    OS << ", widest fixup type: " << *WidestFixupType;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRUse::dump() const {
  print(errs()); errs() << '\n';
}
#endif
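
// For reference, a use printed by LSRUse::print might read (hypothetical
// values):
//   LSR Use: Kind=Address of pointer, Offsets={0,8}, widest fixup type: i64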

/// isLegalUse - Test whether the use described by the given addressing-mode
/// parameters is "legal", meaning it can be completely folded into the user
/// instruction at isel time. This includes address-mode folding and special
/// icmp tricks.
static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind,
                       Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset,
                       bool HasBaseReg, int64_t Scale) {
  switch (Kind) {
  case LSRUse::Address:
    return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);

    // Otherwise, just guess that reg+reg addressing is legal.
    //return ;

  case LSRUse::ICmpZero:
    // There's not even a target hook for querying whether it would be legal to
    // fold a GV into an ICmp.
    if (BaseGV)
      return false;

    // ICmp only has two operands; don't allow more than two non-trivial parts.
    if (Scale != 0 && HasBaseReg && BaseOffset != 0)
      return false;

    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
    // putting the scaled register in the other operand of the icmp.
    if (Scale != 0 && Scale != -1)
      return false;

    // If we have low-level target information, ask the target if it can fold an
    // integer immediate on an icmp.
    if (BaseOffset != 0) {
      // We have one of:
      // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
      // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
      // Offs is the ICmp immediate.
      if (Scale == 0)
        // The cast does the right thing with INT64_MIN.
        BaseOffset = -(uint64_t)BaseOffset;
      return TTI.isLegalICmpImmediate(BaseOffset);
    }

    // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
    return true;

  case LSRUse::Basic:
    // Only handle single-register values.
    return !BaseGV && Scale == 0 && BaseOffset == 0;

  case LSRUse::Special:
    // Special case Basic to handle -1 scales.
    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
  }

  llvm_unreachable("Invalid LSRUse Kind!");
}

static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
                       int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
                       GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg,
                       int64_t Scale) {
  // Check for overflow.
  if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
      (MinOffset > 0))
    return false;
  MinOffset = (uint64_t)BaseOffset + MinOffset;
  if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
      (MaxOffset > 0))
    return false;
  MaxOffset = (uint64_t)BaseOffset + MaxOffset;

  return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg,
                    Scale) &&
         isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale);
}

static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
                       int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
                       const Formula &F) {
  return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
                    F.BaseOffset, F.HasBaseReg, F.Scale);
}

static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
                             const Formula &F) {
  // If F is used as an Addressing Mode, it may fold one Base plus one
  // scaled register. If the scaled register is nil, do as if another
  // element of the base regs is a 1-scaled register.
  // This is possible if BaseRegs has at least 2 registers.

  // If this is not an address calculation, this is not an addressing mode
  // use.
  if (LU.Kind != LSRUse::Address)
    return false;

  // F is already scaled.
  if (F.Scale != 0)
    return false;

  // We need to keep one register for the base and one to scale.
  if (F.BaseRegs.size() < 2)
    return false;

  return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
                    F.BaseGV, F.BaseOffset, F.HasBaseReg, 1);
}

static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
                                     const LSRUse &LU, const Formula &F) {
  if (!F.Scale)
    return 0;
  assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
                    LU.AccessTy, F) && "Illegal formula in use.");

  switch (LU.Kind) {
  case LSRUse::Address: {
    // Check the scaling factor cost with both the min and max offsets.
    int ScaleCostMinOffset =
      TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
                               F.BaseOffset + LU.MinOffset,
                               F.HasBaseReg, F.Scale);
    int ScaleCostMaxOffset =
      TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
                               F.BaseOffset + LU.MaxOffset,
                               F.HasBaseReg, F.Scale);

    assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
           "Legal addressing mode has an illegal cost!");
    return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
  }
  case LSRUse::ICmpZero:
    // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg.
    // Therefore, return 0 in case F.Scale == -1.
    return F.Scale != -1;

  case LSRUse::Basic:
  case LSRUse::Special:
    return 0;
  }

  llvm_unreachable("Invalid LSRUse Kind!");
}
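
// A sketch of the ICmpZero folding the functions above reason about, with
// hypothetical IR: a loop exit test of the form
//   %t = add i64 %reg, -16
//   %c = icmp eq i64 %t, 0
// can be emitted as "icmp eq i64 %reg, 16" (the offset folded and negated),
// and with Scale == -1, "icmp eq (%a + -1*%b), 0" becomes "icmp eq %a, %b",
// which is why a -1 scale is costed as free for this kind of use.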

static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                             LSRUse::KindType Kind, Type *AccessTy,
                             GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg) {
  // Fast-path: zero is always foldable.
  if (BaseOffset == 0 && !BaseGV) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

  // Canonicalize a scale of 1 to a base register if the formula doesn't
  // already have a base register.
  if (!HasBaseReg && Scale == 1) {
    Scale = 0;
    HasBaseReg = true;
  }

  return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);
}

static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                             ScalarEvolution &SE, int64_t MinOffset,
                             int64_t MaxOffset, LSRUse::KindType Kind,
                             Type *AccessTy, const SCEV *S, bool HasBaseReg) {
  // Fast-path: zero is always foldable.
  if (S->isZero()) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  int64_t BaseOffset = ExtractImmediate(S, SE);
  GlobalValue *BaseGV = ExtractSymbol(S, SE);

  // If there's anything else involved, it's not foldable.
  if (!S->isZero()) return false;

  // Fast-path: zero is always foldable.
  if (BaseOffset == 0 && !BaseGV) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

  return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
                    BaseOffset, HasBaseReg, Scale);
}

namespace {

/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
struct UseMapDenseMapInfo {
  static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
    return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
  }

  static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
    return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
  }

  static unsigned
  getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
    unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
    Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
    return Result;
  }

  static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
                      const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
    return LHS == RHS;
  }
};

/// IVInc - An individual increment in a Chain of IV increments.
/// Relate an IV user to an expression that computes the IV it uses from the IV
/// used by the previous link in the Chain.
///
/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
/// original IVOperand. The head of the chain's IVOperand is only valid during
/// chain collection, before LSR replaces IV users. During chain generation,
/// IncExpr can be used to find the new IVOperand that computes the same
/// expression.
struct IVInc {
  Instruction *UserInst;
  Value* IVOperand;
  const SCEV *IncExpr;

  IVInc(Instruction *U, Value *O, const SCEV *E):
    UserInst(U), IVOperand(O), IncExpr(E) {}
};
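
// A hypothetical chain over i8* for concreteness: with
//   %p1 = phi i8* ...
//   %p2 = getelementptr i8* %p1, i64 4
//   %p3 = getelementptr i8* %p2, i64 4
// the head IVInc holds %p1's full expression in IncExpr, while the links for
// %p2 and %p3 each record an IncExpr of just +4 relative to the previous
// link, matching the description above.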

// IVChain - The list of IV increments in program order.
// We typically add the head of a chain without finding subsequent links.
struct IVChain {
  SmallVector<IVInc,1> Incs;
  const SCEV *ExprBase;

  IVChain() : ExprBase(0) {}

  IVChain(const IVInc &Head, const SCEV *Base)
    : Incs(1, Head), ExprBase(Base) {}

  typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;

  // begin - return the first increment in the chain.
  const_iterator begin() const {
    assert(!Incs.empty());
    return llvm::next(Incs.begin());
  }
  const_iterator end() const {
    return Incs.end();
  }

  // hasIncs - Returns true if this chain contains any increments.
  bool hasIncs() const { return Incs.size() >= 2; }

  // add - Add an IVInc to the end of this chain.
  void add(const IVInc &X) { Incs.push_back(X); }

  // tailUserInst - Returns the last UserInst in the chain.
  Instruction *tailUserInst() const { return Incs.back().UserInst; }

  // isProfitableIncrement - Returns true if IncExpr can be profitably added to
  // this chain.
  bool isProfitableIncrement(const SCEV *OperExpr,
                             const SCEV *IncExpr,
                             ScalarEvolution&);
};

/// ChainUsers - Helper for CollectChains to track multiple IV increment uses.
/// Distinguish between FarUsers that definitely cross IV increments and
/// NearUsers that may be used between IV increments.
struct ChainUsers {
  SmallPtrSet<Instruction*, 4> FarUsers;
  SmallPtrSet<Instruction*, 4> NearUsers;
};
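
// Note the asymmetry in IVChain above: Incs always stores the head at
// position 0, but begin() starts at the second element, so iterating a chain
// visits only the true increments. Roughly:
//   for (IVChain::const_iterator I = C.begin(), E = C.end(); I != E; ++I)
//     ... // never sees the head; the range is empty unless C.hasIncs()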
1636 SmallPtrSet<Use*, MaxChains> IVIncSet; 1637 1638 void OptimizeShadowIV(); 1639 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse); 1640 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse); 1641 void OptimizeLoopTermCond(); 1642 1643 void ChainInstruction(Instruction *UserInst, Instruction *IVOper, 1644 SmallVectorImpl<ChainUsers> &ChainUsersVec); 1645 void FinalizeChain(IVChain &Chain); 1646 void CollectChains(); 1647 void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, 1648 SmallVectorImpl<WeakVH> &DeadInsts); 1649 1650 void CollectInterestingTypesAndFactors(); 1651 void CollectFixupsAndInitialFormulae(); 1652 1653 LSRFixup &getNewFixup() { 1654 Fixups.push_back(LSRFixup()); 1655 return Fixups.back(); 1656 } 1657 1658 // Support for sharing of LSRUses between LSRFixups. 1659 typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>, 1660 size_t, 1661 UseMapDenseMapInfo> UseMapTy; 1662 UseMapTy UseMap; 1663 1664 bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, 1665 LSRUse::KindType Kind, Type *AccessTy); 1666 1667 std::pair<size_t, int64_t> getUse(const SCEV *&Expr, 1668 LSRUse::KindType Kind, 1669 Type *AccessTy); 1670 1671 void DeleteUse(LSRUse &LU, size_t LUIdx); 1672 1673 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU); 1674 1675 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); 1676 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); 1677 void CountRegisters(const Formula &F, size_t LUIdx); 1678 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F); 1679 1680 void CollectLoopInvariantFixupsAndFormulae(); 1681 1682 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, 1683 unsigned Depth = 0); 1684 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base); 1685 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); 1686 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); 1687 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); 1688 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base); 1689 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base); 1690 void GenerateCrossUseConstantOffsets(); 1691 void GenerateAllReuseFormulae(); 1692 1693 void FilterOutUndesirableDedicatedRegisters(); 1694 1695 size_t EstimateSearchSpaceComplexity() const; 1696 void NarrowSearchSpaceByDetectingSupersets(); 1697 void NarrowSearchSpaceByCollapsingUnrolledCode(); 1698 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); 1699 void NarrowSearchSpaceByPickingWinnerRegs(); 1700 void NarrowSearchSpaceUsingHeuristics(); 1701 1702 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution, 1703 Cost &SolutionCost, 1704 SmallVectorImpl<const Formula *> &Workspace, 1705 const Cost &CurCost, 1706 const SmallPtrSet<const SCEV *, 16> &CurRegs, 1707 DenseSet<const SCEV *> &VisitedRegs) const; 1708 void Solve(SmallVectorImpl<const Formula *> &Solution) const; 1709 1710 BasicBlock::iterator 1711 HoistInsertPosition(BasicBlock::iterator IP, 1712 const SmallVectorImpl<Instruction *> &Inputs) const; 1713 BasicBlock::iterator 1714 AdjustInsertPositionForExpand(BasicBlock::iterator IP, 1715 const LSRFixup &LF, 1716 const LSRUse &LU, 1717 SCEVExpander &Rewriter) const; 1718 1719 Value *Expand(const LSRFixup &LF, 1720 const Formula &F, 1721 BasicBlock::iterator IP, 1722 SCEVExpander &Rewriter, 1723 SmallVectorImpl<WeakVH> &DeadInsts) const; 1724 void RewriteForPHI(PHINode *PN, 
const LSRFixup &LF, 1725 const Formula &F, 1726 SCEVExpander &Rewriter, 1727 SmallVectorImpl<WeakVH> &DeadInsts, 1728 Pass *P) const; 1729 void Rewrite(const LSRFixup &LF, 1730 const Formula &F, 1731 SCEVExpander &Rewriter, 1732 SmallVectorImpl<WeakVH> &DeadInsts, 1733 Pass *P) const; 1734 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, 1735 Pass *P); 1736 1737 public: 1738 LSRInstance(Loop *L, Pass *P); 1739 1740 bool getChanged() const { return Changed; } 1741 1742 void print_factors_and_types(raw_ostream &OS) const; 1743 void print_fixups(raw_ostream &OS) const; 1744 void print_uses(raw_ostream &OS) const; 1745 void print(raw_ostream &OS) const; 1746 void dump() const; 1747 }; 1748 1749 } 1750 1751 /// OptimizeShadowIV - If IV is used in an int-to-float cast 1752 /// inside the loop then try to eliminate the cast operation. 1753 void LSRInstance::OptimizeShadowIV() { 1754 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); 1755 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) 1756 return; 1757 1758 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); 1759 UI != E; /* empty */) { 1760 IVUsers::const_iterator CandidateUI = UI; 1761 ++UI; 1762 Instruction *ShadowUse = CandidateUI->getUser(); 1763 Type *DestTy = 0; 1764 bool IsSigned = false; 1765 1766 /* If shadow use is an int->float cast then insert a second IV 1767 to eliminate this cast. 1768 1769 for (unsigned i = 0; i < n; ++i) 1770 foo((double)i); 1771 1772 is transformed into 1773 1774 double d = 0.0; 1775 for (unsigned i = 0; i < n; ++i, ++d) 1776 foo(d); 1777 */ 1778 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) { 1779 IsSigned = false; 1780 DestTy = UCast->getDestTy(); 1781 } 1782 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) { 1783 IsSigned = true; 1784 DestTy = SCast->getDestTy(); 1785 } 1786 if (!DestTy) continue; 1787 1788 // If the target does not support DestTy natively then do not apply 1789 // this transformation. 1790 if (!TTI.isTypeLegal(DestTy)) continue; 1791 1792 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0)); 1793 if (!PH) continue; 1794 if (PH->getNumIncomingValues() != 2) continue; 1795 1796 Type *SrcTy = PH->getType(); 1797 int Mantissa = DestTy->getFPMantissaWidth(); 1798 if (Mantissa == -1) continue; 1799 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa) 1800 continue; 1801 1802 unsigned Entry, Latch; 1803 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) { 1804 Entry = 0; 1805 Latch = 1; 1806 } else { 1807 Entry = 1; 1808 Latch = 0; 1809 } 1810 1811 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry)); 1812 if (!Init) continue; 1813 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ? 1814 (double)Init->getSExtValue() : 1815 (double)Init->getZExtValue()); 1816 1817 BinaryOperator *Incr = 1818 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch)); 1819 if (!Incr) continue; 1820 if (Incr->getOpcode() != Instruction::Add 1821 && Incr->getOpcode() != Instruction::Sub) 1822 continue; 1823 1824 /* Initialize new IV, double d = 0.0 in above example. */ 1825 ConstantInt *C = 0; 1826 if (Incr->getOperand(0) == PH) 1827 C = dyn_cast<ConstantInt>(Incr->getOperand(1)); 1828 else if (Incr->getOperand(1) == PH) 1829 C = dyn_cast<ConstantInt>(Incr->getOperand(0)); 1830 else 1831 continue; 1832 1833 if (!C) continue; 1834 1835 // Ignore negative constants, as the code below doesn't handle them 1836 // correctly. TODO: Remove this restriction.
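// (For illustration: with a hypothetical C == -2, the ConstantFP::get call
// below would go through getZExtValue() and materialize a huge positive
// floating-point step instead of -2.0, which is why negative constants are
// rejected here.)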
1837 if (!C->getValue().isStrictlyPositive()) continue; 1838 1839 /* Add new PHINode. */ 1840 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH); 1841 1842 /* Create new increment. '++d' in above example. */ 1843 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); 1844 BinaryOperator *NewIncr = 1845 BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ? 1846 Instruction::FAdd : Instruction::FSub, 1847 NewPH, CFP, "IV.S.next.", Incr); 1848 1849 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); 1850 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch)); 1851 1852 /* Remove cast operation */ 1853 ShadowUse->replaceAllUsesWith(NewPH); 1854 ShadowUse->eraseFromParent(); 1855 Changed = true; 1856 break; 1857 } 1858 } 1859 1860 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV, 1861 /// set the IV user and stride information and return true, otherwise return 1862 /// false. 1863 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { 1864 for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) 1865 if (UI->getUser() == Cond) { 1866 // NOTE: we could handle setcc instructions with multiple uses here, but 1867 // InstCombine does it as well for simple uses; it's not clear that it 1868 // occurs often enough in real life to be worth handling. 1869 CondUse = UI; 1870 return true; 1871 } 1872 return false; 1873 } 1874 1875 /// OptimizeMax - Rewrite the loop's terminating condition if it uses 1876 /// a max computation. 1877 /// 1878 /// This is a narrow solution to a specific, but acute, problem. For loops 1879 /// like this: 1880 /// 1881 /// i = 0; 1882 /// do { 1883 /// p[i] = 0.0; 1884 /// } while (++i < n); 1885 /// 1886 /// the trip count isn't just 'n', because 'n' might not be positive. And 1887 /// unfortunately this can come up even for loops where the user didn't use 1888 /// a C do-while loop. For example, seemingly well-behaved top-test loops 1889 /// will commonly be lowered like this: 1890 /// 1891 /// if (n > 0) { 1892 /// i = 0; 1893 /// do { 1894 /// p[i] = 0.0; 1895 /// } while (++i < n); 1896 /// } 1897 /// 1898 /// and then it's possible for subsequent optimization to obscure the if 1899 /// test in such a way that indvars can't find it. 1900 /// 1901 /// When indvars can't find the if test in loops like this, it creates a 1902 /// max expression, which allows it to give the loop a canonical 1903 /// induction variable: 1904 /// 1905 /// i = 0; 1906 /// max = n < 1 ? 1 : n; 1907 /// do { 1908 /// p[i] = 0.0; 1909 /// } while (++i != max); 1910 /// 1911 /// Canonical induction variables are necessary because the loop passes 1912 /// are designed around them. The most obvious example of this is the 1913 /// LoopInfo analysis, which doesn't remember trip count values. It 1914 /// expects to be able to rediscover the trip count each time it is 1915 /// needed, and it does this using a simple analysis that only succeeds if 1916 /// the loop has a canonical induction variable. 1917 /// 1918 /// However, when it comes time to generate code, the maximum operation 1919 /// can be quite costly, especially if it's inside of an outer loop. 1920 /// 1921 /// This function solves this problem by detecting loops of this type and 1922 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting 1923 /// the instructions for the maximum computation. 1924 /// 1925 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { 1926 // Check that the loop matches the pattern we're looking for.
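// (That is: an eq/ne exit comparison whose right-hand operand is a one-use
// select materializing the smax/umax trip count, as in the examples above.)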
1927 if (Cond->getPredicate() != CmpInst::ICMP_EQ && 1928 Cond->getPredicate() != CmpInst::ICMP_NE) 1929 return Cond; 1930 1931 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1)); 1932 if (!Sel || !Sel->hasOneUse()) return Cond; 1933 1934 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); 1935 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) 1936 return Cond; 1937 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1); 1938 1939 // Add one to the backedge-taken count to get the trip count. 1940 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount); 1941 if (IterationCount != SE.getSCEV(Sel)) return Cond; 1942 1943 // Check for a max calculation that matches the pattern. There's no check 1944 // for ICMP_ULE here because the comparison would be with zero, which 1945 // isn't interesting. 1946 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; 1947 const SCEVNAryExpr *Max = 0; 1948 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) { 1949 Pred = ICmpInst::ICMP_SLE; 1950 Max = S; 1951 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) { 1952 Pred = ICmpInst::ICMP_SLT; 1953 Max = S; 1954 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) { 1955 Pred = ICmpInst::ICMP_ULT; 1956 Max = U; 1957 } else { 1958 // No match; bail. 1959 return Cond; 1960 } 1961 1962 // To handle a max with more than two operands, this optimization would 1963 // require additional checking and setup. 1964 if (Max->getNumOperands() != 2) 1965 return Cond; 1966 1967 const SCEV *MaxLHS = Max->getOperand(0); 1968 const SCEV *MaxRHS = Max->getOperand(1); 1969 1970 // ScalarEvolution canonicalizes constants to the left. For < and >, look 1971 // for a comparison with 1. For <= and >=, a comparison with zero. 1972 if (!MaxLHS || 1973 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One))) 1974 return Cond; 1975 1976 // Check the relevant induction variable for conformance to 1977 // the pattern. 1978 const SCEV *IV = SE.getSCEV(Cond->getOperand(0)); 1979 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV); 1980 if (!AR || !AR->isAffine() || 1981 AR->getStart() != One || 1982 AR->getStepRecurrence(SE) != One) 1983 return Cond; 1984 1985 assert(AR->getLoop() == L && 1986 "Loop condition operand is an addrec in a different loop!"); 1987 1988 // Check the right operand of the select, and remember it, as it will 1989 // be used in the new comparison instruction. 1990 Value *NewRHS = 0; 1991 if (ICmpInst::isTrueWhenEqual(Pred)) { 1992 // Look for n+1, and grab n. 1993 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1))) 1994 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1))) 1995 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS) 1996 NewRHS = BO->getOperand(0); 1997 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2))) 1998 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1))) 1999 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS) 2000 NewRHS = BO->getOperand(0); 2001 if (!NewRHS) 2002 return Cond; 2003 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) 2004 NewRHS = Sel->getOperand(1); 2005 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS) 2006 NewRHS = Sel->getOperand(2); 2007 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS)) 2008 NewRHS = SU->getValue(); 2009 else 2010 // Max doesn't match expected pattern. 2011 return Cond; 2012 2013 // Determine the new comparison opcode. 
It may be signed or unsigned, 2014 // and the original comparison may be either equality or inequality. 2015 if (Cond->getPredicate() == CmpInst::ICMP_EQ) 2016 Pred = CmpInst::getInversePredicate(Pred); 2017 2018 // OK, everything looks fine to change the condition into an SLT or SGE and 2019 // delete the max calculation. 2020 ICmpInst *NewCond = 2021 new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp"); 2022 2023 // Delete the max calculation instructions. 2024 Cond->replaceAllUsesWith(NewCond); 2025 CondUse->setUser(NewCond); 2026 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0)); 2027 Cond->eraseFromParent(); 2028 Sel->eraseFromParent(); 2029 if (Cmp->use_empty()) 2030 Cmp->eraseFromParent(); 2031 return NewCond; 2032 } 2033 2034 /// OptimizeLoopTermCond - Change the loop terminating condition to use the 2035 /// post-inc IV when possible. 2036 void 2037 LSRInstance::OptimizeLoopTermCond() { 2038 SmallPtrSet<Instruction *, 4> PostIncs; 2039 2040 BasicBlock *LatchBlock = L->getLoopLatch(); 2041 SmallVector<BasicBlock*, 8> ExitingBlocks; 2042 L->getExitingBlocks(ExitingBlocks); 2043 2044 for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { 2045 BasicBlock *ExitingBlock = ExitingBlocks[i]; 2046 2047 // Get the terminating condition for the loop if possible. If we 2048 // can, we want to change it to use a post-incremented version of its 2049 // induction variable, to allow coalescing the live ranges for the IV into 2050 // one register value. 2051 2052 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator()); 2053 if (!TermBr) 2054 continue; 2055 // FIXME: Overly conservative, termination condition could be an 'or' etc. 2056 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition())) 2057 continue; 2058 2059 // Search IVUsesByStride to find Cond's IVUse if there is one. 2060 IVStrideUse *CondUse = 0; 2061 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition()); 2062 if (!FindIVUserForCond(Cond, CondUse)) 2063 continue; 2064 2065 // If the trip count is computed in terms of a max (due to ScalarEvolution 2066 // being unable to find a sufficient guard, for example), change the loop 2067 // comparison to use SLT or ULT instead of NE. 2068 // One consequence of doing this now is that it disrupts the count-down 2069 // optimization. That's not always a bad thing though, because in such 2070 // cases it may still be worthwhile to avoid a max. 2071 Cond = OptimizeMax(Cond, CondUse); 2072 2073 // If this exiting block dominates the latch block, it may also use 2074 // the post-inc value if it won't be shared with other uses. 2075 // Check for dominance. 2076 if (!DT.dominates(ExitingBlock, LatchBlock)) 2077 continue; 2078 2079 // Conservatively avoid trying to use the post-inc value in non-latch 2080 // exits if there may be pre-inc users in intervening blocks. 2081 if (LatchBlock != ExitingBlock) 2082 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) 2083 // Test if the use is reachable from the exiting block. This dominator 2084 // query is a conservative approximation of reachability. 2085 if (&*UI != CondUse && 2086 !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) { 2087 // Conservatively assume there may be reuse if the quotient of their 2088 // strides could be a legal scale.
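// For example (hypothetical strides): if CondUse strides by 8 and this use
// strides by 16, the quotient 2 may be foldable as an addressing-mode scale
// (base + 2*reg), in which case sharing the pre-inc value is likely cheaper
// than switching this comparison to the post-inc value.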
2089 const SCEV *A = IU.getStride(*CondUse, L); 2090 const SCEV *B = IU.getStride(*UI, L); 2091 if (!A || !B) continue; 2092 if (SE.getTypeSizeInBits(A->getType()) != 2093 SE.getTypeSizeInBits(B->getType())) { 2094 if (SE.getTypeSizeInBits(A->getType()) > 2095 SE.getTypeSizeInBits(B->getType())) 2096 B = SE.getSignExtendExpr(B, A->getType()); 2097 else 2098 A = SE.getSignExtendExpr(A, B->getType()); 2099 } 2100 if (const SCEVConstant *D = 2101 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) { 2102 const ConstantInt *C = D->getValue(); 2103 // Stride of one or negative one can have reuse with non-addresses. 2104 if (C->isOne() || C->isAllOnesValue()) 2105 goto decline_post_inc; 2106 // Avoid weird situations: quotients needing 64 or more signed bits, or the minimum signed value, whose negation overflows. 2107 if (C->getValue().getMinSignedBits() >= 64 || 2108 C->getValue().isMinSignedValue()) 2109 goto decline_post_inc; 2110 // Check for possible scaled-address reuse. 2111 Type *AccessTy = getAccessType(UI->getUser()); 2112 int64_t Scale = C->getSExtValue(); 2113 if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, 2114 /*BaseOffset=*/ 0, 2115 /*HasBaseReg=*/ false, Scale)) 2116 goto decline_post_inc; 2117 Scale = -Scale; 2118 if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, 2119 /*BaseOffset=*/ 0, 2120 /*HasBaseReg=*/ false, Scale)) 2121 goto decline_post_inc; 2122 } 2123 } 2124 2125 DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: " 2126 << *Cond << '\n'); 2127 2128 // It's possible for the setcc instruction to be anywhere in the loop, and 2129 // possible for it to have multiple users. If it is not immediately before 2130 // the exiting block branch, move it. 2131 if (&*++BasicBlock::iterator(Cond) != TermBr) { 2132 if (Cond->hasOneUse()) { 2133 Cond->moveBefore(TermBr); 2134 } else { 2135 // Clone the terminating condition and insert it at the end of the loop. 2136 ICmpInst *OldCond = Cond; 2137 Cond = cast<ICmpInst>(Cond->clone()); 2138 Cond->setName(L->getHeader()->getName() + ".termcond"); 2139 ExitingBlock->getInstList().insert(TermBr, Cond); 2140 2141 // Clone the IVUse, as the old use still exists! 2142 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); 2143 TermBr->replaceUsesOfWith(OldCond, Cond); 2144 } 2145 } 2146 2147 // If we get to here, we know that we can transform the setcc instruction to 2148 // use the post-incremented version of the IV, allowing us to coalesce the 2149 // live ranges for the IV correctly. 2150 CondUse->transformToPostInc(L); 2151 Changed = true; 2152 2153 PostIncs.insert(Cond); 2154 decline_post_inc:; 2155 } 2156 2157 // Determine an insertion point for the loop induction variable increment. It 2158 // must dominate all the post-inc comparisons we just set up, and it must 2159 // dominate the loop latch edge. 2160 IVIncInsertPos = L->getLoopLatch()->getTerminator(); 2161 for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(), 2162 E = PostIncs.end(); I != E; ++I) { 2163 BasicBlock *BB = 2164 DT.findNearestCommonDominator(IVIncInsertPos->getParent(), 2165 (*I)->getParent()); 2166 if (BB == (*I)->getParent()) 2167 IVIncInsertPos = *I; 2168 else if (BB != IVIncInsertPos->getParent()) 2169 IVIncInsertPos = BB->getTerminator(); 2170 } 2171 } 2172 2173 /// reconcileNewOffset - Determine if the given use can accommodate a fixup 2174 /// at the given offset and other details. If so, update the use and 2175 /// return true.
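///
/// Illustrative numbers (not from the source): if LU currently spans offsets
/// [0, 8] and a new fixup needs offset -16, the use is widened to [-16, 8]
/// only if an offset of MaxOffset - NewOffset == 24 is still always foldable
/// for this kind of use.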
2176 bool 2177 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, 2178 LSRUse::KindType Kind, Type *AccessTy) { 2179 int64_t NewMinOffset = LU.MinOffset; 2180 int64_t NewMaxOffset = LU.MaxOffset; 2181 Type *NewAccessTy = AccessTy; 2182 2183 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to 2184 // something conservative, however this can pessimize in the case that one of 2185 // the uses will have all its uses outside the loop, for example. 2186 if (LU.Kind != Kind) 2187 return false; 2188 // Conservatively assume HasBaseReg is true for now. 2189 if (NewOffset < LU.MinOffset) { 2190 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, 2191 LU.MaxOffset - NewOffset, HasBaseReg)) 2192 return false; 2193 NewMinOffset = NewOffset; 2194 } else if (NewOffset > LU.MaxOffset) { 2195 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, 2196 NewOffset - LU.MinOffset, HasBaseReg)) 2197 return false; 2198 NewMaxOffset = NewOffset; 2199 } 2200 // Check for a mismatched access type, and fall back conservatively as needed. 2201 // TODO: Be less conservative when the type is similar and can use the same 2202 // addressing modes. 2203 if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) 2204 NewAccessTy = Type::getVoidTy(AccessTy->getContext()); 2205 2206 // Update the use. 2207 LU.MinOffset = NewMinOffset; 2208 LU.MaxOffset = NewMaxOffset; 2209 LU.AccessTy = NewAccessTy; 2210 if (NewOffset != LU.Offsets.back()) 2211 LU.Offsets.push_back(NewOffset); 2212 return true; 2213 } 2214 2215 /// getUse - Return an LSRUse index and an offset value for a fixup which 2216 /// needs the given expression, with the given kind and optional access type. 2217 /// Either reuse an existing use or create a new one, as needed. 2218 std::pair<size_t, int64_t> 2219 LSRInstance::getUse(const SCEV *&Expr, 2220 LSRUse::KindType Kind, Type *AccessTy) { 2221 const SCEV *Copy = Expr; 2222 int64_t Offset = ExtractImmediate(Expr, SE); 2223 2224 // Basic uses can't accept any offset, for example. 2225 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, 2226 Offset, /*HasBaseReg=*/ true)) { 2227 Expr = Copy; 2228 Offset = 0; 2229 } 2230 2231 std::pair<UseMapTy::iterator, bool> P = 2232 UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0)); 2233 if (!P.second) { 2234 // A use already existed with this base. 2235 size_t LUIdx = P.first->second; 2236 LSRUse &LU = Uses[LUIdx]; 2237 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy)) 2238 // Reuse this use. 2239 return std::make_pair(LUIdx, Offset); 2240 } 2241 2242 // Create a new use. 2243 size_t LUIdx = Uses.size(); 2244 P.first->second = LUIdx; 2245 Uses.push_back(LSRUse(Kind, AccessTy)); 2246 LSRUse &LU = Uses[LUIdx]; 2247 2248 // We don't need to track redundant offsets, but we don't need to go out 2249 // of our way here to avoid them. 2250 if (LU.Offsets.empty() || Offset != LU.Offsets.back()) 2251 LU.Offsets.push_back(Offset); 2252 2253 LU.MinOffset = Offset; 2254 LU.MaxOffset = Offset; 2255 return std::make_pair(LUIdx, Offset); 2256 } 2257 2258 /// DeleteUse - Delete the given use from the Uses list. 2259 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) { 2260 if (&LU != &Uses.back()) 2261 std::swap(LU, Uses.back()); 2262 Uses.pop_back(); 2263 2264 // Update RegUses. 2265 RegUses.SwapAndDropUse(LUIdx, Uses.size()); 2266 } 2267 2268 /// FindUseWithSimilarFormula - Look for a use distinct from OrigLU which has 2269 /// a formula with the same registers as the given formula.
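///
/// For example (illustrative): if OrigF is reg(%a) + 4, a use holding the
/// formula reg(%a) with BaseOffset == 0 is a match, since the caller can then
/// fold the constant difference into that use's fixup offsets.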
2270LSRUse * 2271LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, 2272 const LSRUse &OrigLU) { 2273 // Search all uses for the formula. This could be more clever. 2274 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 2275 LSRUse &LU = Uses[LUIdx]; 2276 // Check whether this use is close enough to OrigLU, to see whether it's 2277 // worthwhile looking through its formulae. 2278 // Ignore ICmpZero uses because they may contain formulae generated by 2279 // GenerateICmpZeroScales, in which case adding fixup offsets may 2280 // be invalid. 2281 if (&LU != &OrigLU && 2282 LU.Kind != LSRUse::ICmpZero && 2283 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy && 2284 LU.WidestFixupType == OrigLU.WidestFixupType && 2285 LU.HasFormulaWithSameRegs(OrigF)) { 2286 // Scan through this use's formulae. 2287 for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), 2288 E = LU.Formulae.end(); I != E; ++I) { 2289 const Formula &F = *I; 2290 // Check to see if this formula has the same registers and symbols 2291 // as OrigF. 2292 if (F.BaseRegs == OrigF.BaseRegs && 2293 F.ScaledReg == OrigF.ScaledReg && 2294 F.BaseGV == OrigF.BaseGV && 2295 F.Scale == OrigF.Scale && 2296 F.UnfoldedOffset == OrigF.UnfoldedOffset) { 2297 if (F.BaseOffset == 0) 2298 return &LU; 2299 // This is the formula where all the registers and symbols matched; 2300 // there aren't going to be any others. Since we declined it, we 2301 // can skip the rest of the formulae and proceed to the next LSRUse. 2302 break; 2303 } 2304 } 2305 } 2306 } 2307 2308 // Nothing looked good. 2309 return 0; 2310} 2311 2312void LSRInstance::CollectInterestingTypesAndFactors() { 2313 SmallSetVector<const SCEV *, 4> Strides; 2314 2315 // Collect interesting types and strides. 2316 SmallVector<const SCEV *, 4> Worklist; 2317 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { 2318 const SCEV *Expr = IU.getExpr(*UI); 2319 2320 // Collect interesting types. 2321 Types.insert(SE.getEffectiveSCEVType(Expr->getType())); 2322 2323 // Add strides for mentioned loops. 2324 Worklist.push_back(Expr); 2325 do { 2326 const SCEV *S = Worklist.pop_back_val(); 2327 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 2328 if (AR->getLoop() == L) 2329 Strides.insert(AR->getStepRecurrence(SE)); 2330 Worklist.push_back(AR->getStart()); 2331 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 2332 Worklist.append(Add->op_begin(), Add->op_end()); 2333 } 2334 } while (!Worklist.empty()); 2335 } 2336 2337 // Compute interesting factors from the set of interesting strides. 
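// For example (hypothetical strides): strides {1, 4} contribute the factor 4,
// and strides {2, 6} contribute 3, since 6 divides exactly by 2; strides
// {3, 5} contribute nothing, as neither divides the other exactly.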
2338 for (SmallSetVector<const SCEV *, 4>::const_iterator 2339 I = Strides.begin(), E = Strides.end(); I != E; ++I) 2340 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter = 2341 llvm::next(I); NewStrideIter != E; ++NewStrideIter) { 2342 const SCEV *OldStride = *I; 2343 const SCEV *NewStride = *NewStrideIter; 2344 2345 if (SE.getTypeSizeInBits(OldStride->getType()) != 2346 SE.getTypeSizeInBits(NewStride->getType())) { 2347 if (SE.getTypeSizeInBits(OldStride->getType()) > 2348 SE.getTypeSizeInBits(NewStride->getType())) 2349 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType()); 2350 else 2351 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType()); 2352 } 2353 if (const SCEVConstant *Factor = 2354 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride, 2355 SE, true))) { 2356 if (Factor->getValue()->getValue().getMinSignedBits() <= 64) 2357 Factors.insert(Factor->getValue()->getValue().getSExtValue()); 2358 } else if (const SCEVConstant *Factor = 2359 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride, 2360 NewStride, 2361 SE, true))) { 2362 if (Factor->getValue()->getValue().getMinSignedBits() <= 64) 2363 Factors.insert(Factor->getValue()->getValue().getSExtValue()); 2364 } 2365 } 2366 2367 // If all uses use the same type, don't bother looking for truncation-based 2368 // reuse. 2369 if (Types.size() == 1) 2370 Types.clear(); 2371 2372 DEBUG(print_factors_and_types(dbgs())); 2373 } 2374 2375 /// findIVOperand - Helper for CollectChains that finds an IV operand (computed 2376 /// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped 2377 /// Instructions to IVStrideUses, we could partially skip this. 2378 static User::op_iterator 2379 findIVOperand(User::op_iterator OI, User::op_iterator OE, 2380 Loop *L, ScalarEvolution &SE) { 2381 for (; OI != OE; ++OI) { 2382 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) { 2383 if (!SE.isSCEVable(Oper->getType())) 2384 continue; 2385 2386 if (const SCEVAddRecExpr *AR = 2387 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) { 2388 if (AR->getLoop() == L) 2389 break; 2390 } 2391 } 2392 } 2393 return OI; 2394 } 2395 2396 /// getWideOperand - IVChain logic must consistently peek at base TruncInst 2397 /// operands, so wrap it in a convenient helper. 2398 static Value *getWideOperand(Value *Oper) { 2399 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper)) 2400 return Trunc->getOperand(0); 2401 return Oper; 2402 } 2403 2404 /// isCompatibleIVType - Return true if we allow an IV chain to include both 2405 /// types. 2406 static bool isCompatibleIVType(Value *LVal, Value *RVal) { 2407 Type *LType = LVal->getType(); 2408 Type *RType = RVal->getType(); 2409 return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy()); 2410 } 2411 2412 /// getExprBase - Return an approximation of this SCEV expression's "base", or 2413 /// NULL for any constant. Returning the expression itself is 2414 /// conservative. Returning a deeper subexpression is more precise and valid as 2415 /// long as it isn't less complex than another subexpression. For expressions 2416 /// involving multiple unscaled values, we need to return the pointer-type 2417 /// SCEVUnknown. This avoids forming chains across objects, such as: 2418 /// PrevOper==a[i], IVOper==b[i], IVInc==b-a. 2419 /// 2420 /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost 2421 /// SCEVUnknown, we simply return the rightmost SCEV operand.
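///
/// For example (illustrative): for {(16 + (4 * %n) + %a),+,8} we recurse into
/// the addrec's start, then walk the add's operands from the right, skipping
/// scaled (mul) terms; the rightmost unscaled operand %a becomes the base.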
2422 static const SCEV *getExprBase(const SCEV *S) { 2423 switch (S->getSCEVType()) { 2424 default: // including scUnknown. 2425 return S; 2426 case scConstant: 2427 return 0; 2428 case scTruncate: 2429 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand()); 2430 case scZeroExtend: 2431 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand()); 2432 case scSignExtend: 2433 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand()); 2434 case scAddExpr: { 2435 // Skip over scaled operands (scMulExpr) to follow add operands as long as 2436 // there's nothing more complex. 2437 // FIXME: not sure if we want to recognize negation. 2438 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S); 2439 for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()), 2440 E(Add->op_begin()); I != E; ++I) { 2441 const SCEV *SubExpr = *I; 2442 if (SubExpr->getSCEVType() == scAddExpr) 2443 return getExprBase(SubExpr); 2444 2445 if (SubExpr->getSCEVType() != scMulExpr) 2446 return SubExpr; 2447 } 2448 return S; // all operands are scaled, be conservative. 2449 } 2450 case scAddRecExpr: 2451 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart()); 2452 } 2453 } 2454 2455 /// Return true if the chain increment is profitable to expand into a loop 2456 /// invariant value, which may require its own register. A profitable chain 2457 /// increment will be an offset relative to the same base. We allow such offsets 2458 /// to potentially be used as a chain increment as long as it's not obviously 2459 /// expensive to expand using real instructions. 2460 bool IVChain::isProfitableIncrement(const SCEV *OperExpr, 2461 const SCEV *IncExpr, 2462 ScalarEvolution &SE) { 2463 // Aggressively form chains when -stress-ivchain. 2464 if (StressIVChain) 2465 return true; 2466 2467 // Do not replace a constant offset from IV head with a nonconstant IV 2468 // increment. 2469 if (!isa<SCEVConstant>(IncExpr)) { 2470 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand)); 2471 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr))) 2472 return false; 2473 } 2474 2475 SmallPtrSet<const SCEV*, 8> Processed; 2476 return !isHighCostExpansion(IncExpr, Processed, SE); 2477 } 2478 2479 /// Return true if the number of registers needed for the chain is estimated to 2480 /// be less than the number required for the individual IV users. First prohibit 2481 /// any IV users that keep the IV live across increments (the Users set should 2482 /// be empty). Next count the number and type of increments in the chain. 2483 /// 2484 /// Chaining IVs can lead to considerable code bloat if ISEL doesn't 2485 /// effectively use postinc addressing modes. Only consider it profitable if the 2486 /// increments can be computed in fewer registers when chained. 2487 /// 2488 /// TODO: Consider IVInc free if it's already used in other chains. 2489 static bool 2490 isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, 2491 ScalarEvolution &SE, const TargetTransformInfo &TTI) { 2492 if (StressIVChain) 2493 return true; 2494 2495 if (!Chain.hasIncs()) 2496 return false; 2497 2498 if (!Users.empty()) { 2499 DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n"; 2500 for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(), 2501 E = Users.end(); I != E; ++I) { 2502 dbgs() << " " << **I << "\n"; 2503 }); 2504 return false; 2505 } 2506 assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); 2507 2508 // The chain itself may require a register, so initialize cost to 1.
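// Worked example (hypothetical chain): a head plus two constant increments
// {4, 4} yields NumConstIncrements == 2, so the cost below ends at
// 1 - 1 == 0 and the chain is rejected; if the tail is also a header phi
// completing the chain, the extra decrement brings the cost to -1 and the
// chain is kept.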
2509 int cost = 1; 2510 2511 // A complete chain likely eliminates the need for keeping the original IV in 2512 // a register. LSR does not currently know how to form a complete chain unless 2513 // the header phi already exists. 2514 if (isa<PHINode>(Chain.tailUserInst()) 2515 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) { 2516 --cost; 2517 } 2518 const SCEV *LastIncExpr = 0; 2519 unsigned NumConstIncrements = 0; 2520 unsigned NumVarIncrements = 0; 2521 unsigned NumReusedIncrements = 0; 2522 for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); 2523 I != E; ++I) { 2524 2525 if (I->IncExpr->isZero()) 2526 continue; 2527 2528 // Incrementing by zero or some constant is neutral. We assume constants can 2529 // be folded into an addressing mode or an add's immediate operand. 2530 if (isa<SCEVConstant>(I->IncExpr)) { 2531 ++NumConstIncrements; 2532 continue; 2533 } 2534 2535 if (I->IncExpr == LastIncExpr) 2536 ++NumReusedIncrements; 2537 else 2538 ++NumVarIncrements; 2539 2540 LastIncExpr = I->IncExpr; 2541 } 2542 // An IV chain with a single increment is handled by LSR's postinc 2543 // uses. However, a chain with multiple increments requires keeping the IV's 2544 // value live longer than it needs to be if chained. 2545 if (NumConstIncrements > 1) 2546 --cost; 2547 2548 // Materializing increment expressions in the preheader that didn't exist in 2549 // the original code may cost a register. For example, sign-extended array 2550 // indices can produce ridiculous increments like this: 2551 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64))) 2552 cost += NumVarIncrements; 2553 2554 // Reusing variable increments likely saves a register to hold the multiple of 2555 // the stride. 2556 cost -= NumReusedIncrements; 2557 2558 DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost 2559 << "\n"); 2560 2561 return cost < 0; 2562} 2563 2564/// ChainInstruction - Add this IV user to an existing chain or make it the head 2565/// of a new chain. 2566void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, 2567 SmallVectorImpl<ChainUsers> &ChainUsersVec) { 2568 // When IVs are used as types of varying widths, they are generally converted 2569 // to a wider type with some uses remaining narrow under a (free) trunc. 2570 Value *const NextIV = getWideOperand(IVOper); 2571 const SCEV *const OperExpr = SE.getSCEV(NextIV); 2572 const SCEV *const OperExprBase = getExprBase(OperExpr); 2573 2574 // Visit all existing chains. Check if its IVOper can be computed as a 2575 // profitable loop invariant increment from the last link in the Chain. 2576 unsigned ChainIdx = 0, NChains = IVChainVec.size(); 2577 const SCEV *LastIncExpr = 0; 2578 for (; ChainIdx < NChains; ++ChainIdx) { 2579 IVChain &Chain = IVChainVec[ChainIdx]; 2580 2581 // Prune the solution space aggressively by checking that both IV operands 2582 // are expressions that operate on the same unscaled SCEVUnknown. This 2583 // "base" will be canceled by the subsequent getMinusSCEV call. Checking 2584 // first avoids creating extra SCEV expressions. 2585 if (!StressIVChain && Chain.ExprBase != OperExprBase) 2586 continue; 2587 2588 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand); 2589 if (!isCompatibleIVType(PrevIV, NextIV)) 2590 continue; 2591 2592 // A phi node terminates a chain. 2593 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst())) 2594 continue; 2595 2596 // The increment must be loop-invariant so it can be kept in a register. 
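// For example (hypothetical): an increment of 4*%m is fine when %m is defined
// outside the loop, but a difference that varies per iteration could not be
// hoisted into a single register and would defeat the chain.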
2597 const SCEV *PrevExpr = SE.getSCEV(PrevIV); 2598 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr); 2599 if (!SE.isLoopInvariant(IncExpr, L)) 2600 continue; 2601 2602 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) { 2603 LastIncExpr = IncExpr; 2604 break; 2605 } 2606 } 2607 // If we haven't found a chain, create a new one, unless we hit the max. Don't 2608 // bother for phi nodes, because they must be last in the chain. 2609 if (ChainIdx == NChains) { 2610 if (isa<PHINode>(UserInst)) 2611 return; 2612 if (NChains >= MaxChains && !StressIVChain) { 2613 DEBUG(dbgs() << "IV Chain Limit\n"); 2614 return; 2615 } 2616 LastIncExpr = OperExpr; 2617 // IVUsers may have skipped over sign/zero extensions. We don't currently 2618 // attempt to form chains involving extensions unless they can be hoisted 2619 // into this loop's AddRec. 2620 if (!isa<SCEVAddRecExpr>(LastIncExpr)) 2621 return; 2622 ++NChains; 2623 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr), 2624 OperExprBase)); 2625 ChainUsersVec.resize(NChains); 2626 DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst 2627 << ") IV=" << *LastIncExpr << "\n"); 2628 } else { 2629 DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst 2630 << ") IV+" << *LastIncExpr << "\n"); 2631 // Add this IV user to the end of the chain. 2632 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr)); 2633 } 2634 IVChain &Chain = IVChainVec[ChainIdx]; 2635 2636 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers; 2637 // This chain's NearUsers become FarUsers. 2638 if (!LastIncExpr->isZero()) { 2639 ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(), 2640 NearUsers.end()); 2641 NearUsers.clear(); 2642 } 2643 2644 // All other uses of IVOperand become near uses of the chain. 2645 // We currently ignore intermediate values within SCEV expressions, assuming 2646 // they will eventually be used by the current chain, or can be computed 2647 // from one of the chain increments. To be more precise we could 2648 // transitively follow its users and only add leaf IV users to the set. 2649 for (Value::use_iterator UseIter = IVOper->use_begin(), 2650 UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) { 2651 Instruction *OtherUse = dyn_cast<Instruction>(*UseIter); 2652 if (!OtherUse) 2653 continue; 2654 // Uses in the chain will no longer be uses if the chain is formed. 2655 // Include the head of the chain in this iteration (not Chain.begin()). 2656 IVChain::const_iterator IncIter = Chain.Incs.begin(); 2657 IVChain::const_iterator IncEnd = Chain.Incs.end(); 2658 for (; IncIter != IncEnd; ++IncIter) { 2659 if (IncIter->UserInst == OtherUse) 2660 break; 2661 } 2662 if (IncIter != IncEnd) 2663 continue; 2664 2665 if (SE.isSCEVable(OtherUse->getType()) 2666 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse)) 2667 && IU.isIVUserOrOperand(OtherUse)) { 2668 continue; 2669 } 2670 NearUsers.insert(OtherUse); 2671 } 2672 2673 // Since this user is part of the chain, it's no longer considered a use 2674 // of the chain. 2675 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst); 2676 } 2677 2678 /// CollectChains - Populate the vector of Chains. 2679 /// 2680 /// This decreases ILP at the architecture level. Targets with ample registers, 2681 /// multiple memory ports, and no register renaming probably don't want 2682 /// this. However, such targets should probably disable LSR altogether. 2683 /// 2684 /// The job of LSR is to make a reasonable choice of induction variables across 2685 /// the loop.
Subsequent passes can easily "unchain" computation exposing more 2686 /// ILP *within the loop* if the target wants it. 2687 /// 2688 /// Finding the best IV chain is potentially a scheduling problem. Since LSR 2689 /// will not reorder memory operations, it will recognize this as a chain, but 2690 /// will generate redundant IV increments. Ideally this would be corrected later 2691 /// by a smart scheduler: 2692 /// = A[i] 2693 /// = A[i+x] 2694 /// A[i] = 2695 /// A[i+x] = 2696 /// 2697 /// TODO: Walk the entire domtree within this loop, not just the path to the 2698 /// loop latch. This will discover chains on side paths, but requires 2699 /// maintaining multiple copies of the Chains state. 2700 void LSRInstance::CollectChains() { 2701 DEBUG(dbgs() << "Collecting IV Chains.\n"); 2702 SmallVector<ChainUsers, 8> ChainUsersVec; 2703 2704 SmallVector<BasicBlock *,8> LatchPath; 2705 BasicBlock *LoopHeader = L->getHeader(); 2706 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch()); 2707 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) { 2708 LatchPath.push_back(Rung->getBlock()); 2709 } 2710 LatchPath.push_back(LoopHeader); 2711 2712 // Walk the instruction stream from the loop header to the loop latch. 2713 for (SmallVectorImpl<BasicBlock *>::reverse_iterator 2714 BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend(); 2715 BBIter != BBEnd; ++BBIter) { 2716 for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end(); 2717 I != E; ++I) { 2718 // Skip instructions that weren't seen by IVUsers analysis. 2719 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I)) 2720 continue; 2721 2722 // Ignore users that are part of a SCEV expression. This way we only 2723 // consider leaf IV Users. This effectively rediscovers a portion of 2724 // IVUsers analysis but in program order this time. 2725 if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I))) 2726 continue; 2727 2728 // Remove this instruction from any NearUsers set it may be in. 2729 for (unsigned ChainIdx = 0, NChains = IVChainVec.size(); 2730 ChainIdx < NChains; ++ChainIdx) { 2731 ChainUsersVec[ChainIdx].NearUsers.erase(I); 2732 } 2733 // Search for operands that can be chained. 2734 SmallPtrSet<Instruction*, 4> UniqueOperands; 2735 User::op_iterator IVOpEnd = I->op_end(); 2736 User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE); 2737 while (IVOpIter != IVOpEnd) { 2738 Instruction *IVOpInst = cast<Instruction>(*IVOpIter); 2739 if (UniqueOperands.insert(IVOpInst)) 2740 ChainInstruction(I, IVOpInst, ChainUsersVec); 2741 IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE); 2742 } 2743 } // Continue walking down the instructions. 2744 } // Continue walking down the domtree. 2745 // Visit phi backedges to determine if the chain can generate the IV postinc. 2746 for (BasicBlock::iterator I = L->getHeader()->begin(); 2747 PHINode *PN = dyn_cast<PHINode>(I); ++I) { 2748 if (!SE.isSCEVable(PN->getType())) 2749 continue; 2750 2751 Instruction *IncV = 2752 dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch())); 2753 if (IncV) 2754 ChainInstruction(PN, IncV, ChainUsersVec); 2755 } 2756 // Remove any unprofitable chains. 2757 unsigned ChainIdx = 0; 2758 for (unsigned UsersIdx = 0, NChains = IVChainVec.size(); 2759 UsersIdx < NChains; ++UsersIdx) { 2760 if (!isProfitableChain(IVChainVec[UsersIdx], 2761 ChainUsersVec[UsersIdx].FarUsers, SE, TTI)) 2762 continue; 2763 // Preserve the chain at UsersIdx.
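// (This is an in-place compaction, sketched from the code below: profitable
// chains are copied forward so that IVChainVec[0..ChainIdx) holds exactly the
// finalized chains, and the vector is truncated to that length afterwards.)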
2764 if (ChainIdx != UsersIdx) 2765 IVChainVec[ChainIdx] = IVChainVec[UsersIdx]; 2766 FinalizeChain(IVChainVec[ChainIdx]); 2767 ++ChainIdx; 2768 } 2769 IVChainVec.resize(ChainIdx); 2770 } 2771 2772 void LSRInstance::FinalizeChain(IVChain &Chain) { 2773 assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); 2774 DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n"); 2775 2776 for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); 2777 I != E; ++I) { 2778 DEBUG(dbgs() << " Inc: " << *I->UserInst << "\n"); 2779 User::op_iterator UseI = 2780 std::find(I->UserInst->op_begin(), I->UserInst->op_end(), I->IVOperand); 2781 assert(UseI != I->UserInst->op_end() && "cannot find IV operand"); 2782 IVIncSet.insert(UseI); 2783 } 2784 } 2785 2786 /// Return true if the IVInc can be folded into an addressing mode. 2787 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, 2788 Value *Operand, const TargetTransformInfo &TTI) { 2789 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr); 2790 if (!IncConst || !isAddressUse(UserInst, Operand)) 2791 return false; 2792 2793 if (IncConst->getValue()->getValue().getMinSignedBits() > 64) 2794 return false; 2795 2796 int64_t IncOffset = IncConst->getValue()->getSExtValue(); 2797 if (!isAlwaysFoldable(TTI, LSRUse::Address, 2798 getAccessType(UserInst), /*BaseGV=*/ 0, 2799 IncOffset, /*HasBaseReg=*/ false)) 2800 return false; 2801 2802 return true; 2803 } 2804 2805 /// GenerateIVChain - Generate an add or subtract for each IVInc in a chain to 2806 /// materialize the IV user's operand from the previous IV user's operand. 2807 void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, 2808 SmallVectorImpl<WeakVH> &DeadInsts) { 2809 // Find the new IVOperand for the head of the chain. It may have been replaced 2810 // by LSR. 2811 const IVInc &Head = Chain.Incs[0]; 2812 User::op_iterator IVOpEnd = Head.UserInst->op_end(); 2813 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user. 2814 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), 2815 IVOpEnd, L, SE); 2816 Value *IVSrc = 0; 2817 while (IVOpIter != IVOpEnd) { 2818 IVSrc = getWideOperand(*IVOpIter); 2819 2820 // If this operand computes the expression that the chain needs, we may use 2821 // it. (Check this after setting IVSrc which is used below.) 2822 // 2823 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too 2824 // narrow for the chain, so we can no longer use it. We do allow using a 2825 // wider phi, assuming the LSR checked for free truncation. In that case we 2826 // should already have a truncate on this operand such that 2827 // getSCEV(IVSrc) == IncExpr. 2828 if (SE.getSCEV(*IVOpIter) == Head.IncExpr 2829 || SE.getSCEV(IVSrc) == Head.IncExpr) { 2830 break; 2831 } 2832 IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE); 2833 } 2834 if (IVOpIter == IVOpEnd) { 2835 // Gracefully give up on this chain.
2836 DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n"); 2837 return; 2838 } 2839 2840 DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); 2841 Type *IVTy = IVSrc->getType(); 2842 Type *IntTy = SE.getEffectiveSCEVType(IVTy); 2843 const SCEV *LeftOverExpr = 0; 2844 for (IVChain::const_iterator IncI = Chain.begin(), 2845 IncE = Chain.end(); IncI != IncE; ++IncI) { 2846 2847 Instruction *InsertPt = IncI->UserInst; 2848 if (isa<PHINode>(InsertPt)) 2849 InsertPt = L->getLoopLatch()->getTerminator(); 2850 2851 // IVOper will replace the current IV User's operand. IVSrc is the IV 2852 // value currently held in a register. 2853 Value *IVOper = IVSrc; 2854 if (!IncI->IncExpr->isZero()) { 2855 // IncExpr was the result of subtraction of two narrow values, so must 2856 // be signed. 2857 const SCEV *IncExpr = SE.getNoopOrSignExtend(IncI->IncExpr, IntTy); 2858 LeftOverExpr = LeftOverExpr ? 2859 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr; 2860 } 2861 if (LeftOverExpr && !LeftOverExpr->isZero()) { 2862 // Expand the IV increment. 2863 Rewriter.clearPostInc(); 2864 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt); 2865 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc), 2866 SE.getUnknown(IncV)); 2867 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt); 2868 2869 // If an IV increment can't be folded, use it as the next IV value. 2870 if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand, 2871 TTI)) { 2872 assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); 2873 IVSrc = IVOper; 2874 LeftOverExpr = 0; 2875 } 2876 } 2877 Type *OperTy = IncI->IVOperand->getType(); 2878 if (IVTy != OperTy) { 2879 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) && 2880 "cannot extend a chained IV"); 2881 IRBuilder<> Builder(InsertPt); 2882 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain"); 2883 } 2884 IncI->UserInst->replaceUsesOfWith(IncI->IVOperand, IVOper); 2885 DeadInsts.push_back(IncI->IVOperand); 2886 } 2887 // If LSR created a new, wider phi, we may also replace its postinc. We only 2888 // do this if we also found a wide value for the head of the chain. 2889 if (isa<PHINode>(Chain.tailUserInst())) { 2890 for (BasicBlock::iterator I = L->getHeader()->begin(); 2891 PHINode *Phi = dyn_cast<PHINode>(I); ++I) { 2892 if (!isCompatibleIVType(Phi, IVSrc)) 2893 continue; 2894 Instruction *PostIncV = dyn_cast<Instruction>( 2895 Phi->getIncomingValueForBlock(L->getLoopLatch())); 2896 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc))) 2897 continue; 2898 Value *IVOper = IVSrc; 2899 Type *PostIncTy = PostIncV->getType(); 2900 if (IVTy != PostIncTy) { 2901 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types"); 2902 IRBuilder<> Builder(L->getLoopLatch()->getTerminator()); 2903 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc()); 2904 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain"); 2905 } 2906 Phi->replaceUsesOfWith(PostIncV, IVOper); 2907 DeadInsts.push_back(PostIncV); 2908 } 2909 } 2910} 2911 2912void LSRInstance::CollectFixupsAndInitialFormulae() { 2913 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { 2914 Instruction *UserInst = UI->getUser(); 2915 // Skip IV users that are part of profitable IV Chains. 
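// (These operand uses were recorded in IVIncSet by FinalizeChain; giving them
// ordinary LSR fixups here would duplicate the chained increments.)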
2916 User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(), 2917 UI->getOperandValToReplace()); 2918 assert(UseI != UserInst->op_end() && "cannot find IV operand"); 2919 if (IVIncSet.count(UseI)) 2920 continue; 2921 2922 // Record the uses. 2923 LSRFixup &LF = getNewFixup(); 2924 LF.UserInst = UserInst; 2925 LF.OperandValToReplace = UI->getOperandValToReplace(); 2926 LF.PostIncLoops = UI->getPostIncLoops(); 2927 2928 LSRUse::KindType Kind = LSRUse::Basic; 2929 Type *AccessTy = 0; 2930 if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { 2931 Kind = LSRUse::Address; 2932 AccessTy = getAccessType(LF.UserInst); 2933 } 2934 2935 const SCEV *S = IU.getExpr(*UI); 2936 2937 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as 2938 // (N - i == 0), and this allows (N - i) to be the expression that we work 2939 // with rather than just N or i, so we can consider the register 2940 // requirements for both N and i at the same time. Limiting this code to 2941 // equality icmps is not a problem because all interesting loops use 2942 // equality icmps, thanks to IndVarSimplify. 2943 if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst)) 2944 if (CI->isEquality()) { 2945 // Swap the operands if needed to put the OperandValToReplace on the 2946 // left, for consistency. 2947 Value *NV = CI->getOperand(1); 2948 if (NV == LF.OperandValToReplace) { 2949 CI->setOperand(1, CI->getOperand(0)); 2950 CI->setOperand(0, NV); 2951 NV = CI->getOperand(1); 2952 Changed = true; 2953 } 2954 2955 // x == y --> x - y == 0 2956 const SCEV *N = SE.getSCEV(NV); 2957 if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) { 2958 // S is normalized, so normalize N before folding it into S 2959 // to keep the result normalized. 2960 N = TransformForPostIncUse(Normalize, N, CI, 0, 2961 LF.PostIncLoops, SE, DT); 2962 Kind = LSRUse::ICmpZero; 2963 S = SE.getMinusSCEV(N, S); 2964 } 2965 2966 // -1 and the negations of all interesting strides (except the negation 2967 // of -1) are now also interesting. 2968 for (size_t i = 0, e = Factors.size(); i != e; ++i) 2969 if (Factors[i] != -1) 2970 Factors.insert(-(uint64_t)Factors[i]); 2971 Factors.insert(-1); 2972 } 2973 2974 // Set up the initial formula for this use. 2975 std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy); 2976 LF.LUIdx = P.first; 2977 LF.Offset = P.second; 2978 LSRUse &LU = Uses[LF.LUIdx]; 2979 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); 2980 if (!LU.WidestFixupType || 2981 SE.getTypeSizeInBits(LU.WidestFixupType) < 2982 SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) 2983 LU.WidestFixupType = LF.OperandValToReplace->getType(); 2984 2985 // If this is the first use of this LSRUse, give it a formula. 2986 if (LU.Formulae.empty()) { 2987 InsertInitialFormula(S, LU, LF.LUIdx); 2988 CountRegisters(LU.Formulae.back(), LF.LUIdx); 2989 } 2990 } 2991 2992 DEBUG(print_fixups(dbgs())); 2993} 2994 2995/// InsertInitialFormula - Insert a formula for the given expression into 2996/// the given use, separating out loop-variant portions from loop-invariant 2997/// and loop-computable portions. 2998void 2999LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { 3000 // Mark uses whose expressions cannot be expanded. 
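// (RigidFormula, set just below, effectively makes LSR leave this use's
// original value in place rather than expanding a replacement for it.)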
3001 if (!isSafeToExpand(S, SE)) 3002 LU.RigidFormula = true; 3003 3004 Formula F; 3005 F.InitialMatch(S, L, SE); 3006 bool Inserted = InsertFormula(LU, LUIdx, F); 3007 assert(Inserted && "Initial formula already exists!"); (void)Inserted; 3008} 3009 3010/// InsertSupplementalFormula - Insert a simple single-register formula for 3011/// the given expression into the given use. 3012void 3013LSRInstance::InsertSupplementalFormula(const SCEV *S, 3014 LSRUse &LU, size_t LUIdx) { 3015 Formula F; 3016 F.BaseRegs.push_back(S); 3017 F.HasBaseReg = true; 3018 bool Inserted = InsertFormula(LU, LUIdx, F); 3019 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; 3020} 3021 3022/// CountRegisters - Note which registers are used by the given formula, 3023/// updating RegUses. 3024void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { 3025 if (F.ScaledReg) 3026 RegUses.CountRegister(F.ScaledReg, LUIdx); 3027 for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), 3028 E = F.BaseRegs.end(); I != E; ++I) 3029 RegUses.CountRegister(*I, LUIdx); 3030} 3031 3032/// InsertFormula - If the given formula has not yet been inserted, add it to 3033/// the list, and return true. Return false otherwise. 3034bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { 3035 if (!LU.InsertFormula(F)) 3036 return false; 3037 3038 CountRegisters(F, LUIdx); 3039 return true; 3040} 3041 3042/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of 3043/// loop-invariant values which we're tracking. These other uses will pin these 3044/// values in registers, making them less profitable for elimination. 3045/// TODO: This currently misses non-constant addrec step registers. 3046/// TODO: Should this give more weight to users inside the loop? 3047void 3048LSRInstance::CollectLoopInvariantFixupsAndFormulae() { 3049 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end()); 3050 SmallPtrSet<const SCEV *, 8> Inserted; 3051 3052 while (!Worklist.empty()) { 3053 const SCEV *S = Worklist.pop_back_val(); 3054 3055 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) 3056 Worklist.append(N->op_begin(), N->op_end()); 3057 else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) 3058 Worklist.push_back(C->getOperand()); 3059 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { 3060 Worklist.push_back(D->getLHS()); 3061 Worklist.push_back(D->getRHS()); 3062 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { 3063 if (!Inserted.insert(U)) continue; 3064 const Value *V = U->getValue(); 3065 if (const Instruction *Inst = dyn_cast<Instruction>(V)) { 3066 // Look for instructions defined outside the loop. 3067 if (L->contains(Inst)) continue; 3068 } else if (isa<UndefValue>(V)) 3069 // Undef doesn't have a live range, so it doesn't matter. 3070 continue; 3071 for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); 3072 UI != UE; ++UI) { 3073 const Instruction *UserInst = dyn_cast<Instruction>(*UI); 3074 // Ignore non-instructions. 3075 if (!UserInst) 3076 continue; 3077 // Ignore instructions in other functions (as can happen with 3078 // Constants). 3079 if (UserInst->getParent()->getParent() != L->getHeader()->getParent()) 3080 continue; 3081 // Ignore instructions not dominated by the loop. 3082 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ? 
3083 UserInst->getParent() : 3084 cast<PHINode>(UserInst)->getIncomingBlock( 3085 PHINode::getIncomingValueNumForOperand(UI.getOperandNo())); 3086 if (!DT.dominates(L->getHeader(), UseBB)) 3087 continue; 3088 // Ignore uses which are part of other SCEV expressions, to avoid 3089 // analyzing them multiple times. 3090 if (SE.isSCEVable(UserInst->getType())) { 3091 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst)); 3092 // If the user is a no-op, look through to its uses. 3093 if (!isa<SCEVUnknown>(UserS)) 3094 continue; 3095 if (UserS == U) { 3096 Worklist.push_back( 3097 SE.getUnknown(const_cast<Instruction *>(UserInst))); 3098 continue; 3099 } 3100 } 3101 // Ignore icmp instructions which are already being analyzed. 3102 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { 3103 unsigned OtherIdx = !UI.getOperandNo(); 3104 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); 3105 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) 3106 continue; 3107 } 3108 3109 LSRFixup &LF = getNewFixup(); 3110 LF.UserInst = const_cast<Instruction *>(UserInst); 3111 LF.OperandValToReplace = UI.getUse(); 3112 std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0); 3113 LF.LUIdx = P.first; 3114 LF.Offset = P.second; 3115 LSRUse &LU = Uses[LF.LUIdx]; 3116 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); 3117 if (!LU.WidestFixupType || 3118 SE.getTypeSizeInBits(LU.WidestFixupType) < 3119 SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) 3120 LU.WidestFixupType = LF.OperandValToReplace->getType(); 3121 InsertSupplementalFormula(U, LU, LF.LUIdx); 3122 CountRegisters(LU.Formulae.back(), Uses.size() - 1); 3123 break; 3124 } 3125 } 3126 } 3127} 3128 3129/// CollectSubexprs - Split S into subexpressions which can be pulled out into 3130/// separate registers. If C is non-null, multiply each subexpression by C. 3131/// 3132/// Return remainder expression after factoring the subexpressions captured by 3133/// Ops. If Ops is complete, return NULL. 3134static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, 3135 SmallVectorImpl<const SCEV *> &Ops, 3136 const Loop *L, 3137 ScalarEvolution &SE, 3138 unsigned Depth = 0) { 3139 // Arbitrarily cap recursion to protect compile time. 3140 if (Depth >= 3) 3141 return S; 3142 3143 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 3144 // Break out add operands. 3145 for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); 3146 I != E; ++I) { 3147 const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1); 3148 if (Remainder) 3149 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); 3150 } 3151 return 0; 3152 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 3153 // Split a non-zero base out of an addrec. 3154 if (AR->getStart()->isZero()) 3155 return S; 3156 3157 const SCEV *Remainder = CollectSubexprs(AR->getStart(), 3158 C, Ops, L, SE, Depth+1); 3159 // Split the non-zero AddRec unless it is part of a nested recurrence that 3160 // does not pertain to this loop. 3161 if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { 3162 Ops.push_back(C ? 
SE.getMulExpr(C, Remainder) : Remainder); 3163 Remainder = 0; 3164 } 3165 if (Remainder != AR->getStart()) { 3166 if (!Remainder) 3167 Remainder = SE.getConstant(AR->getType(), 0); 3168 return SE.getAddRecExpr(Remainder, 3169 AR->getStepRecurrence(SE), 3170 AR->getLoop(), 3171 //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) 3172 SCEV::FlagAnyWrap); 3173 } 3174 } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { 3175 // Break (C * (a + b + c)) into C*a + C*b + C*c. 3176 if (Mul->getNumOperands() != 2) 3177 return S; 3178 if (const SCEVConstant *Op0 = 3179 dyn_cast<SCEVConstant>(Mul->getOperand(0))) { 3180 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0; 3181 const SCEV *Remainder = 3182 CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); 3183 if (Remainder) 3184 Ops.push_back(SE.getMulExpr(C, Remainder)); 3185 return 0; 3186 } 3187 } 3188 return S; 3189} 3190 3191/// GenerateReassociations - Split out subexpressions from adds and the bases of 3192/// addrecs. 3193void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, 3194 Formula Base, 3195 unsigned Depth) { 3196 // Arbitrarily cap recursion to protect compile time. 3197 if (Depth >= 3) return; 3198 3199 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { 3200 const SCEV *BaseReg = Base.BaseRegs[i]; 3201 3202 SmallVector<const SCEV *, 8> AddOps; 3203 const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE); 3204 if (Remainder) 3205 AddOps.push_back(Remainder); 3206 3207 if (AddOps.size() == 1) continue; 3208 3209 for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), 3210 JE = AddOps.end(); J != JE; ++J) { 3211 3212 // Loop-variant "unknown" values are uninteresting; we won't be able to 3213 // do anything meaningful with them. 3214 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) 3215 continue; 3216 3217 // Don't pull a constant into a register if the constant could be folded 3218 // into an immediate field. 3219 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, 3220 LU.AccessTy, *J, Base.getNumRegs() > 1)) 3221 continue; 3222 3223 // Collect all operands except *J. 3224 SmallVector<const SCEV *, 8> InnerAddOps 3225 (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J); 3226 InnerAddOps.append 3227 (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end()); 3228 3229 // Don't leave just a constant behind in a register if the constant could 3230 // be folded into an immediate field. 3231 if (InnerAddOps.size() == 1 && 3232 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, 3233 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) 3234 continue; 3235 3236 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); 3237 if (InnerSum->isZero()) 3238 continue; 3239 Formula F = Base; 3240 3241 // Add the remaining pieces of the add back into the new formula. 3242 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); 3243 if (InnerSumSC && 3244 SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && 3245 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + 3246 InnerSumSC->getValue()->getZExtValue())) { 3247 F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + 3248 InnerSumSC->getValue()->getZExtValue(); 3249 F.BaseRegs.erase(F.BaseRegs.begin() + i); 3250 } else 3251 F.BaseRegs[i] = InnerSum; 3252 3253 // Add J as its own register, or an unfolded immediate. 
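      // For example, if *J is the constant 16 and the target's add-immediate
      // range can absorb UnfoldedOffset + 16, the constant is folded into
      // UnfoldedOffset below instead of occupying a base register of its own.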
3254       const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3255       if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3256           TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3257                                   SC->getValue()->getZExtValue()))
3258         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
3259                            SC->getValue()->getZExtValue();
3260       else
3261         F.BaseRegs.push_back(*J);
3262
3263       if (InsertFormula(LU, LUIdx, F))
3264         // If that formula hadn't been seen before, recurse to find more like
3265         // it.
3266         GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
3267     }
3268   }
3269 }
3270
3271 /// GenerateCombinations - Generate a formula consisting of all of the
3272 /// loop-dominating registers added into a single register.
3273 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
3274                                        Formula Base) {
3275   // This method is only interesting on a plurality of registers.
3276   if (Base.BaseRegs.size() <= 1) return;
3277
3278   Formula F = Base;
3279   F.BaseRegs.clear();
3280   SmallVector<const SCEV *, 4> Ops;
3281   for (SmallVectorImpl<const SCEV *>::const_iterator
3282        I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
3283     const SCEV *BaseReg = *I;
3284     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
3285         !SE.hasComputableLoopEvolution(BaseReg, L))
3286       Ops.push_back(BaseReg);
3287     else
3288       F.BaseRegs.push_back(BaseReg);
3289   }
3290   if (Ops.size() > 1) {
3291     const SCEV *Sum = SE.getAddExpr(Ops);
3292     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
3293     // opportunity to fold something. For now, just ignore such cases
3294     // rather than proceed with zero in a register.
3295     if (!Sum->isZero()) {
3296       F.BaseRegs.push_back(Sum);
3297       (void)InsertFormula(LU, LUIdx, F);
3298     }
3299   }
3300 }
3301
3302 /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
3303 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
3304                                           Formula Base) {
3305   // We can't add a symbolic offset if the address already contains one.
3306   if (Base.BaseGV) return;
3307
3308   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
3309     const SCEV *G = Base.BaseRegs[i];
3310     GlobalValue *GV = ExtractSymbol(G, SE);
3311     if (G->isZero() || !GV)
3312       continue;
3313     Formula F = Base;
3314     F.BaseGV = GV;
3315     if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3316       continue;
3317     F.BaseRegs[i] = G;
3318     (void)InsertFormula(LU, LUIdx, F);
3319   }
3320 }
3321
3322 /// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
3323 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
3324                                           Formula Base) {
3325   // TODO: For now, just add the min and max offset, because it usually isn't
3326   // worthwhile looking at everything in between.
3327   SmallVector<int64_t, 2> Worklist;
3328   Worklist.push_back(LU.MinOffset);
3329   if (LU.MaxOffset != LU.MinOffset)
3330     Worklist.push_back(LU.MaxOffset);
3331
3332   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
3333     const SCEV *G = Base.BaseRegs[i];
3334
3335     for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
3336          E = Worklist.end(); I != E; ++I) {
3337       Formula F = Base;
3338       F.BaseOffset = (uint64_t)Base.BaseOffset - *I;
3339       if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind,
3340                      LU.AccessTy, F)) {
3341         // Add the offset to the base register.
3342         const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
3343         // If it cancelled out, drop the base register, otherwise update it.
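        // For example, if G is the constant -8 and the offset *I being tried
        // is 8, NewG = (8 + -8) folds to zero: the register's value has moved
        // entirely into the folded offset, so the register can be dropped.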
3344 if (NewG->isZero()) { 3345 std::swap(F.BaseRegs[i], F.BaseRegs.back()); 3346 F.BaseRegs.pop_back(); 3347 } else 3348 F.BaseRegs[i] = NewG; 3349 3350 (void)InsertFormula(LU, LUIdx, F); 3351 } 3352 } 3353 3354 int64_t Imm = ExtractImmediate(G, SE); 3355 if (G->isZero() || Imm == 0) 3356 continue; 3357 Formula F = Base; 3358 F.BaseOffset = (uint64_t)F.BaseOffset + Imm; 3359 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) 3360 continue; 3361 F.BaseRegs[i] = G; 3362 (void)InsertFormula(LU, LUIdx, F); 3363 } 3364} 3365 3366/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up 3367/// the comparison. For example, x == y -> x*c == y*c. 3368void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, 3369 Formula Base) { 3370 if (LU.Kind != LSRUse::ICmpZero) return; 3371 3372 // Determine the integer type for the base formula. 3373 Type *IntTy = Base.getType(); 3374 if (!IntTy) return; 3375 if (SE.getTypeSizeInBits(IntTy) > 64) return; 3376 3377 // Don't do this if there is more than one offset. 3378 if (LU.MinOffset != LU.MaxOffset) return; 3379 3380 assert(!Base.BaseGV && "ICmpZero use is not legal!"); 3381 3382 // Check each interesting stride. 3383 for (SmallSetVector<int64_t, 8>::const_iterator 3384 I = Factors.begin(), E = Factors.end(); I != E; ++I) { 3385 int64_t Factor = *I; 3386 3387 // Check that the multiplication doesn't overflow. 3388 if (Base.BaseOffset == INT64_MIN && Factor == -1) 3389 continue; 3390 int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor; 3391 if (NewBaseOffset / Factor != Base.BaseOffset) 3392 continue; 3393 3394 // Check that multiplying with the use offset doesn't overflow. 3395 int64_t Offset = LU.MinOffset; 3396 if (Offset == INT64_MIN && Factor == -1) 3397 continue; 3398 Offset = (uint64_t)Offset * Factor; 3399 if (Offset / Factor != LU.MinOffset) 3400 continue; 3401 3402 Formula F = Base; 3403 F.BaseOffset = NewBaseOffset; 3404 3405 // Check that this scale is legal. 3406 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F)) 3407 continue; 3408 3409 // Compensate for the use having MinOffset built into it. 3410 F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset; 3411 3412 const SCEV *FactorS = SE.getConstant(IntTy, Factor); 3413 3414 // Check that multiplying with each base register doesn't overflow. 3415 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) { 3416 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS); 3417 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i]) 3418 goto next; 3419 } 3420 3421 // Check that multiplying with the scaled register doesn't overflow. 3422 if (F.ScaledReg) { 3423 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS); 3424 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg) 3425 continue; 3426 } 3427 3428 // Check that multiplying with the unfolded offset doesn't overflow. 3429 if (F.UnfoldedOffset != 0) { 3430 if (F.UnfoldedOffset == INT64_MIN && Factor == -1) 3431 continue; 3432 F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; 3433 if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) 3434 continue; 3435 } 3436 3437 // If we make it here and it's legal, add it. 3438 (void)InsertFormula(LU, LUIdx, F); 3439 next:; 3440 } 3441} 3442 3443/// GenerateScales - Generate stride factor reuse formulae by making use of 3444/// scaled-offset address modes, for example. 3445void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { 3446 // Determine the integer type for the base formula. 
3447   Type *IntTy = Base.getType();
3448   if (!IntTy) return;
3449
3450   // If this Formula already has a scaled register, we can't add another one.
3451   if (Base.Scale != 0) return;
3452
3453   // Check each interesting stride.
3454   for (SmallSetVector<int64_t, 8>::const_iterator
3455        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
3456     int64_t Factor = *I;
3457
3458     Base.Scale = Factor;
3459     Base.HasBaseReg = Base.BaseRegs.size() > 1;
3460     // Check whether this scale is going to be legal.
3461     if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
3462                     Base)) {
3463       // As a special case, Basic users whose fixups all lie outside the loop
3464       // may be treated as Special instead. TODO: Reconsider this special case.
3465       if (LU.Kind == LSRUse::Basic &&
3466           isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
3467                      LU.AccessTy, Base) &&
3468           LU.AllFixupsOutsideLoop)
3469         LU.Kind = LSRUse::Special;
3470       else
3471         continue;
3472     }
3473     // For an ICmpZero, negating a solitary base register won't lead to
3474     // new solutions.
3475     if (LU.Kind == LSRUse::ICmpZero &&
3476         !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
3477       continue;
3478     // For each addrec base reg, apply the scale, if possible.
3479     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3480       if (const SCEVAddRecExpr *AR =
3481             dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
3482         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3483         if (FactorS->isZero())
3484           continue;
3485         // Divide out the factor, ignoring high bits, since we'll be
3486         // scaling the value back up in the end.
3487         if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
3488           // TODO: This could be optimized to avoid all the copying.
3489           Formula F = Base;
3490           F.ScaledReg = Quotient;
3491           F.DeleteBaseReg(F.BaseRegs[i]);
3492           (void)InsertFormula(LU, LUIdx, F);
3493         }
3494       }
3495   }
3496 }
3497
3498 /// GenerateTruncates - Generate reuse formulae from different IV types.
3499 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
3500   // Don't bother truncating symbolic values.
3501   if (Base.BaseGV) return;
3502
3503   // Determine the integer type for the base formula.
3504   Type *DstTy = Base.getType();
3505   if (!DstTy) return;
3506   DstTy = SE.getEffectiveSCEVType(DstTy);
3507
3508   for (SmallSetVector<Type *, 4>::const_iterator
3509        I = Types.begin(), E = Types.end(); I != E; ++I) {
3510     Type *SrcTy = *I;
3511     if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
3512       Formula F = Base;
3513
3514       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
3515       for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
3516            JE = F.BaseRegs.end(); J != JE; ++J)
3517         *J = SE.getAnyExtendExpr(*J, SrcTy);
3518
3519       // TODO: This assumes we've done basic processing on all uses and
3520       // have an idea what the register usage is.
3521       if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
3522         continue;
3523
3524       (void)InsertFormula(LU, LUIdx, F);
3525     }
3526   }
3527 }
3528
3529 namespace {
3530
3531 /// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
3532 /// defer modifications so that the search phase doesn't have to worry about
3533 /// the data structures moving underneath it.
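/// For example, if use 3 has formulae built on both reg(A) and reg(A + 8),
/// a WorkItem (LUIdx=3, Imm=8, OrigReg=(A + 8)) records that use 3's
/// formulae referencing (A + 8) may be restated as ((A + 8) - 8), i.e. A,
/// with 8 added to the immediate, letting the uses share the register A.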
3534 struct WorkItem {
3535   size_t LUIdx;
3536   int64_t Imm;
3537   const SCEV *OrigReg;
3538
3539   WorkItem(size_t LI, int64_t I, const SCEV *R)
3540     : LUIdx(LI), Imm(I), OrigReg(R) {}
3541
3542   void print(raw_ostream &OS) const;
3543   void dump() const;
3544 };
3545
3546 }
3547
3548 void WorkItem::print(raw_ostream &OS) const {
3549   OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
3550      << ", add offset " << Imm;
3551 }
3552
3553 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3554 void WorkItem::dump() const {
3555   print(errs()); errs() << '\n';
3556 }
3557 #endif
3558
3559 /// GenerateCrossUseConstantOffsets - Look for registers which are a constant
3560 /// distance apart and try to form reuse opportunities between them.
3561 void LSRInstance::GenerateCrossUseConstantOffsets() {
3562   // Group the registers by their value without any added constant offset.
3563   typedef std::map<int64_t, const SCEV *> ImmMapTy;
3564   typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
3565   RegMapTy Map;
3566   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
3567   SmallVector<const SCEV *, 8> Sequence;
3568   for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
3569        I != E; ++I) {
3570     const SCEV *Reg = *I;
3571     int64_t Imm = ExtractImmediate(Reg, SE);
3572     std::pair<RegMapTy::iterator, bool> Pair =
3573       Map.insert(std::make_pair(Reg, ImmMapTy()));
3574     if (Pair.second)
3575       Sequence.push_back(Reg);
3576     Pair.first->second.insert(std::make_pair(Imm, *I));
3577     UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
3578   }
3579
3580   // Now examine each set of registers with the same base value. Build up
3581   // a list of work to do and do the work in a separate step so that we're
3582   // not adding formulae and register counts while we're searching.
3583   SmallVector<WorkItem, 32> WorkItems;
3584   SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
3585   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
3586        E = Sequence.end(); I != E; ++I) {
3587     const SCEV *Reg = *I;
3588     const ImmMapTy &Imms = Map.find(Reg)->second;
3589
3590     // It's not worthwhile looking for reuse if there's only one offset.
3591     if (Imms.size() == 1)
3592       continue;
3593
3594     DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
3595           for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
3596                J != JE; ++J)
3597             dbgs() << ' ' << J->first;
3598           dbgs() << '\n');
3599
3600     // Examine each offset.
3601     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
3602          J != JE; ++J) {
3603       const SCEV *OrigReg = J->second;
3604
3605       int64_t JImm = J->first;
3606       const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
3607
3608       if (!isa<SCEVConstant>(OrigReg) &&
3609           UsedByIndicesMap[Reg].count() == 1) {
3610         DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
3611         continue;
3612       }
3613
3614       // Conservatively examine offsets between this orig reg and a few
3615       // selected other orig regs.
3616       ImmMapTy::const_iterator OtherImms[] = {
3617         Imms.begin(), prior(Imms.end()),
3618         Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
3619       };
3620       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
3621         ImmMapTy::const_iterator M = OtherImms[i];
3622         if (M == J || M == JE) continue;
3623
3624         // Compute the difference between the two.
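        // For example, if offsets {0, 64, 128} were all seen for one base,
        // pairing J at 128 with M at 64 yields Imm = 64, queueing work to
        // restate J's register in terms of the register 64 below it.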
3625 int64_t Imm = (uint64_t)JImm - M->first; 3626 for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1; 3627 LUIdx = UsedByIndices.find_next(LUIdx)) 3628 // Make a memo of this use, offset, and register tuple. 3629 if (UniqueItems.insert(std::make_pair(LUIdx, Imm))) 3630 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg)); 3631 } 3632 } 3633 } 3634 3635 Map.clear(); 3636 Sequence.clear(); 3637 UsedByIndicesMap.clear(); 3638 UniqueItems.clear(); 3639 3640 // Now iterate through the worklist and add new formulae. 3641 for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(), 3642 E = WorkItems.end(); I != E; ++I) { 3643 const WorkItem &WI = *I; 3644 size_t LUIdx = WI.LUIdx; 3645 LSRUse &LU = Uses[LUIdx]; 3646 int64_t Imm = WI.Imm; 3647 const SCEV *OrigReg = WI.OrigReg; 3648 3649 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); 3650 const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); 3651 unsigned BitWidth = SE.getTypeSizeInBits(IntTy); 3652 3653 // TODO: Use a more targeted data structure. 3654 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { 3655 const Formula &F = LU.Formulae[L]; 3656 // Use the immediate in the scaled register. 3657 if (F.ScaledReg == OrigReg) { 3658 int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; 3659 // Don't create 50 + reg(-50). 3660 if (F.referencesReg(SE.getSCEV( 3661 ConstantInt::get(IntTy, -(uint64_t)Offset)))) 3662 continue; 3663 Formula NewF = F; 3664 NewF.BaseOffset = Offset; 3665 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, 3666 NewF)) 3667 continue; 3668 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg); 3669 3670 // If the new scale is a constant in a register, and adding the constant 3671 // value to the immediate would produce a value closer to zero than the 3672 // immediate itself, then the formula isn't worthwhile. 3673 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) 3674 if (C->getValue()->isNegative() != 3675 (NewF.BaseOffset < 0) && 3676 (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) 3677 .ule(abs64(NewF.BaseOffset))) 3678 continue; 3679 3680 // OK, looks good. 3681 (void)InsertFormula(LU, LUIdx, NewF); 3682 } else { 3683 // Use the immediate in a base register. 3684 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) { 3685 const SCEV *BaseReg = F.BaseRegs[N]; 3686 if (BaseReg != OrigReg) 3687 continue; 3688 Formula NewF = F; 3689 NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm; 3690 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, 3691 LU.Kind, LU.AccessTy, NewF)) { 3692 if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) 3693 continue; 3694 NewF = F; 3695 NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; 3696 } 3697 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); 3698 3699 // If the new formula has a constant in a register, and adding the 3700 // constant value to the immediate would produce a value closer to 3701 // zero than the immediate itself, then the formula isn't worthwhile. 3702 for (SmallVectorImpl<const SCEV *>::const_iterator 3703 J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end(); 3704 J != JE; ++J) 3705 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J)) 3706 if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( 3707 abs64(NewF.BaseOffset)) && 3708 (C->getValue()->getValue() + 3709 NewF.BaseOffset).countTrailingZeros() >= 3710 countTrailingZeros<uint64_t>(NewF.BaseOffset)) 3711 goto skip_formula; 3712 3713 // Ok, looks good. 
3714 (void)InsertFormula(LU, LUIdx, NewF); 3715 break; 3716 skip_formula:; 3717 } 3718 } 3719 } 3720 } 3721} 3722 3723/// GenerateAllReuseFormulae - Generate formulae for each use. 3724void 3725LSRInstance::GenerateAllReuseFormulae() { 3726 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan 3727 // queries are more precise. 3728 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 3729 LSRUse &LU = Uses[LUIdx]; 3730 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 3731 GenerateReassociations(LU, LUIdx, LU.Formulae[i]); 3732 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 3733 GenerateCombinations(LU, LUIdx, LU.Formulae[i]); 3734 } 3735 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 3736 LSRUse &LU = Uses[LUIdx]; 3737 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 3738 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]); 3739 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 3740 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]); 3741 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 3742 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]); 3743 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 3744 GenerateScales(LU, LUIdx, LU.Formulae[i]); 3745 } 3746 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 3747 LSRUse &LU = Uses[LUIdx]; 3748 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 3749 GenerateTruncates(LU, LUIdx, LU.Formulae[i]); 3750 } 3751 3752 GenerateCrossUseConstantOffsets(); 3753 3754 DEBUG(dbgs() << "\n" 3755 "After generating reuse formulae:\n"; 3756 print_uses(dbgs())); 3757} 3758 3759/// If there are multiple formulae with the same set of registers used 3760/// by other uses, pick the best one and delete the others. 3761void LSRInstance::FilterOutUndesirableDedicatedRegisters() { 3762 DenseSet<const SCEV *> VisitedRegs; 3763 SmallPtrSet<const SCEV *, 16> Regs; 3764 SmallPtrSet<const SCEV *, 16> LoserRegs; 3765#ifndef NDEBUG 3766 bool ChangedFormulae = false; 3767#endif 3768 3769 // Collect the best formula for each unique set of shared registers. This 3770 // is reset for each use. 3771 typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo> 3772 BestFormulaeTy; 3773 BestFormulaeTy BestFormulae; 3774 3775 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 3776 LSRUse &LU = Uses[LUIdx]; 3777 DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n'); 3778 3779 bool Any = false; 3780 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); 3781 FIdx != NumForms; ++FIdx) { 3782 Formula &F = LU.Formulae[FIdx]; 3783 3784 // Some formulas are instant losers. For example, they may depend on 3785 // nonexistent AddRecs from other loops. These need to be filtered 3786 // immediately, otherwise heuristics could choose them over others leading 3787 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here 3788 // avoids the need to recompute this information across formulae using the 3789 // same bad AddRec. Passing LoserRegs is also essential unless we remove 3790 // the corresponding bad register from the Regs set. 3791 Cost CostF; 3792 Regs.clear(); 3793 CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU, 3794 &LoserRegs); 3795 if (CostF.isLoser()) { 3796 // During initial formula generation, undesirable formulae are generated 3797 // by uses within other loops that have some non-trivial address mode or 3798 // use the postinc form of the IV. 
// LSR needs to provide these formulae
3799         // as the basis of rediscovering the desired formula that uses an AddRec
3800         // corresponding to the existing phi. Once all formulae have been
3801         // generated, these initial losers may be pruned.
3802         DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
3803               dbgs() << "\n");
3804       }
3805       else {
3806         SmallVector<const SCEV *, 4> Key;
3807         for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
3808              JE = F.BaseRegs.end(); J != JE; ++J) {
3809           const SCEV *Reg = *J;
3810           if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
3811             Key.push_back(Reg);
3812         }
3813         if (F.ScaledReg &&
3814             RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
3815           Key.push_back(F.ScaledReg);
3816         // Unstable sort by host order ok, because this is only used for
3817         // uniquifying.
3818         std::sort(Key.begin(), Key.end());
3819
3820         std::pair<BestFormulaeTy::const_iterator, bool> P =
3821           BestFormulae.insert(std::make_pair(Key, FIdx));
3822         if (P.second)
3823           continue;
3824
3825         Formula &Best = LU.Formulae[P.first->second];
3826
3827         Cost CostBest;
3828         Regs.clear();
3829         CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
3830                              DT, LU);
3831         if (CostF < CostBest)
3832           std::swap(F, Best);
3833         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
3834               dbgs() << "\n"
3835                         "    in favor of formula "; Best.print(dbgs());
3836               dbgs() << '\n');
3837       }
3838 #ifndef NDEBUG
3839       ChangedFormulae = true;
3840 #endif
3841       LU.DeleteFormula(F);
3842       --FIdx;
3843       --NumForms;
3844       Any = true;
3845     }
3846
3847     // Now that we've filtered out some formulae, recompute the Regs set.
3848     if (Any)
3849       LU.RecomputeRegs(LUIdx, RegUses);
3850
3851     // Reset this to prepare for the next use.
3852     BestFormulae.clear();
3853   }
3854
3855   DEBUG(if (ChangedFormulae) {
3856           dbgs() << "\n"
3857                     "After filtering out undesirable candidates:\n";
3858           print_uses(dbgs());
3859         });
3860 }
3861
3862 // This is a rough guess that seems to work fairly well.
3863 static const size_t ComplexityLimit = UINT16_MAX;
3864
3865 /// EstimateSearchSpaceComplexity - Estimate the worst-case number of
3866 /// solutions the solver might have to consider. It almost never considers
3867 /// this many solutions because it prunes the search space, but the pruning
3868 /// isn't always sufficient.
3869 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
3870   size_t Power = 1;
3871   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3872        E = Uses.end(); I != E; ++I) {
3873     size_t FSize = I->Formulae.size();
3874     if (FSize >= ComplexityLimit) {
3875       Power = ComplexityLimit;
3876       break;
3877     }
3878     Power *= FSize;
3879     if (Power >= ComplexityLimit)
3880       break;
3881   }
3882   return Power;
3883 }
3884
3885 /// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
3886 /// of the registers of another formula, it won't help reduce register
3887 /// pressure (though it may not necessarily hurt register pressure); remove
3888 /// it to simplify the system.
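/// For example, reg(A) + reg(C), where C is a loop-invariant constant, uses
/// a superset of the registers of reg(A) with C folded into the immediate;
/// when the use already has the latter form, the former can be deleted.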
3889 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
3890   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3891     DEBUG(dbgs() << "The search space is too complex.\n");
3892
3893     DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
3894                     "which use a superset of registers used by other "
3895                     "formulae.\n");
3896
3897     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3898       LSRUse &LU = Uses[LUIdx];
3899       bool Any = false;
3900       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
3901         Formula &F = LU.Formulae[i];
3902         // Look for a formula with a constant or GV in a register. If the use
3903         // also has a formula with that same value in an immediate field,
3904         // delete the one that uses a register.
3905         for (SmallVectorImpl<const SCEV *>::const_iterator
3906              I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
3907           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
3908             Formula NewF = F;
3909             NewF.BaseOffset += C->getValue()->getSExtValue();
3910             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
3911                                 (I - F.BaseRegs.begin()));
3912             if (LU.HasFormulaWithSameRegs(NewF)) {
3913               DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
3914               LU.DeleteFormula(F);
3915               --i;
3916               --e;
3917               Any = true;
3918               break;
3919             }
3920           } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
3921             if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
3922               if (!F.BaseGV) {
3923                 Formula NewF = F;
3924                 NewF.BaseGV = GV;
3925                 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
3926                                     (I - F.BaseRegs.begin()));
3927                 if (LU.HasFormulaWithSameRegs(NewF)) {
3928                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
3929                         dbgs() << '\n');
3930                   LU.DeleteFormula(F);
3931                   --i;
3932                   --e;
3933                   Any = true;
3934                   break;
3935                 }
3936               }
3937           }
3938         }
3939       }
3940       if (Any)
3941         LU.RecomputeRegs(LUIdx, RegUses);
3942     }
3943
3944     DEBUG(dbgs() << "After pre-selection:\n";
3945           print_uses(dbgs()));
3946   }
3947 }
3948
3949 /// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers
3950 /// for expressions like A, A+1, A+2, etc., allocate a single register for
3951 /// them.
3952 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
3953   if (EstimateSearchSpaceComplexity() < ComplexityLimit)
3954     return;
3955
3956   DEBUG(dbgs() << "The search space is too complex.\n"
3957                   "Narrowing the search space by assuming that uses separated "
3958                   "by a constant offset will use the same registers.\n");
3959
3960   // This is especially useful for unrolled loops.
3961
3962   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3963     LSRUse &LU = Uses[LUIdx];
3964     for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
3965          E = LU.Formulae.end(); I != E; ++I) {
3966       const Formula &F = *I;
3967       if (F.BaseOffset == 0 || F.Scale != 0)
3968         continue;
3969
3970       LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
3971       if (!LUThatHas)
3972         continue;
3973
3974       if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
3975                               LU.Kind, LU.AccessTy))
3976         continue;
3977
3978       DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
3979
3980       LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
3981
3982       // Update the fixups to reference the new use.
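      // DeleteUse below fills the vacated slot by swapping the last use into
      // position LUIdx, so fixups that already point at use NumUses-1 are
      // redirected here as well to stay correct after the swap.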
3983       for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
3984            E = Fixups.end(); I != E; ++I) {
3985         LSRFixup &Fixup = *I;
3986         if (Fixup.LUIdx == LUIdx) {
3987           Fixup.LUIdx = LUThatHas - &Uses.front();
3988           Fixup.Offset += F.BaseOffset;
3989           // Add the new offset to LUThatHas' offset list.
3990           if (LUThatHas->Offsets.back() != Fixup.Offset) {
3991             LUThatHas->Offsets.push_back(Fixup.Offset);
3992             if (Fixup.Offset > LUThatHas->MaxOffset)
3993               LUThatHas->MaxOffset = Fixup.Offset;
3994             if (Fixup.Offset < LUThatHas->MinOffset)
3995               LUThatHas->MinOffset = Fixup.Offset;
3996           }
3997           DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
3998         }
3999         if (Fixup.LUIdx == NumUses-1)
4000           Fixup.LUIdx = LUIdx;
4001       }
4002
4003       // Delete formulae from the new use which are no longer legal.
4004       bool Any = false;
4005       for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4006         Formula &F = LUThatHas->Formulae[i];
4007         if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4008                         LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4009           DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
4010                 dbgs() << '\n');
4011           LUThatHas->DeleteFormula(F);
4012           --i;
4013           --e;
4014           Any = true;
4015         }
4016       }
4017
4018       if (Any)
4019         LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4020
4021       // Delete the old use.
4022       DeleteUse(LU, LUIdx);
4023       --LUIdx;
4024       --NumUses;
4025       break;
4026     }
4027   }
4028
4029   DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4030 }
4031
4032 /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
4033 /// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4034 /// we've done more filtering, as it may be able to find more formulae to
4035 /// eliminate.
4036 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4037   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4038     DEBUG(dbgs() << "The search space is too complex.\n");
4039
4040     DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4041                     "undesirable dedicated registers.\n");
4042
4043     FilterOutUndesirableDedicatedRegisters();
4044
4045     DEBUG(dbgs() << "After pre-selection:\n";
4046           print_uses(dbgs()));
4047   }
4048 }
4049
4050 /// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
4051 /// to be profitable, and then in any use which has any reference to that
4052 /// register, delete all formulae which do not reference that register.
4053 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
4054   // With all other options exhausted, loop until the system is simple
4055   // enough to handle.
4056   SmallPtrSet<const SCEV *, 4> Taken;
4057   while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4058     // Ok, we have too many formulae on our hands to conveniently handle.
4059     // Use a rough heuristic to thin out the list.
4060     DEBUG(dbgs() << "The search space is too complex.\n");
4061
4062     // Pick the register which is used by the most LSRUses, which is likely
4063     // to be a good reuse register candidate.
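    // This is a greedy choice: committing to the most widely shared register
    // first lets the largest number of uses prune, in one step, every formula
    // that fails to mention it.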
4064     const SCEV *Best = 0;
4065     unsigned BestNum = 0;
4066     for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
4067          I != E; ++I) {
4068       const SCEV *Reg = *I;
4069       if (Taken.count(Reg))
4070         continue;
4071       if (!Best) {
4072         Best = Reg;
4073         BestNum = RegUses.getUsedByIndices(Reg).count(); // Seed the count.
4074       } else {
4075         unsigned Count = RegUses.getUsedByIndices(Reg).count();
4076         if (Count > BestNum) {
4077           Best = Reg;
4078           BestNum = Count;
4079         }
4080       }
4081     }
4082     DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
4083                  << " will yield profitable reuse.\n");
4084     Taken.insert(Best);
4085
4086     // In any use with formulae which reference this register, delete formulae
4087     // which don't reference it.
4088     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4089       LSRUse &LU = Uses[LUIdx];
4090       if (!LU.Regs.count(Best)) continue;
4091
4092       bool Any = false;
4093       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4094         Formula &F = LU.Formulae[i];
4095         if (!F.referencesReg(Best)) {
4096           DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
4097           LU.DeleteFormula(F);
4098           --e;
4099           --i;
4100           Any = true;
4101           assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
4102           continue;
4103         }
4104       }
4105
4106       if (Any)
4107         LU.RecomputeRegs(LUIdx, RegUses);
4108     }
4109
4110     DEBUG(dbgs() << "After pre-selection:\n";
4111           print_uses(dbgs()));
4112   }
4113 }
4114
4115 /// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
4116 /// formulae to choose from, use some rough heuristics to prune down the number
4117 /// of formulae. This keeps the main solver from taking an extraordinary amount
4118 /// of time in some worst-case scenarios.
4119 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
4120   NarrowSearchSpaceByDetectingSupersets();
4121   NarrowSearchSpaceByCollapsingUnrolledCode();
4122   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
4123   NarrowSearchSpaceByPickingWinnerRegs();
4124 }
4125
4126 /// SolveRecurse - This is the recursive solver.
4127 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
4128                                Cost &SolutionCost,
4129                                SmallVectorImpl<const Formula *> &Workspace,
4130                                const Cost &CurCost,
4131                                const SmallPtrSet<const SCEV *, 16> &CurRegs,
4132                                DenseSet<const SCEV *> &VisitedRegs) const {
4133   // Some ideas:
4134   //  - prune more:
4135   //    - use more aggressive filtering
4136   //    - sort the formula so that the most profitable solutions are found first
4137   //    - sort the uses too
4138   //  - search faster:
4139   //    - don't compute a cost, and then compare. compare while computing a cost
4140   //      and bail early.
4141   //    - track register sets with SmallBitVector
4142
4143   const LSRUse &LU = Uses[Workspace.size()];
4144
4145   // If this use references any register that's already a part of the
4146   // in-progress solution, consider it a requirement that a formula must
4147   // reference that register in order to be considered. This prunes out
4148   // unprofitable searching.
4149   SmallSetVector<const SCEV *, 4> ReqRegs;
4150   for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
4151        E = CurRegs.end(); I != E; ++I)
4152     if (LU.Regs.count(*I))
4153       ReqRegs.insert(*I);
4154
4155   SmallPtrSet<const SCEV *, 16> NewRegs;
4156   Cost NewCost;
4157   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
4158        E = LU.Formulae.end(); I != E; ++I) {
4159     const Formula &F = *I;
4160
4161     // Ignore formulae which do not use any of the required registers.
4162     bool SatisfiedReqReg = true;
4163     for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
4164          JE = ReqRegs.end(); J != JE; ++J) {
4165       const SCEV *Reg = *J;
4166       if ((!F.ScaledReg || F.ScaledReg != Reg) &&
4167           std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
4168           F.BaseRegs.end()) {
4169         SatisfiedReqReg = false;
4170         break;
4171       }
4172     }
4173     if (!SatisfiedReqReg) {
4174       // If none of the formulae satisfied the required registers, then we could
4175       // clear ReqRegs and try again. Currently, we simply give up in this case.
4176       continue;
4177     }
4178
4179     // Evaluate the cost of the current formula. If it's already worse than
4180     // the current best, prune the search at that point.
4181     NewCost = CurCost;
4182     NewRegs = CurRegs;
4183     NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
4184                         LU);
4185     if (NewCost < SolutionCost) {
4186       Workspace.push_back(&F);
4187       if (Workspace.size() != Uses.size()) {
4188         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
4189                      NewRegs, VisitedRegs);
4190         if (F.getNumRegs() == 1 && Workspace.size() == 1)
4191           VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
4192       } else {
4193         DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
4194               dbgs() << ".\n Regs:";
4195               for (SmallPtrSet<const SCEV *, 16>::const_iterator
4196                    I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
4197                 dbgs() << ' ' << **I;
4198               dbgs() << '\n');
4199
4200         SolutionCost = NewCost;
4201         Solution = Workspace;
4202       }
4203       Workspace.pop_back();
4204     }
4205   }
4206 }
4207
4208 /// Solve - Choose one formula from each use. Return the results in the given
4209 /// Solution vector.
4210 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
4211   SmallVector<const Formula *, 8> Workspace;
4212   Cost SolutionCost;
4213   SolutionCost.Loose();
4214   Cost CurCost;
4215   SmallPtrSet<const SCEV *, 16> CurRegs;
4216   DenseSet<const SCEV *> VisitedRegs;
4217   Workspace.reserve(Uses.size());
4218
4219   // SolveRecurse does all the work.
4220   SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
4221                CurRegs, VisitedRegs);
4222   if (Solution.empty()) {
4223     DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
4224     return;
4225   }
4226
4227   // Ok, we've now made all our decisions.
4228   DEBUG(dbgs() << "\n"
4229                   "The chosen solution requires "; SolutionCost.print(dbgs());
4230         dbgs() << ":\n";
4231         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
4232           dbgs() << "  ";
4233           Uses[i].print(dbgs());
4234           dbgs() << "\n"
4235                     "    ";
4236           Solution[i]->print(dbgs());
4237           dbgs() << '\n';
4238         });
4239
4240   assert(Solution.size() == Uses.size() && "Malformed solution!");
4241 }
4242
4243 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
4244 /// the dominator tree as far as we can go while still being dominated by the
4245 /// input positions. This helps canonicalize the insert position, which
4246 /// encourages sharing.
4247 BasicBlock::iterator
4248 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
4249                                  const SmallVectorImpl<Instruction *> &Inputs)
4250     const {
4251   for (;;) {
4252     const Loop *IPLoop = LI.getLoopFor(IP->getParent());
4253     unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
4254
4255     BasicBlock *IDom;
4256     for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
4257       if (!Rung) return IP;
4258       Rung = Rung->getIDom();
4259       if (!Rung) return IP;
4260       IDom = Rung->getBlock();
4261
4262       // Don't climb into a loop though.
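      // That is, accept a dominating block as the new tentative position only
      // if its loop depth is no greater than the current one's (and, at equal
      // depth, only within the same loop), so hoisting never drags the
      // expansion into a more deeply nested loop.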
4263       const Loop *IDomLoop = LI.getLoopFor(IDom);
4264       unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
4265       if (IDomDepth <= IPLoopDepth &&
4266           (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
4267         break;
4268     }
4269
4270     bool AllDominate = true;
4271     Instruction *BetterPos = 0;
4272     Instruction *Tentative = IDom->getTerminator();
4273     for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
4274          E = Inputs.end(); I != E; ++I) {
4275       Instruction *Inst = *I;
4276       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
4277         AllDominate = false;
4278         break;
4279       }
4280       // Attempt to find an insert position in the middle of the block,
4281       // instead of at the end, so that it can be used for other expansions.
4282       if (IDom == Inst->getParent() &&
4283           (!BetterPos || !DT.dominates(Inst, BetterPos)))
4284         BetterPos = llvm::next(BasicBlock::iterator(Inst));
4285     }
4286     if (!AllDominate)
4287       break;
4288     if (BetterPos)
4289       IP = BetterPos;
4290     else
4291       IP = Tentative;
4292   }
4293
4294   return IP;
4295 }
4296
4297 /// AdjustInsertPositionForExpand - Determine an input position which will be
4298 /// dominated by the operands and which will dominate the result.
4299 BasicBlock::iterator
4300 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
4301                                            const LSRFixup &LF,
4302                                            const LSRUse &LU,
4303                                            SCEVExpander &Rewriter) const {
4304   // Collect some instructions which must be dominated by the
4305   // expanding replacement. These must be dominated by any operands that
4306   // will be required in the expansion.
4307   SmallVector<Instruction *, 4> Inputs;
4308   if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
4309     Inputs.push_back(I);
4310   if (LU.Kind == LSRUse::ICmpZero)
4311     if (Instruction *I =
4312           dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
4313       Inputs.push_back(I);
4314   if (LF.PostIncLoops.count(L)) {
4315     if (LF.isUseFullyOutsideLoop(L))
4316       Inputs.push_back(L->getLoopLatch()->getTerminator());
4317     else
4318       Inputs.push_back(IVIncInsertPos);
4319   }
4320   // The expansion must also be dominated by the increment positions of any
4321   // loops for which it is using post-inc mode.
4322   for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
4323        E = LF.PostIncLoops.end(); I != E; ++I) {
4324     const Loop *PIL = *I;
4325     if (PIL == L) continue;
4326
4327     // Be dominated by the loop exit.
4328     SmallVector<BasicBlock *, 4> ExitingBlocks;
4329     PIL->getExitingBlocks(ExitingBlocks);
4330     if (!ExitingBlocks.empty()) {
4331       BasicBlock *BB = ExitingBlocks[0];
4332       for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
4333         BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
4334       Inputs.push_back(BB->getTerminator());
4335     }
4336   }
4337
4338   assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP)
4339          && !isa<DbgInfoIntrinsic>(LowestIP) &&
4340          "Insertion point must be a normal instruction");
4341
4342   // Then, climb up the immediate dominator tree as far as we can go while
4343   // still being dominated by the input positions.
4344   BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
4345
4346   // Don't insert instructions before PHI nodes.
4347   while (isa<PHINode>(IP)) ++IP;
4348
4349   // Ignore landingpad instructions.
4350   while (isa<LandingPadInst>(IP)) ++IP;
4351
4352   // Ignore debug intrinsics.
4353   while (isa<DbgInfoIntrinsic>(IP)) ++IP;
4354
4355   // Set IP below instructions recently inserted by SCEVExpander.
This keeps the 4356 // IP consistent across expansions and allows the previously inserted 4357 // instructions to be reused by subsequent expansion. 4358 while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP; 4359 4360 return IP; 4361} 4362 4363/// Expand - Emit instructions for the leading candidate expression for this 4364/// LSRUse (this is called "expanding"). 4365Value *LSRInstance::Expand(const LSRFixup &LF, 4366 const Formula &F, 4367 BasicBlock::iterator IP, 4368 SCEVExpander &Rewriter, 4369 SmallVectorImpl<WeakVH> &DeadInsts) const { 4370 const LSRUse &LU = Uses[LF.LUIdx]; 4371 if (LU.RigidFormula) 4372 return LF.OperandValToReplace; 4373 4374 // Determine an input position which will be dominated by the operands and 4375 // which will dominate the result. 4376 IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter); 4377 4378 // Inform the Rewriter if we have a post-increment use, so that it can 4379 // perform an advantageous expansion. 4380 Rewriter.setPostInc(LF.PostIncLoops); 4381 4382 // This is the type that the user actually needs. 4383 Type *OpTy = LF.OperandValToReplace->getType(); 4384 // This will be the type that we'll initially expand to. 4385 Type *Ty = F.getType(); 4386 if (!Ty) 4387 // No type known; just expand directly to the ultimate type. 4388 Ty = OpTy; 4389 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy)) 4390 // Expand directly to the ultimate type if it's the right size. 4391 Ty = OpTy; 4392 // This is the type to do integer arithmetic in. 4393 Type *IntTy = SE.getEffectiveSCEVType(Ty); 4394 4395 // Build up a list of operands to add together to form the full base. 4396 SmallVector<const SCEV *, 8> Ops; 4397 4398 // Expand the BaseRegs portion. 4399 for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), 4400 E = F.BaseRegs.end(); I != E; ++I) { 4401 const SCEV *Reg = *I; 4402 assert(!Reg->isZero() && "Zero allocated in a base register!"); 4403 4404 // If we're expanding for a post-inc user, make the post-inc adjustment. 4405 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); 4406 Reg = TransformForPostIncUse(Denormalize, Reg, 4407 LF.UserInst, LF.OperandValToReplace, 4408 Loops, SE, DT); 4409 4410 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); 4411 } 4412 4413 // Expand the ScaledReg portion. 4414 Value *ICmpScaledV = 0; 4415 if (F.Scale != 0) { 4416 const SCEV *ScaledS = F.ScaledReg; 4417 4418 // If we're expanding for a post-inc user, make the post-inc adjustment. 4419 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); 4420 ScaledS = TransformForPostIncUse(Denormalize, ScaledS, 4421 LF.UserInst, LF.OperandValToReplace, 4422 Loops, SE, DT); 4423 4424 if (LU.Kind == LSRUse::ICmpZero) { 4425 // An interesting way of "folding" with an icmp is to use a negated 4426 // scale, which we'll implement by inserting it into the other operand 4427 // of the icmp. 4428 assert(F.Scale == -1 && 4429 "The only scale supported by ICmpZero uses is -1!"); 4430 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP); 4431 } else { 4432 // Otherwise just expand the scaled register and an explicit scale, 4433 // which is expected to be matched as part of the address. 4434 4435 // Flush the operand list to suppress SCEVExpander hoisting address modes. 
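      // Expanding the partial sum now and wrapping it as an opaque value pins
      // it in place: the expander can no longer look through the adds, so it
      // won't hoist them to a dominating block where the scaled addressing
      // mode could not absorb them.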
4436 if (!Ops.empty() && LU.Kind == LSRUse::Address) { 4437 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); 4438 Ops.clear(); 4439 Ops.push_back(SE.getUnknown(FullV)); 4440 } 4441 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); 4442 ScaledS = SE.getMulExpr(ScaledS, 4443 SE.getConstant(ScaledS->getType(), F.Scale)); 4444 Ops.push_back(ScaledS); 4445 } 4446 } 4447 4448 // Expand the GV portion. 4449 if (F.BaseGV) { 4450 // Flush the operand list to suppress SCEVExpander hoisting. 4451 if (!Ops.empty()) { 4452 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); 4453 Ops.clear(); 4454 Ops.push_back(SE.getUnknown(FullV)); 4455 } 4456 Ops.push_back(SE.getUnknown(F.BaseGV)); 4457 } 4458 4459 // Flush the operand list to suppress SCEVExpander hoisting of both folded and 4460 // unfolded offsets. LSR assumes they both live next to their uses. 4461 if (!Ops.empty()) { 4462 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); 4463 Ops.clear(); 4464 Ops.push_back(SE.getUnknown(FullV)); 4465 } 4466 4467 // Expand the immediate portion. 4468 int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset; 4469 if (Offset != 0) { 4470 if (LU.Kind == LSRUse::ICmpZero) { 4471 // The other interesting way of "folding" with an ICmpZero is to use a 4472 // negated immediate. 4473 if (!ICmpScaledV) 4474 ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); 4475 else { 4476 Ops.push_back(SE.getUnknown(ICmpScaledV)); 4477 ICmpScaledV = ConstantInt::get(IntTy, Offset); 4478 } 4479 } else { 4480 // Just add the immediate values. These again are expected to be matched 4481 // as part of the address. 4482 Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset))); 4483 } 4484 } 4485 4486 // Expand the unfolded offset portion. 4487 int64_t UnfoldedOffset = F.UnfoldedOffset; 4488 if (UnfoldedOffset != 0) { 4489 // Just add the immediate values. 4490 Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, 4491 UnfoldedOffset))); 4492 } 4493 4494 // Emit instructions summing all the operands. 4495 const SCEV *FullS = Ops.empty() ? 4496 SE.getConstant(IntTy, 0) : 4497 SE.getAddExpr(Ops); 4498 Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); 4499 4500 // We're done expanding now, so reset the rewriter. 4501 Rewriter.clearPostInc(); 4502 4503 // An ICmpZero Formula represents an ICmp which we're handling as a 4504 // comparison against zero. Now that we've expanded an expression for that 4505 // form, update the ICmp's other operand. 4506 if (LU.Kind == LSRUse::ICmpZero) { 4507 ICmpInst *CI = cast<ICmpInst>(LF.UserInst); 4508 DeadInsts.push_back(CI->getOperand(1)); 4509 assert(!F.BaseGV && "ICmp does not support folding a global value and " 4510 "a scale at the same time!"); 4511 if (F.Scale == -1) { 4512 if (ICmpScaledV->getType() != OpTy) { 4513 Instruction *Cast = 4514 CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false, 4515 OpTy, false), 4516 ICmpScaledV, OpTy, "tmp", CI); 4517 ICmpScaledV = Cast; 4518 } 4519 CI->setOperand(1, ICmpScaledV); 4520 } else { 4521 assert(F.Scale == 0 && 4522 "ICmp does not support folding a global value and " 4523 "a scale at the same time!"); 4524 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), 4525 -(uint64_t)Offset); 4526 if (C->getType() != OpTy) 4527 C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, 4528 OpTy, false), 4529 C, OpTy); 4530 4531 CI->setOperand(1, C); 4532 } 4533 } 4534 4535 return FullV; 4536} 4537 4538/// RewriteForPHI - Helper for Rewrite. 
PHI nodes are special because the use 4539/// of their operands effectively happens in their predecessor blocks, so the 4540/// expression may need to be expanded in multiple places. 4541void LSRInstance::RewriteForPHI(PHINode *PN, 4542 const LSRFixup &LF, 4543 const Formula &F, 4544 SCEVExpander &Rewriter, 4545 SmallVectorImpl<WeakVH> &DeadInsts, 4546 Pass *P) const { 4547 DenseMap<BasicBlock *, Value *> Inserted; 4548 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) 4549 if (PN->getIncomingValue(i) == LF.OperandValToReplace) { 4550 BasicBlock *BB = PN->getIncomingBlock(i); 4551 4552 // If this is a critical edge, split the edge so that we do not insert 4553 // the code on all predecessor/successor paths. We do this unless this 4554 // is the canonical backedge for this loop, which complicates post-inc 4555 // users. 4556 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && 4557 !isa<IndirectBrInst>(BB->getTerminator())) { 4558 BasicBlock *Parent = PN->getParent(); 4559 Loop *PNLoop = LI.getLoopFor(Parent); 4560 if (!PNLoop || Parent != PNLoop->getHeader()) { 4561 // Split the critical edge. 4562 BasicBlock *NewBB = 0; 4563 if (!Parent->isLandingPad()) { 4564 NewBB = SplitCriticalEdge(BB, Parent, P, 4565 /*MergeIdenticalEdges=*/true, 4566 /*DontDeleteUselessPhis=*/true); 4567 } else { 4568 SmallVector<BasicBlock*, 2> NewBBs; 4569 SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); 4570 NewBB = NewBBs[0]; 4571 } 4572 // If NewBB==NULL, then SplitCriticalEdge refused to split because all 4573 // phi predecessors are identical. The simple thing to do is skip 4574 // splitting in this case rather than complicate the API. 4575 if (NewBB) { 4576 // If PN is outside of the loop and BB is in the loop, we want to 4577 // move the block to be immediately before the PHI block, not 4578 // immediately after BB. 4579 if (L->contains(BB) && !L->contains(PN)) 4580 NewBB->moveBefore(PN->getParent()); 4581 4582 // Splitting the edge can reduce the number of PHI entries we have. 4583 e = PN->getNumIncomingValues(); 4584 BB = NewBB; 4585 i = PN->getBasicBlockIndex(BB); 4586 } 4587 } 4588 } 4589 4590 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair = 4591 Inserted.insert(std::make_pair(BB, static_cast<Value *>(0))); 4592 if (!Pair.second) 4593 PN->setIncomingValue(i, Pair.first->second); 4594 else { 4595 Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); 4596 4597 // If this is reuse-by-noop-cast, insert the noop cast. 4598 Type *OpTy = LF.OperandValToReplace->getType(); 4599 if (FullV->getType() != OpTy) 4600 FullV = 4601 CastInst::Create(CastInst::getCastOpcode(FullV, false, 4602 OpTy, false), 4603 FullV, LF.OperandValToReplace->getType(), 4604 "tmp", BB->getTerminator()); 4605 4606 PN->setIncomingValue(i, FullV); 4607 Pair.first->second = FullV; 4608 } 4609 } 4610} 4611 4612/// Rewrite - Emit instructions for the leading candidate expression for this 4613/// LSRUse (this is called "expanding"), and update the UserInst to reference 4614/// the newly expanded value. 4615void LSRInstance::Rewrite(const LSRFixup &LF, 4616 const Formula &F, 4617 SCEVExpander &Rewriter, 4618 SmallVectorImpl<WeakVH> &DeadInsts, 4619 Pass *P) const { 4620 // First, find an insertion point that dominates UserInst. For PHI nodes, 4621 // find the nearest block which dominates all the relevant uses. 

/// Rewrite - Emit instructions for the leading candidate expression for this
/// LSRUse (this is called "expanding"), and update the UserInst to reference
/// the newly expanded value.
void LSRInstance::Rewrite(const LSRFixup &LF,
                          const Formula &F,
                          SCEVExpander &Rewriter,
                          SmallVectorImpl<WeakVH> &DeadInsts,
                          Pass *P) const {
  // First, find an insertion point that dominates UserInst. For PHI nodes,
  // find the nearest block which dominates all the relevant uses.
  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
  } else {
    Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);

    // If this is reuse-by-noop-cast, insert the noop cast.
    Type *OpTy = LF.OperandValToReplace->getType();
    if (FullV->getType() != OpTy) {
      Instruction *Cast =
        CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
                         FullV, OpTy, "tmp", LF.UserInst);
      FullV = Cast;
    }

    // Update the user. ICmpZero is handled specially here (for now) because
    // Expand may have updated one of the operands of the icmp already, and
    // its new value may happen to be equal to LF.OperandValToReplace, in
    // which case doing replaceUsesOfWith leads to replacing both operands
    // with the same value. TODO: Reorganize this.
    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
      LF.UserInst->setOperand(0, FullV);
    else
      LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
  }

  DeadInsts.push_back(LF.OperandValToReplace);
}

/// ImplementSolution - Rewrite all the fixup locations with new values,
/// following the chosen solution.
void
LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                               Pass *P) {
  // Keep track of instructions we may have made dead, so that
  // we can remove them after we are done working.
  SmallVector<WeakVH, 16> DeadInsts;

  SCEVExpander Rewriter(SE, "lsr");
#ifndef NDEBUG
  Rewriter.setDebugType(DEBUG_TYPE);
#endif
  Rewriter.disableCanonicalMode();
  Rewriter.enableLSRMode();
  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);

  // Mark phi nodes that terminate chains so the expander tries to reuse them.
  for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
       ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
    if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst()))
      Rewriter.setChainedPhi(PN);
  }

  // Expand the new value definitions and update the users.
  for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
       E = Fixups.end(); I != E; ++I) {
    const LSRFixup &Fixup = *I;

    Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P);

    Changed = true;
  }

  for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
       ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
    GenerateIVChain(*ChainI, Rewriter, DeadInsts);
    Changed = true;
  }
  // Clean up after ourselves. This must be done before deleting any
  // instructions.
  Rewriter.clear();

  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
}
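
/// LSRInstance - The constructor does all of the work: after bailing out on
/// loops that are malformed or too expensive to model, it collects the
/// interesting IV uses and candidate formulae, solves for a low-cost
/// assignment of one formula per use, and rewrites the IR to match.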
LSRInstance::LSRInstance(Loop *L, Pass *P)
  : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()),
    DT(P->getAnalysis<DominatorTree>()), LI(P->getAnalysis<LoopInfo>()),
    TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false),
    IVIncInsertPos(0) {
  // If LoopSimplify form is not available, stay out of trouble.
  if (!L->isLoopSimplifyForm())
    return;

  // If there's no interesting work to be done, bail early.
  if (IU.empty()) return;

  // If there's too much analysis to be done, bail early. We won't be able to
  // model the problem anyway.
  unsigned NumUsers = 0;
  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
    if (++NumUsers > MaxIVUsers) {
      DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << *L
            << "\n");
      return;
    }
  }

#ifndef NDEBUG
  // All dominating loops must have preheaders, or SCEVExpander may not be able
  // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
  //
  // IVUsers analysis should only create users that are dominated by simple loop
  // headers. Since this loop should dominate all of its users, its user list
  // should be empty if this loop itself is not within a simple loop nest.
  for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
       Rung; Rung = Rung->getIDom()) {
    BasicBlock *BB = Rung->getBlock();
    const Loop *DomLoop = LI.getLoopFor(BB);
    if (DomLoop && DomLoop->getHeader() == BB) {
      assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
    }
  }
#endif // NDEBUG

  DEBUG(dbgs() << "\nLSR on loop ";
        WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
        dbgs() << ":\n");

  // First, perform some low-level loop optimizations.
  OptimizeShadowIV();
  OptimizeLoopTermCond();

  // If loop preparation eliminates all interesting IV users, bail.
  if (IU.empty()) return;

  // Skip nested loops until we can model them better with formulae.
  if (!L->empty()) {
    DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
    return;
  }

  // Start collecting data and preparing for the solver.
  CollectChains();
  CollectInterestingTypesAndFactors();
  CollectFixupsAndInitialFormulae();
  CollectLoopInvariantFixupsAndFormulae();

  assert(!Uses.empty() && "IVUsers reported at least one use");
  DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
        print_uses(dbgs()));

  // Now use the reuse data to generate a bunch of interesting ways
  // to formulate the values needed for the uses.
  GenerateAllReuseFormulae();

  FilterOutUndesirableDedicatedRegisters();
  NarrowSearchSpaceUsingHeuristics();

  SmallVector<const Formula *, 8> Solution;
  Solve(Solution);

  // Release memory that is no longer needed.
  Factors.clear();
  Types.clear();
  RegUses.clear();

  if (Solution.empty())
    return;

#ifndef NDEBUG
  // Formulae should be legal.
  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), E = Uses.end();
       I != E; ++I) {
    const LSRUse &LU = *I;
    for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
         JE = LU.Formulae.end();
         J != JE; ++J)
      assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
                        *J) && "Illegal formula generated!");
  }
#endif
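
  // At this point the solver has committed to exactly one formula per use:
  // Solution[LUIdx] is the formula chosen for Uses[LUIdx]. For example, two
  // address uses might both be assigned formulae built on a shared register
  // such as {0,+,4}<%loop>, differing only in their immediate offsets, so
  // that a single IV serves both.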

  // Now that we've decided what we want, make it so.
  ImplementSolution(Solution, P);
}

void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
  if (Factors.empty() && Types.empty()) return;

  OS << "LSR has identified the following interesting factors and types: ";
  bool First = true;

  for (SmallSetVector<int64_t, 8>::const_iterator
       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
    if (!First) OS << ", ";
    First = false;
    OS << '*' << *I;
  }

  for (SmallSetVector<Type *, 4>::const_iterator
       I = Types.begin(), E = Types.end(); I != E; ++I) {
    if (!First) OS << ", ";
    First = false;
    OS << '(' << **I << ')';
  }
  OS << '\n';
}

void LSRInstance::print_fixups(raw_ostream &OS) const {
  OS << "LSR is examining the following fixup sites:\n";
  for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
       E = Fixups.end(); I != E; ++I) {
    OS << "  ";
    I->print(OS);
    OS << '\n';
  }
}

void LSRInstance::print_uses(raw_ostream &OS) const {
  OS << "LSR is examining the following uses:\n";
  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
       E = Uses.end(); I != E; ++I) {
    const LSRUse &LU = *I;
    OS << "  ";
    LU.print(OS);
    OS << '\n';
    for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
         JE = LU.Formulae.end(); J != JE; ++J) {
      OS << "    ";
      J->print(OS);
      OS << '\n';
    }
  }
}

void LSRInstance::print(raw_ostream &OS) const {
  print_factors_and_types(OS);
  print_fixups(OS);
  print_uses(OS);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRInstance::dump() const {
  print(errs()); errs() << '\n';
}
#endif

namespace {

class LoopStrengthReduce : public LoopPass {
public:
  static char ID; // Pass ID, replacement for typeid
  LoopStrengthReduce();

private:
  bool runOnLoop(Loop *L, LPPassManager &LPM);
  void getAnalysisUsage(AnalysisUsage &AU) const;
};

}

char LoopStrengthReduce::ID = 0;
INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
                "Loop Strength Reduction", false, false)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(IVUsers)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
                "Loop Strength Reduction", false, false)


Pass *llvm::createLoopStrengthReducePass() {
  return new LoopStrengthReduce();
}

LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
  initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
}

void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
  // We split critical edges, so we change the CFG. However, we do update
  // many analyses if they are around.
  AU.addPreservedID(LoopSimplifyID);

  AU.addRequired<LoopInfo>();
  AU.addPreserved<LoopInfo>();
  AU.addRequiredID(LoopSimplifyID);
  AU.addRequired<DominatorTree>();
  AU.addPreserved<DominatorTree>();
  AU.addRequired<ScalarEvolution>();
  AU.addPreserved<ScalarEvolution>();
  // Requiring LoopSimplify a second time here prevents IVUsers from running
  // twice, since LoopSimplify was invalidated by running ScalarEvolution.
  AU.addRequiredID(LoopSimplifyID);
  AU.addRequired<IVUsers>();
  AU.addPreserved<IVUsers>();
  AU.addRequired<TargetTransformInfo>();
}

bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
  bool Changed = false;

  // Run the main LSR transformation.
  Changed |= LSRInstance(L, this).getChanged();

  // Remove any extra phis created by processing inner loops.
  Changed |= DeleteDeadPHIs(L->getHeader());
  if (EnablePhiElim && L->isLoopSimplifyForm()) {
    SmallVector<WeakVH, 16> DeadInsts;
    SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr");
#ifndef NDEBUG
    Rewriter.setDebugType(DEBUG_TYPE);
#endif
    unsigned numFolded =
      Rewriter.replaceCongruentIVs(L, &getAnalysis<DominatorTree>(),
                                   DeadInsts,
                                   &getAnalysis<TargetTransformInfo>());
    if (numFolded) {
      Changed = true;
      DeleteTriviallyDeadInstructions(DeadInsts);
      DeleteDeadPHIs(L->getHeader());
    }
  }
  return Changed;
}