LoopRerollPass.cpp revision 266715
1//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This pass implements a simple loop reroller. 11// 12//===----------------------------------------------------------------------===// 13 14#define DEBUG_TYPE "loop-reroll" 15#include "llvm/Transforms/Scalar.h" 16#include "llvm/ADT/SmallSet.h" 17#include "llvm/ADT/Statistic.h" 18#include "llvm/ADT/STLExtras.h" 19#include "llvm/Analysis/AliasAnalysis.h" 20#include "llvm/Analysis/AliasSetTracker.h" 21#include "llvm/Analysis/LoopPass.h" 22#include "llvm/Analysis/ScalarEvolution.h" 23#include "llvm/Analysis/ScalarEvolutionExpander.h" 24#include "llvm/Analysis/ScalarEvolutionExpressions.h" 25#include "llvm/Analysis/ValueTracking.h" 26#include "llvm/IR/DataLayout.h" 27#include "llvm/IR/IntrinsicInst.h" 28#include "llvm/Support/CommandLine.h" 29#include "llvm/Support/Debug.h" 30#include "llvm/Support/raw_ostream.h" 31#include "llvm/Target/TargetLibraryInfo.h" 32#include "llvm/Transforms/Utils/BasicBlockUtils.h" 33#include "llvm/Transforms/Utils/Local.h" 34#include "llvm/Transforms/Utils/LoopUtils.h" 35 36using namespace llvm; 37 38STATISTIC(NumRerolledLoops, "Number of rerolled loops"); 39 40static cl::opt<unsigned> 41MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, 42 cl::desc("The maximum increment for loop rerolling")); 43 44// This loop re-rolling transformation aims to transform loops like this: 45// 46// int foo(int a); 47// void bar(int *x) { 48// for (int i = 0; i < 500; i += 3) { 49// foo(i); 50// foo(i+1); 51// foo(i+2); 52// } 53// } 54// 55// into a loop like this: 56// 57// void bar(int *x) { 58// for (int i = 0; i < 500; ++i) 59// foo(i); 60// } 61// 62// It does this by looking for loops that, besides the latch code, are composed 63// of isomorphic DAGs of instructions, with each DAG rooted at some increment 64// to the induction variable, and where each DAG is isomorphic to the DAG 65// rooted at the induction variable (excepting the sub-DAGs which root the 66// other induction-variable increments). In other words, we're looking for loop 67// bodies of the form: 68// 69// %iv = phi [ (preheader, ...), (body, %iv.next) ] 70// f(%iv) 71// %iv.1 = add %iv, 1 <-- a root increment 72// f(%iv.1) 73// %iv.2 = add %iv, 2 <-- a root increment 74// f(%iv.2) 75// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment 76// f(%iv.scale_m_1) 77// ... 78// %iv.next = add %iv, scale 79// %cmp = icmp(%iv, ...) 80// br %cmp, header, exit 81// 82// where each f(i) is a set of instructions that, collectively, are a function 83// only of i (and other loop-invariant values). 84// 85// As a special case, we can also reroll loops like this: 86// 87// int foo(int); 88// void bar(int *x) { 89// for (int i = 0; i < 500; ++i) { 90// x[3*i] = foo(0); 91// x[3*i+1] = foo(0); 92// x[3*i+2] = foo(0); 93// } 94// } 95// 96// into this: 97// 98// void bar(int *x) { 99// for (int i = 0; i < 1500; ++i) 100// x[i] = foo(0); 101// } 102// 103// in which case, we're looking for inputs like this: 104// 105// %iv = phi [ (preheader, ...), (body, %iv.next) ] 106// %scaled.iv = mul %iv, scale 107// f(%scaled.iv) 108// %scaled.iv.1 = add %scaled.iv, 1 109// f(%scaled.iv.1) 110// %scaled.iv.2 = add %scaled.iv, 2 111// f(%scaled.iv.2) 112// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 113// f(%scaled.iv.scale_m_1) 114// ... 115// %iv.next = add %iv, 1 116// %cmp = icmp(%iv, ...) 117// br %cmp, header, exit 118 119namespace { 120 class LoopReroll : public LoopPass { 121 public: 122 static char ID; // Pass ID, replacement for typeid 123 LoopReroll() : LoopPass(ID) { 124 initializeLoopRerollPass(*PassRegistry::getPassRegistry()); 125 } 126 127 bool runOnLoop(Loop *L, LPPassManager &LPM); 128 129 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 130 AU.addRequired<AliasAnalysis>(); 131 AU.addRequired<LoopInfo>(); 132 AU.addPreserved<LoopInfo>(); 133 AU.addRequired<DominatorTree>(); 134 AU.addPreserved<DominatorTree>(); 135 AU.addRequired<ScalarEvolution>(); 136 AU.addRequired<TargetLibraryInfo>(); 137 } 138 139protected: 140 AliasAnalysis *AA; 141 LoopInfo *LI; 142 ScalarEvolution *SE; 143 DataLayout *DL; 144 TargetLibraryInfo *TLI; 145 DominatorTree *DT; 146 147 typedef SmallVector<Instruction *, 16> SmallInstructionVector; 148 typedef SmallSet<Instruction *, 16> SmallInstructionSet; 149 150 // A chain of isomorphic instructions, indentified by a single-use PHI, 151 // representing a reduction. Only the last value may be used outside the 152 // loop. 153 struct SimpleLoopReduction { 154 SimpleLoopReduction(Instruction *P, Loop *L) 155 : Valid(false), Instructions(1, P) { 156 assert(isa<PHINode>(P) && "First reduction instruction must be a PHI"); 157 add(L); 158 } 159 160 bool valid() const { 161 return Valid; 162 } 163 164 Instruction *getPHI() const { 165 assert(Valid && "Using invalid reduction"); 166 return Instructions.front(); 167 } 168 169 Instruction *getReducedValue() const { 170 assert(Valid && "Using invalid reduction"); 171 return Instructions.back(); 172 } 173 174 Instruction *get(size_t i) const { 175 assert(Valid && "Using invalid reduction"); 176 return Instructions[i+1]; 177 } 178 179 Instruction *operator [] (size_t i) const { return get(i); } 180 181 // The size, ignoring the initial PHI. 182 size_t size() const { 183 assert(Valid && "Using invalid reduction"); 184 return Instructions.size()-1; 185 } 186 187 typedef SmallInstructionVector::iterator iterator; 188 typedef SmallInstructionVector::const_iterator const_iterator; 189 190 iterator begin() { 191 assert(Valid && "Using invalid reduction"); 192 return llvm::next(Instructions.begin()); 193 } 194 195 const_iterator begin() const { 196 assert(Valid && "Using invalid reduction"); 197 return llvm::next(Instructions.begin()); 198 } 199 200 iterator end() { return Instructions.end(); } 201 const_iterator end() const { return Instructions.end(); } 202 203 protected: 204 bool Valid; 205 SmallInstructionVector Instructions; 206 207 void add(Loop *L); 208 }; 209 210 // The set of all reductions, and state tracking of possible reductions 211 // during loop instruction processing. 212 struct ReductionTracker { 213 typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector; 214 215 // Add a new possible reduction. 216 void addSLR(SimpleLoopReduction &SLR) { 217 PossibleReds.push_back(SLR); 218 } 219 220 // Setup to track possible reductions corresponding to the provided 221 // rerolling scale. Only reductions with a number of non-PHI instructions 222 // that is divisible by the scale are considered. Three instructions sets 223 // are filled in: 224 // - A set of all possible instructions in eligible reductions. 225 // - A set of all PHIs in eligible reductions 226 // - A set of all reduced values (last instructions) in eligible reductions. 227 void restrictToScale(uint64_t Scale, 228 SmallInstructionSet &PossibleRedSet, 229 SmallInstructionSet &PossibleRedPHISet, 230 SmallInstructionSet &PossibleRedLastSet) { 231 PossibleRedIdx.clear(); 232 PossibleRedIter.clear(); 233 Reds.clear(); 234 235 for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) 236 if (PossibleReds[i].size() % Scale == 0) { 237 PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); 238 PossibleRedPHISet.insert(PossibleReds[i].getPHI()); 239 240 PossibleRedSet.insert(PossibleReds[i].getPHI()); 241 PossibleRedIdx[PossibleReds[i].getPHI()] = i; 242 for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), 243 JE = PossibleReds[i].end(); J != JE; ++J) { 244 PossibleRedSet.insert(*J); 245 PossibleRedIdx[*J] = i; 246 } 247 } 248 } 249 250 // The functions below are used while processing the loop instructions. 251 252 // Are the two instructions both from reductions, and furthermore, from 253 // the same reduction? 254 bool isPairInSame(Instruction *J1, Instruction *J2) { 255 DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1); 256 if (J1I != PossibleRedIdx.end()) { 257 DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2); 258 if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) 259 return true; 260 } 261 262 return false; 263 } 264 265 // The two provided instructions, the first from the base iteration, and 266 // the second from iteration i, form a matched pair. If these are part of 267 // a reduction, record that fact. 268 void recordPair(Instruction *J1, Instruction *J2, unsigned i) { 269 if (PossibleRedIdx.count(J1)) { 270 assert(PossibleRedIdx.count(J2) && 271 "Recording reduction vs. non-reduction instruction?"); 272 273 PossibleRedIter[J1] = 0; 274 PossibleRedIter[J2] = i; 275 276 int Idx = PossibleRedIdx[J1]; 277 assert(Idx == PossibleRedIdx[J2] && 278 "Recording pair from different reductions?"); 279 Reds.insert(Idx); 280 } 281 } 282 283 // The functions below can be called after we've finished processing all 284 // instructions in the loop, and we know which reductions were selected. 285 286 // Is the provided instruction the PHI of a reduction selected for 287 // rerolling? 288 bool isSelectedPHI(Instruction *J) { 289 if (!isa<PHINode>(J)) 290 return false; 291 292 for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); 293 RI != RIE; ++RI) { 294 int i = *RI; 295 if (cast<Instruction>(J) == PossibleReds[i].getPHI()) 296 return true; 297 } 298 299 return false; 300 } 301 302 bool validateSelected(); 303 void replaceSelected(); 304 305 protected: 306 // The vector of all possible reductions (for any scale). 307 SmallReductionVector PossibleReds; 308 309 DenseMap<Instruction *, int> PossibleRedIdx; 310 DenseMap<Instruction *, int> PossibleRedIter; 311 DenseSet<int> Reds; 312 }; 313 314 void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); 315 void collectPossibleReductions(Loop *L, 316 ReductionTracker &Reductions); 317 void collectInLoopUserSet(Loop *L, 318 const SmallInstructionVector &Roots, 319 const SmallInstructionSet &Exclude, 320 const SmallInstructionSet &Final, 321 DenseSet<Instruction *> &Users); 322 void collectInLoopUserSet(Loop *L, 323 Instruction * Root, 324 const SmallInstructionSet &Exclude, 325 const SmallInstructionSet &Final, 326 DenseSet<Instruction *> &Users); 327 bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale, 328 Instruction *&IV, 329 SmallInstructionVector &LoopIncs); 330 bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV, 331 SmallVector<SmallInstructionVector, 32> &Roots, 332 SmallInstructionSet &AllRoots, 333 SmallInstructionVector &LoopIncs); 334 bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, 335 ReductionTracker &Reductions); 336 }; 337} 338 339char LoopReroll::ID = 0; 340INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) 341INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 342INITIALIZE_PASS_DEPENDENCY(LoopInfo) 343INITIALIZE_PASS_DEPENDENCY(DominatorTree) 344INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 345INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) 346INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) 347 348Pass *llvm::createLoopRerollPass() { 349 return new LoopReroll; 350} 351 352// Returns true if the provided instruction is used outside the given loop. 353// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in 354// non-loop blocks to be outside the loop. 355static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { 356 for (Value::use_iterator UI = I->use_begin(), 357 UIE = I->use_end(); UI != UIE; ++UI) { 358 Instruction *User = cast<Instruction>(*UI); 359 if (!L->contains(User)) 360 return true; 361 } 362 363 return false; 364} 365 366// Collect the list of loop induction variables with respect to which it might 367// be possible to reroll the loop. 368void LoopReroll::collectPossibleIVs(Loop *L, 369 SmallInstructionVector &PossibleIVs) { 370 BasicBlock *Header = L->getHeader(); 371 for (BasicBlock::iterator I = Header->begin(), 372 IE = Header->getFirstInsertionPt(); I != IE; ++I) { 373 if (!isa<PHINode>(I)) 374 continue; 375 if (!I->getType()->isIntegerTy()) 376 continue; 377 378 if (const SCEVAddRecExpr *PHISCEV = 379 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { 380 if (PHISCEV->getLoop() != L) 381 continue; 382 if (!PHISCEV->isAffine()) 383 continue; 384 if (const SCEVConstant *IncSCEV = 385 dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { 386 if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) 387 continue; 388 if (IncSCEV->getValue()->uge(MaxInc)) 389 continue; 390 391 DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << 392 *PHISCEV << "\n"); 393 PossibleIVs.push_back(I); 394 } 395 } 396 } 397} 398 399// Add the remainder of the reduction-variable chain to the instruction vector 400// (the initial PHINode has already been added). If successful, the object is 401// marked as valid. 402void LoopReroll::SimpleLoopReduction::add(Loop *L) { 403 assert(!Valid && "Cannot add to an already-valid chain"); 404 405 // The reduction variable must be a chain of single-use instructions 406 // (including the PHI), except for the last value (which is used by the PHI 407 // and also outside the loop). 408 Instruction *C = Instructions.front(); 409 410 do { 411 C = cast<Instruction>(*C->use_begin()); 412 if (C->hasOneUse()) { 413 if (!C->isBinaryOp()) 414 return; 415 416 if (!(isa<PHINode>(Instructions.back()) || 417 C->isSameOperationAs(Instructions.back()))) 418 return; 419 420 Instructions.push_back(C); 421 } 422 } while (C->hasOneUse()); 423 424 if (Instructions.size() < 2 || 425 !C->isSameOperationAs(Instructions.back()) || 426 C->use_begin() == C->use_end()) 427 return; 428 429 // C is now the (potential) last instruction in the reduction chain. 430 for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end(); 431 UI != UIE; ++UI) { 432 // The only in-loop user can be the initial PHI. 433 if (L->contains(cast<Instruction>(*UI))) 434 if (cast<Instruction>(*UI ) != Instructions.front()) 435 return; 436 } 437 438 Instructions.push_back(C); 439 Valid = true; 440} 441 442// Collect the vector of possible reduction variables. 443void LoopReroll::collectPossibleReductions(Loop *L, 444 ReductionTracker &Reductions) { 445 BasicBlock *Header = L->getHeader(); 446 for (BasicBlock::iterator I = Header->begin(), 447 IE = Header->getFirstInsertionPt(); I != IE; ++I) { 448 if (!isa<PHINode>(I)) 449 continue; 450 if (!I->getType()->isSingleValueType()) 451 continue; 452 453 SimpleLoopReduction SLR(I, L); 454 if (!SLR.valid()) 455 continue; 456 457 DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " << 458 SLR.size() << " chained instructions)\n"); 459 Reductions.addSLR(SLR); 460 } 461} 462 463// Collect the set of all users of the provided root instruction. This set of 464// users contains not only the direct users of the root instruction, but also 465// all users of those users, and so on. There are two exceptions: 466// 467// 1. Instructions in the set of excluded instructions are never added to the 468// use set (even if they are users). This is used, for example, to exclude 469// including root increments in the use set of the primary IV. 470// 471// 2. Instructions in the set of final instructions are added to the use set 472// if they are users, but their users are not added. This is used, for 473// example, to prevent a reduction update from forcing all later reduction 474// updates into the use set. 475void LoopReroll::collectInLoopUserSet(Loop *L, 476 Instruction *Root, const SmallInstructionSet &Exclude, 477 const SmallInstructionSet &Final, 478 DenseSet<Instruction *> &Users) { 479 SmallInstructionVector Queue(1, Root); 480 while (!Queue.empty()) { 481 Instruction *I = Queue.pop_back_val(); 482 if (!Users.insert(I).second) 483 continue; 484 485 if (!Final.count(I)) 486 for (Value::use_iterator UI = I->use_begin(), 487 UIE = I->use_end(); UI != UIE; ++UI) { 488 Instruction *User = cast<Instruction>(*UI); 489 if (PHINode *PN = dyn_cast<PHINode>(User)) { 490 // Ignore "wrap-around" uses to PHIs of this loop's header. 491 if (PN->getIncomingBlock(UI) == L->getHeader()) 492 continue; 493 } 494 495 if (L->contains(User) && !Exclude.count(User)) { 496 Queue.push_back(User); 497 } 498 } 499 500 // We also want to collect single-user "feeder" values. 501 for (User::op_iterator OI = I->op_begin(), 502 OIE = I->op_end(); OI != OIE; ++OI) { 503 if (Instruction *Op = dyn_cast<Instruction>(*OI)) 504 if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) && 505 !Final.count(Op)) 506 Queue.push_back(Op); 507 } 508 } 509} 510 511// Collect all of the users of all of the provided root instructions (combined 512// into a single set). 513void LoopReroll::collectInLoopUserSet(Loop *L, 514 const SmallInstructionVector &Roots, 515 const SmallInstructionSet &Exclude, 516 const SmallInstructionSet &Final, 517 DenseSet<Instruction *> &Users) { 518 for (SmallInstructionVector::const_iterator I = Roots.begin(), 519 IE = Roots.end(); I != IE; ++I) 520 collectInLoopUserSet(L, *I, Exclude, Final, Users); 521} 522 523static bool isSimpleLoadStore(Instruction *I) { 524 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 525 return LI->isSimple(); 526 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 527 return SI->isSimple(); 528 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) 529 return !MI->isVolatile(); 530 return false; 531} 532 533// Recognize loops that are setup like this: 534// 535// %iv = phi [ (preheader, ...), (body, %iv.next) ] 536// %scaled.iv = mul %iv, scale 537// f(%scaled.iv) 538// %scaled.iv.1 = add %scaled.iv, 1 539// f(%scaled.iv.1) 540// %scaled.iv.2 = add %scaled.iv, 2 541// f(%scaled.iv.2) 542// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 543// f(%scaled.iv.scale_m_1) 544// ... 545// %iv.next = add %iv, 1 546// %cmp = icmp(%iv, ...) 547// br %cmp, header, exit 548// 549// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs. 550bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale, 551 Instruction *&IV, 552 SmallInstructionVector &LoopIncs) { 553 // This is a special case: here we're looking for all uses (except for 554 // the increment) to be multiplied by a common factor. The increment must 555 // be by one. This is to capture loops like: 556 // for (int i = 0; i < 500; ++i) { 557 // foo(3*i); foo(3*i+1); foo(3*i+2); 558 // } 559 if (RealIV->getNumUses() != 2) 560 return false; 561 const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV)); 562 Instruction *User1 = cast<Instruction>(*RealIV->use_begin()), 563 *User2 = cast<Instruction>(*llvm::next(RealIV->use_begin())); 564 if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType())) 565 return false; 566 const SCEVAddRecExpr *User1SCEV = 567 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)), 568 *User2SCEV = 569 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2)); 570 if (!User1SCEV || !User1SCEV->isAffine() || 571 !User2SCEV || !User2SCEV->isAffine()) 572 return false; 573 574 // We assume below that User1 is the scale multiply and User2 is the 575 // increment. If this can't be true, then swap them. 576 if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) { 577 std::swap(User1, User2); 578 std::swap(User1SCEV, User2SCEV); 579 } 580 581 if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE)) 582 return false; 583 assert(User2SCEV->getStepRecurrence(*SE)->isOne() && 584 "Invalid non-unit step for multiplicative scaling"); 585 LoopIncs.push_back(User2); 586 587 if (const SCEVConstant *MulScale = 588 dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) { 589 // Make sure that both the start and step have the same multiplier. 590 if (RealIVSCEV->getStart()->getType() != MulScale->getType()) 591 return false; 592 if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) != 593 User1SCEV->getStart()) 594 return false; 595 596 ConstantInt *MulScaleCI = MulScale->getValue(); 597 if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc)) 598 return false; 599 Scale = MulScaleCI->getZExtValue(); 600 IV = User1; 601 } else 602 return false; 603 604 DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n"); 605 return true; 606} 607 608// Collect all root increments with respect to the provided induction variable 609// (normally the PHI, but sometimes a multiply). A root increment is an 610// instruction, normally an add, with a positive constant less than Scale. In a 611// rerollable loop, each of these increments is the root of an instruction 612// graph isomorphic to the others. Also, we collect the final induction 613// increment (the increment equal to the Scale), and its users in LoopIncs. 614bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, 615 Instruction *IV, 616 SmallVector<SmallInstructionVector, 32> &Roots, 617 SmallInstructionSet &AllRoots, 618 SmallInstructionVector &LoopIncs) { 619 for (Value::use_iterator UI = IV->use_begin(), 620 UIE = IV->use_end(); UI != UIE; ++UI) { 621 Instruction *User = cast<Instruction>(*UI); 622 if (!SE->isSCEVable(User->getType())) 623 continue; 624 if (User->getType() != IV->getType()) 625 continue; 626 if (!L->contains(User)) 627 continue; 628 if (hasUsesOutsideLoop(User, L)) 629 continue; 630 631 if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV( 632 SE->getSCEV(User), SE->getSCEV(IV)))) { 633 uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); 634 if (Idx > 0 && Idx < Scale) { 635 Roots[Idx-1].push_back(User); 636 AllRoots.insert(User); 637 } else if (Idx == Scale && Inc > 1) { 638 LoopIncs.push_back(User); 639 } 640 } 641 } 642 643 if (Roots[0].empty()) 644 return false; 645 bool AllSame = true; 646 for (unsigned i = 1; i < Scale-1; ++i) 647 if (Roots[i].size() != Roots[0].size()) { 648 AllSame = false; 649 break; 650 } 651 652 if (!AllSame) 653 return false; 654 655 return true; 656} 657 658// Validate the selected reductions. All iterations must have an isomorphic 659// part of the reduction chain and, for non-associative reductions, the chain 660// entries must appear in order. 661bool LoopReroll::ReductionTracker::validateSelected() { 662 // For a non-associative reduction, the chain entries must appear in order. 663 for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); 664 RI != RIE; ++RI) { 665 int i = *RI; 666 int PrevIter = 0, BaseCount = 0, Count = 0; 667 for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), 668 JE = PossibleReds[i].end(); J != JE; ++J) { 669 // Note that all instructions in the chain must have been found because 670 // all instructions in the function must have been assigned to some 671 // iteration. 672 int Iter = PossibleRedIter[*J]; 673 if (Iter != PrevIter && Iter != PrevIter + 1 && 674 !PossibleReds[i].getReducedValue()->isAssociative()) { 675 DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << 676 *J << "\n"); 677 return false; 678 } 679 680 if (Iter != PrevIter) { 681 if (Count != BaseCount) { 682 DEBUG(dbgs() << "LRR: Iteration " << PrevIter << 683 " reduction use count " << Count << 684 " is not equal to the base use count " << 685 BaseCount << "\n"); 686 return false; 687 } 688 689 Count = 0; 690 } 691 692 ++Count; 693 if (Iter == 0) 694 ++BaseCount; 695 696 PrevIter = Iter; 697 } 698 } 699 700 return true; 701} 702 703// For all selected reductions, remove all parts except those in the first 704// iteration (and the PHI). Replace outside uses of the reduced value with uses 705// of the first-iteration reduced value (in other words, reroll the selected 706// reductions). 707void LoopReroll::ReductionTracker::replaceSelected() { 708 // Fixup reductions to refer to the last instruction associated with the 709 // first iteration (not the last). 710 for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); 711 RI != RIE; ++RI) { 712 int i = *RI; 713 int j = 0; 714 for (int e = PossibleReds[i].size(); j != e; ++j) 715 if (PossibleRedIter[PossibleReds[i][j]] != 0) { 716 --j; 717 break; 718 } 719 720 // Replace users with the new end-of-chain value. 721 SmallInstructionVector Users; 722 for (Value::use_iterator UI = 723 PossibleReds[i].getReducedValue()->use_begin(), 724 UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI) 725 Users.push_back(cast<Instruction>(*UI)); 726 727 for (SmallInstructionVector::iterator J = Users.begin(), 728 JE = Users.end(); J != JE; ++J) 729 (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(), 730 PossibleReds[i][j]); 731 } 732} 733 734// Reroll the provided loop with respect to the provided induction variable. 735// Generally, we're looking for a loop like this: 736// 737// %iv = phi [ (preheader, ...), (body, %iv.next) ] 738// f(%iv) 739// %iv.1 = add %iv, 1 <-- a root increment 740// f(%iv.1) 741// %iv.2 = add %iv, 2 <-- a root increment 742// f(%iv.2) 743// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment 744// f(%iv.scale_m_1) 745// ... 746// %iv.next = add %iv, scale 747// %cmp = icmp(%iv, ...) 748// br %cmp, header, exit 749// 750// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of 751// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can 752// be intermixed with eachother. The restriction imposed by this algorithm is 753// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1), 754// etc. be the same. 755// 756// First, we collect the use set of %iv, excluding the other increment roots. 757// This gives us f(%iv). Then we iterate over the loop instructions (scale-1) 758// times, having collected the use set of f(%iv.(i+1)), during which we: 759// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to 760// the next unmatched instruction in f(%iv.(i+1)). 761// - Ensure that both matched instructions don't have any external users 762// (with the exception of last-in-chain reduction instructions). 763// - Track the (aliasing) write set, and other side effects, of all 764// instructions that belong to future iterations that come before the matched 765// instructions. If the matched instructions read from that write set, then 766// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in 767// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly, 768// if any of these future instructions had side effects (could not be 769// speculatively executed), and so do the matched instructions, when we 770// cannot reorder those side-effect-producing instructions, and rerolling 771// fails. 772// 773// Finally, we make sure that all loop instructions are either loop increment 774// roots, belong to simple latch code, parts of validated reductions, part of 775// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions 776// have been validated), then we reroll the loop. 777bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, 778 const SCEV *IterCount, 779 ReductionTracker &Reductions) { 780 const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); 781 uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> 782 getValue()->getZExtValue(); 783 // The collection of loop increment instructions. 784 SmallInstructionVector LoopIncs; 785 uint64_t Scale = Inc; 786 787 // The effective induction variable, IV, is normally also the real induction 788 // variable. When we're dealing with a loop like: 789 // for (int i = 0; i < 500; ++i) 790 // x[3*i] = ...; 791 // x[3*i+1] = ...; 792 // x[3*i+2] = ...; 793 // then the real IV is still i, but the effective IV is (3*i). 794 Instruction *RealIV = IV; 795 if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs)) 796 return false; 797 798 assert(Scale <= MaxInc && "Scale is too large"); 799 assert(Scale > 1 && "Scale must be at least 2"); 800 801 // The set of increment instructions for each increment value. 802 SmallVector<SmallInstructionVector, 32> Roots(Scale-1); 803 SmallInstructionSet AllRoots; 804 if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs)) 805 return false; 806 807 DEBUG(dbgs() << "LRR: Found all root induction increments for: " << 808 *RealIV << "\n"); 809 810 // An array of just the possible reductions for this scale factor. When we 811 // collect the set of all users of some root instructions, these reduction 812 // instructions are treated as 'final' (their uses are not considered). 813 // This is important because we don't want the root use set to search down 814 // the reduction chain. 815 SmallInstructionSet PossibleRedSet; 816 SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet; 817 Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet, 818 PossibleRedLastSet); 819 820 // We now need to check for equivalence of the use graph of each root with 821 // that of the primary induction variable (excluding the roots). Our goal 822 // here is not to solve the full graph isomorphism problem, but rather to 823 // catch common cases without a lot of work. As a result, we will assume 824 // that the relative order of the instructions in each unrolled iteration 825 // is the same (although we will not make an assumption about how the 826 // different iterations are intermixed). Note that while the order must be 827 // the same, the instructions may not be in the same basic block. 828 SmallInstructionSet Exclude(AllRoots); 829 Exclude.insert(LoopIncs.begin(), LoopIncs.end()); 830 831 DenseSet<Instruction *> BaseUseSet; 832 collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet); 833 834 DenseSet<Instruction *> AllRootUses; 835 std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1); 836 837 bool MatchFailed = false; 838 for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) { 839 DenseSet<Instruction *> &RootUseSet = RootUseSets[i]; 840 collectInLoopUserSet(L, Roots[i], SmallInstructionSet(), 841 PossibleRedSet, RootUseSet); 842 843 DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() << 844 " vs. iteration increment " << (i+1) << 845 " use set size: " << RootUseSet.size() << "\n"); 846 847 if (BaseUseSet.size() != RootUseSet.size()) { 848 MatchFailed = true; 849 break; 850 } 851 852 // In addition to regular aliasing information, we need to look for 853 // instructions from later (future) iterations that have side effects 854 // preventing us from reordering them past other instructions with side 855 // effects. 856 bool FutureSideEffects = false; 857 AliasSetTracker AST(*AA); 858 859 // The map between instructions in f(%iv.(i+1)) and f(%iv). 860 DenseMap<Value *, Value *> BaseMap; 861 862 assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops"); 863 for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(), 864 JE = Header->end(); J1 != JE && !MatchFailed; ++J1) { 865 if (cast<Instruction>(J1) == RealIV) 866 continue; 867 if (cast<Instruction>(J1) == IV) 868 continue; 869 if (!BaseUseSet.count(J1)) 870 continue; 871 if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs. 872 continue; 873 874 while (J2 != JE && (!RootUseSet.count(J2) || 875 std::find(Roots[i].begin(), Roots[i].end(), J2) != 876 Roots[i].end())) { 877 // As we iterate through the instructions, instructions that don't 878 // belong to previous iterations (or the base case), must belong to 879 // future iterations. We want to track the alias set of writes from 880 // previous iterations. 881 if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) && 882 !AllRootUses.count(J2)) { 883 if (J2->mayWriteToMemory()) 884 AST.add(J2); 885 886 // Note: This is specifically guarded by a check on isa<PHINode>, 887 // which while a valid (somewhat arbitrary) micro-optimization, is 888 // needed because otherwise isSafeToSpeculativelyExecute returns 889 // false on PHI nodes. 890 if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL)) 891 FutureSideEffects = true; 892 } 893 894 ++J2; 895 } 896 897 if (!J1->isSameOperationAs(J2)) { 898 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 899 " vs. " << *J2 << "\n"); 900 MatchFailed = true; 901 break; 902 } 903 904 // Make sure that this instruction, which is in the use set of this 905 // root instruction, does not also belong to the base set or the set of 906 // some previous root instruction. 907 if (BaseUseSet.count(J2) || AllRootUses.count(J2)) { 908 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 909 " vs. " << *J2 << " (prev. case overlap)\n"); 910 MatchFailed = true; 911 break; 912 } 913 914 // Make sure that we don't alias with any instruction in the alias set 915 // tracker. If we do, then we depend on a future iteration, and we 916 // can't reroll. 917 if (J2->mayReadFromMemory()) { 918 for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end(); 919 K != KE && !MatchFailed; ++K) { 920 if (K->aliasesUnknownInst(J2, *AA)) { 921 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 922 " vs. " << *J2 << " (depends on future store)\n"); 923 MatchFailed = true; 924 break; 925 } 926 } 927 } 928 929 // If we've past an instruction from a future iteration that may have 930 // side effects, and this instruction might also, then we can't reorder 931 // them, and this matching fails. As an exception, we allow the alias 932 // set tracker to handle regular (simple) load/store dependencies. 933 if (FutureSideEffects && 934 ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) || 935 (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) { 936 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 937 " vs. " << *J2 << 938 " (side effects prevent reordering)\n"); 939 MatchFailed = true; 940 break; 941 } 942 943 // For instructions that are part of a reduction, if the operation is 944 // associative, then don't bother matching the operands (because we 945 // already know that the instructions are isomorphic, and the order 946 // within the iteration does not matter). For non-associative reductions, 947 // we do need to match the operands, because we need to reject 948 // out-of-order instructions within an iteration! 949 // For example (assume floating-point addition), we need to reject this: 950 // x += a[i]; x += b[i]; 951 // x += a[i+1]; x += b[i+1]; 952 // x += b[i+2]; x += a[i+2]; 953 bool InReduction = Reductions.isPairInSame(J1, J2); 954 955 if (!(InReduction && J1->isAssociative())) { 956 bool Swapped = false, SomeOpMatched = false;; 957 for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { 958 Value *Op2 = J2->getOperand(j); 959 960 // If this is part of a reduction (and the operation is not 961 // associatve), then we match all operands, but not those that are 962 // part of the reduction. 963 if (InReduction) 964 if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) 965 if (Reductions.isPairInSame(J2, Op2I)) 966 continue; 967 968 DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2); 969 if (BMI != BaseMap.end()) 970 Op2 = BMI->second; 971 else if (std::find(Roots[i].begin(), Roots[i].end(), 972 (Instruction*) Op2) != Roots[i].end()) 973 Op2 = IV; 974 975 if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) { 976 // If we've not already decided to swap the matched operands, and 977 // we've not already matched our first operand (note that we could 978 // have skipped matching the first operand because it is part of a 979 // reduction above), and the instruction is commutative, then try 980 // the swapped match. 981 if (!Swapped && J1->isCommutative() && !SomeOpMatched && 982 J1->getOperand(!j) == Op2) { 983 Swapped = true; 984 } else { 985 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 986 " vs. " << *J2 << " (operand " << j << ")\n"); 987 MatchFailed = true; 988 break; 989 } 990 } 991 992 SomeOpMatched = true; 993 } 994 } 995 996 if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) || 997 (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) { 998 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 999 " vs. " << *J2 << " (uses outside loop)\n"); 1000 MatchFailed = true; 1001 break; 1002 } 1003 1004 if (!MatchFailed) 1005 BaseMap.insert(std::pair<Value *, Value *>(J2, J1)); 1006 1007 AllRootUses.insert(J2); 1008 Reductions.recordPair(J1, J2, i+1); 1009 1010 ++J2; 1011 } 1012 } 1013 1014 if (MatchFailed) 1015 return false; 1016 1017 DEBUG(dbgs() << "LRR: Matched all iteration increments for " << 1018 *RealIV << "\n"); 1019 1020 DenseSet<Instruction *> LoopIncUseSet; 1021 collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(), 1022 SmallInstructionSet(), LoopIncUseSet); 1023 DEBUG(dbgs() << "LRR: Loop increment set size: " << 1024 LoopIncUseSet.size() << "\n"); 1025 1026 // Make sure that all instructions in the loop have been included in some 1027 // use set. 1028 for (BasicBlock::iterator J = Header->begin(), JE = Header->end(); 1029 J != JE; ++J) { 1030 if (isa<DbgInfoIntrinsic>(J)) 1031 continue; 1032 if (cast<Instruction>(J) == RealIV) 1033 continue; 1034 if (cast<Instruction>(J) == IV) 1035 continue; 1036 if (BaseUseSet.count(J) || AllRootUses.count(J) || 1037 (LoopIncUseSet.count(J) && (J->isTerminator() || 1038 isSafeToSpeculativelyExecute(J, DL)))) 1039 continue; 1040 1041 if (AllRoots.count(J)) 1042 continue; 1043 1044 if (Reductions.isSelectedPHI(J)) 1045 continue; 1046 1047 DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV << 1048 " unprocessed instruction found: " << *J << "\n"); 1049 MatchFailed = true; 1050 break; 1051 } 1052 1053 if (MatchFailed) 1054 return false; 1055 1056 DEBUG(dbgs() << "LRR: all instructions processed from " << 1057 *RealIV << "\n"); 1058 1059 if (!Reductions.validateSelected()) 1060 return false; 1061 1062 // At this point, we've validated the rerolling, and we're committed to 1063 // making changes! 1064 1065 Reductions.replaceSelected(); 1066 1067 // Remove instructions associated with non-base iterations. 1068 for (BasicBlock::reverse_iterator J = Header->rbegin(); 1069 J != Header->rend();) { 1070 if (AllRootUses.count(&*J)) { 1071 Instruction *D = &*J; 1072 DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); 1073 D->eraseFromParent(); 1074 continue; 1075 } 1076 1077 ++J; 1078 } 1079 1080 // Insert the new induction variable. 1081 const SCEV *Start = RealIVSCEV->getStart(); 1082 if (Inc == 1) 1083 Start = SE->getMulExpr(Start, 1084 SE->getConstant(Start->getType(), Scale)); 1085 const SCEVAddRecExpr *H = 1086 cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start, 1087 SE->getConstant(RealIVSCEV->getType(), 1), 1088 L, SCEV::FlagAnyWrap)); 1089 { // Limit the lifetime of SCEVExpander. 1090 SCEVExpander Expander(*SE, "reroll"); 1091 Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); 1092 1093 for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(), 1094 JE = BaseUseSet.end(); J != JE; ++J) 1095 (*J)->replaceUsesOfWith(IV, NewIV); 1096 1097 if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { 1098 if (LoopIncUseSet.count(BI)) { 1099 const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); 1100 if (Inc == 1) 1101 ICSCEV = 1102 SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale)); 1103 // Iteration count SCEV minus 1 1104 const SCEV *ICMinus1SCEV = 1105 SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); 1106 1107 Value *ICMinus1; // Iteration count minus 1 1108 if (isa<SCEVConstant>(ICMinus1SCEV)) { 1109 ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI); 1110 } else { 1111 BasicBlock *Preheader = L->getLoopPreheader(); 1112 if (!Preheader) 1113 Preheader = InsertPreheaderForLoop(L, this); 1114 1115 ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), 1116 Preheader->getTerminator()); 1117 } 1118 1119 Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, 1120 "exitcond"); 1121 BI->setCondition(Cond); 1122 1123 if (BI->getSuccessor(1) != Header) 1124 BI->swapSuccessors(); 1125 } 1126 } 1127 } 1128 1129 SimplifyInstructionsInBlock(Header, DL, TLI); 1130 DeleteDeadPHIs(Header, TLI); 1131 ++NumRerolledLoops; 1132 return true; 1133} 1134 1135bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { 1136 AA = &getAnalysis<AliasAnalysis>(); 1137 LI = &getAnalysis<LoopInfo>(); 1138 SE = &getAnalysis<ScalarEvolution>(); 1139 TLI = &getAnalysis<TargetLibraryInfo>(); 1140 DL = getAnalysisIfAvailable<DataLayout>(); 1141 DT = &getAnalysis<DominatorTree>(); 1142 1143 BasicBlock *Header = L->getHeader(); 1144 DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << 1145 "] Loop %" << Header->getName() << " (" << 1146 L->getNumBlocks() << " block(s))\n"); 1147 1148 bool Changed = false; 1149 1150 // For now, we'll handle only single BB loops. 1151 if (L->getNumBlocks() > 1) 1152 return Changed; 1153 1154 if (!SE->hasLoopInvariantBackedgeTakenCount(L)) 1155 return Changed; 1156 1157 const SCEV *LIBETC = SE->getBackedgeTakenCount(L); 1158 const SCEV *IterCount = 1159 SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); 1160 DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); 1161 1162 // First, we need to find the induction variable with respect to which we can 1163 // reroll (there may be several possible options). 1164 SmallInstructionVector PossibleIVs; 1165 collectPossibleIVs(L, PossibleIVs); 1166 1167 if (PossibleIVs.empty()) { 1168 DEBUG(dbgs() << "LRR: No possible IVs found\n"); 1169 return Changed; 1170 } 1171 1172 ReductionTracker Reductions; 1173 collectPossibleReductions(L, Reductions); 1174 1175 // For each possible IV, collect the associated possible set of 'root' nodes 1176 // (i+1, i+2, etc.). 1177 for (SmallInstructionVector::iterator I = PossibleIVs.begin(), 1178 IE = PossibleIVs.end(); I != IE; ++I) 1179 if (reroll(*I, L, Header, IterCount, Reductions)) { 1180 Changed = true; 1181 break; 1182 } 1183 1184 return Changed; 1185} 1186 1187