1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9/// \file
10/// This transformation implements the well known scalar replacement of
11/// aggregates transformation. It tries to identify promotable elements of an
12/// aggregate alloca, and promote them to registers. It will also try to
13/// convert uses of an element (or set of elements) of an alloca into a vector
14/// or bitfield-style integer scalar if appropriate.
15///
16/// It works to do this with minimal slicing of the alloca so that regions
17/// which are merely transferred in and out of external memory remain unchanged
18/// and are not decomposed to scalar code.
19///
20/// Because this also performs alloca promotion, it can be thought of as also
21/// serving the purpose of SSA formation. The algorithm iterates on the
22/// function until all opportunities for promotion have been realized.
23///
24//===----------------------------------------------------------------------===//
25
26#define DEBUG_TYPE "sroa"
27#include "llvm/Transforms/Scalar.h"
28#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/Analysis/Dominators.h"
33#include "llvm/Analysis/Loads.h"
34#include "llvm/Analysis/PtrUseVisitor.h"
35#include "llvm/Analysis/ValueTracking.h"
36#include "llvm/DIBuilder.h"
37#include "llvm/DebugInfo.h"
38#include "llvm/IR/Constants.h"
39#include "llvm/IR/DataLayout.h"
40#include "llvm/IR/DerivedTypes.h"
41#include "llvm/IR/Function.h"
42#include "llvm/IR/IRBuilder.h"
43#include "llvm/IR/Instructions.h"
44#include "llvm/IR/IntrinsicInst.h"
45#include "llvm/IR/LLVMContext.h"
46#include "llvm/IR/Operator.h"
47#include "llvm/InstVisitor.h"
48#include "llvm/Pass.h"
49#include "llvm/Support/CommandLine.h"
50#include "llvm/Support/Debug.h"
51#include "llvm/Support/ErrorHandling.h"
52#include "llvm/Support/MathExtras.h"
53#include "llvm/Support/raw_ostream.h"
54#include "llvm/Transforms/Utils/Local.h"
55#include "llvm/Transforms/Utils/PromoteMemToReg.h"
56#include "llvm/Transforms/Utils/SSAUpdater.h"
57using namespace llvm;
58
59STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
60STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
61STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions");
62STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses found");
63STATISTIC(MaxPartitionUsesPerAlloca, "Maximum number of partition uses");
64STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
65STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
66STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
67STATISTIC(NumDeleted, "Number of instructions deleted");
68STATISTIC(NumVectorized, "Number of vectorized aggregates");
69
70/// Hidden option to force the pass to not use DomTree and mem2reg, instead
71/// forming SSA values through the SSAUpdater infrastructure.
72static cl::opt<bool>
73ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);
74
75namespace {
76/// \brief A custom IRBuilder inserter which prefixes all names if they are
77/// preserved.
78template <bool preserveNames = true>
79class IRBuilderPrefixedInserter :
80    public IRBuilderDefaultInserter<preserveNames> {
81  std::string Prefix;
82
83public:
84  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
85
86protected:
87  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
88                    BasicBlock::iterator InsertPt) const {
89    IRBuilderDefaultInserter<preserveNames>::InsertHelper(
90        I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt);
91  }
92};
93
94// Specialization for not preserving the name is trivial.
95template <>
96class IRBuilderPrefixedInserter<false> :
97    public IRBuilderDefaultInserter<false> {
98public:
99  void SetNamePrefix(const Twine &P) {}
100};
101
102/// \brief Provide a typedef for IRBuilder that drops names in release builds.
103#ifndef NDEBUG
104typedef llvm::IRBuilder<true, ConstantFolder,
105                        IRBuilderPrefixedInserter<true> > IRBuilderTy;
106#else
107typedef llvm::IRBuilder<false, ConstantFolder,
108                        IRBuilderPrefixedInserter<false> > IRBuilderTy;
109#endif
110}
111
112namespace {
113/// \brief A common base class for representing a half-open byte range.
114struct ByteRange {
115  /// \brief The beginning offset of the range.
116  uint64_t BeginOffset;
117
118  /// \brief The ending offset, not included in the range.
119  uint64_t EndOffset;
120
121  ByteRange() : BeginOffset(), EndOffset() {}
122  ByteRange(uint64_t BeginOffset, uint64_t EndOffset)
123      : BeginOffset(BeginOffset), EndOffset(EndOffset) {}
124
125  /// \brief Support for ordering ranges.
126  ///
127  /// This provides an ordering over ranges such that start offsets are
128  /// always increasing, and within equal start offsets, the end offsets are
129  /// decreasing. Thus the spanning range comes first in a cluster with the
130  /// same start position.
131  bool operator<(const ByteRange &RHS) const {
132    if (BeginOffset < RHS.BeginOffset) return true;
133    if (BeginOffset > RHS.BeginOffset) return false;
134    if (EndOffset > RHS.EndOffset) return true;
135    return false;
136  }
137
138  /// \brief Support comparison with a single offset to allow binary searches.
139  friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) {
140    return LHS.BeginOffset < RHSOffset;
141  }
142
143  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
144                                              const ByteRange &RHS) {
145    return LHSOffset < RHS.BeginOffset;
146  }
147
148  bool operator==(const ByteRange &RHS) const {
149    return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
150  }
151  bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); }
152};
153
154/// \brief A partition of an alloca.
155///
156/// This structure represents a contiguous partition of the alloca. These are
157/// formed by examining the uses of the alloca. During formation, they may
158/// overlap but once an AllocaPartitioning is built, the Partitions within it
159/// are all disjoint.
160struct Partition : public ByteRange {
161  /// \brief Whether this partition is splittable into smaller partitions.
162  ///
163  /// We flag partitions as splittable when they are formed entirely due to
164  /// accesses by trivially splittable operations such as memset and memcpy.
165  bool IsSplittable;
166
167  /// \brief Test whether a partition has been marked as dead.
168  bool isDead() const {
169    if (BeginOffset == UINT64_MAX) {
170      assert(EndOffset == UINT64_MAX);
171      return true;
172    }
173    return false;
174  }
175
176  /// \brief Kill a partition.
177  /// This is accomplished by setting both its beginning and end offset to
178  /// the maximum possible value.
179  void kill() {
180    assert(!isDead() && "He's Dead, Jim!");
181    BeginOffset = EndOffset = UINT64_MAX;
182  }
183
184  Partition() : ByteRange(), IsSplittable() {}
185  Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable)
186      : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {}
187};
188
189/// \brief A particular use of a partition of the alloca.
190///
191/// This structure is used to associate uses of a partition with it. They
192/// mark the range of bytes which are referenced by a particular instruction,
193/// and includes a handle to the user itself and the pointer value in use.
194/// The bounds of these uses are determined by intersecting the bounds of the
195/// memory use itself with a particular partition. As a consequence there is
196/// intentionally overlap between various uses of the same partition.
197class PartitionUse : public ByteRange {
198  /// \brief Combined storage for both the Use* and split state.
199  PointerIntPair<Use*, 1, bool> UsePtrAndIsSplit;
200
201public:
202  PartitionUse() : ByteRange(), UsePtrAndIsSplit() {}
203  PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U,
204               bool IsSplit)
205      : ByteRange(BeginOffset, EndOffset), UsePtrAndIsSplit(U, IsSplit) {}
206
207  /// \brief The use in question. Provides access to both user and used value.
208  ///
209  /// Note that this may be null if the partition use is *dead*, that is, it
210  /// should be ignored.
211  Use *getUse() const { return UsePtrAndIsSplit.getPointer(); }
212
213  /// \brief Set the use for this partition use range.
214  void setUse(Use *U) { UsePtrAndIsSplit.setPointer(U); }
215
216  /// \brief Whether this use is split across multiple partitions.
217  bool isSplit() const { return UsePtrAndIsSplit.getInt(); }
218};
219}
220
221namespace llvm {
222template <> struct isPodLike<Partition> : llvm::true_type {};
223template <> struct isPodLike<PartitionUse> : llvm::true_type {};
224}
225
226namespace {
227/// \brief Alloca partitioning representation.
228///
229/// This class represents a partitioning of an alloca into slices, and
230/// information about the nature of uses of each slice of the alloca. The goal
231/// is that this information is sufficient to decide if and how to split the
232/// alloca apart and replace slices with scalars. It is also intended that this
233/// structure can capture the relevant information needed both to decide about
234/// and to enact these transformations.
235class AllocaPartitioning {
236public:
237  /// \brief Construct a partitioning of a particular alloca.
238  ///
239  /// Construction does most of the work for partitioning the alloca. This
240  /// performs the necessary walks of users and builds a partitioning from it.
241  AllocaPartitioning(const DataLayout &TD, AllocaInst &AI);
242
243  /// \brief Test whether a pointer to the allocation escapes our analysis.
244  ///
245  /// If this is true, the partitioning is never fully built and should be
246  /// ignored.
247  bool isEscaped() const { return PointerEscapingInstr; }
248
249  /// \brief Support for iterating over the partitions.
250  /// @{
251  typedef SmallVectorImpl<Partition>::iterator iterator;
252  iterator begin() { return Partitions.begin(); }
253  iterator end() { return Partitions.end(); }
254
255  typedef SmallVectorImpl<Partition>::const_iterator const_iterator;
256  const_iterator begin() const { return Partitions.begin(); }
257  const_iterator end() const { return Partitions.end(); }
258  /// @}
259
260  /// \brief Support for iterating over and manipulating a particular
261  /// partition's uses.
262  ///
263  /// The iteration support provided for uses is more limited, but also
264  /// includes some manipulation routines to support rewriting the uses of
265  /// partitions during SROA.
266  /// @{
267  typedef SmallVectorImpl<PartitionUse>::iterator use_iterator;
268  use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); }
269  use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); }
270  use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); }
271  use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); }
272
273  typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator;
274  const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); }
275  const_use_iterator use_begin(const_iterator I) const {
276    return Uses[I - begin()].begin();
277  }
278  const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); }
279  const_use_iterator use_end(const_iterator I) const {
280    return Uses[I - begin()].end();
281  }
282
283  unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); }
284  unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); }
285  const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const {
286    return Uses[PIdx][UIdx];
287  }
288  const PartitionUse &getUse(const_iterator I, unsigned UIdx) const {
289    return Uses[I - begin()][UIdx];
290  }
291
292  void use_push_back(unsigned Idx, const PartitionUse &PU) {
293    Uses[Idx].push_back(PU);
294  }
295  void use_push_back(const_iterator I, const PartitionUse &PU) {
296    Uses[I - begin()].push_back(PU);
297  }
298  /// @}
299
300  /// \brief Allow iterating the dead users for this alloca.
301  ///
302  /// These are instructions which will never actually use the alloca as they
303  /// are outside the allocated range. They are safe to replace with undef and
304  /// delete.
305  /// @{
306  typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator;
307  dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); }
308  dead_user_iterator dead_user_end() const { return DeadUsers.end(); }
309  /// @}
310
311  /// \brief Allow iterating the dead expressions referring to this alloca.
312  ///
313  /// These are operands which have cannot actually be used to refer to the
314  /// alloca as they are outside its range and the user doesn't correct for
315  /// that. These mostly consist of PHI node inputs and the like which we just
316  /// need to replace with undef.
317  /// @{
318  typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator;
319  dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); }
320  dead_op_iterator dead_op_end() const { return DeadOperands.end(); }
321  /// @}
322
323  /// \brief MemTransferInst auxiliary data.
324  /// This struct provides some auxiliary data about memory transfer
325  /// intrinsics such as memcpy and memmove. These intrinsics can use two
326  /// different ranges within the same alloca, and provide other challenges to
327  /// correctly represent. We stash extra data to help us untangle this
328  /// after the partitioning is complete.
329  struct MemTransferOffsets {
330    /// The destination begin and end offsets when the destination is within
331    /// this alloca. If the end offset is zero the destination is not within
332    /// this alloca.
333    uint64_t DestBegin, DestEnd;
334
335    /// The source begin and end offsets when the source is within this alloca.
336    /// If the end offset is zero, the source is not within this alloca.
337    uint64_t SourceBegin, SourceEnd;
338
339    /// Flag for whether an alloca is splittable.
340    bool IsSplittable;
341  };
342  MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const {
343    return MemTransferInstData.lookup(&II);
344  }
345
346  /// \brief Map from a PHI or select operand back to a partition.
347  ///
348  /// When manipulating PHI nodes or selects, they can use more than one
349  /// partition of an alloca. We store a special mapping to allow finding the
350  /// partition referenced by each of these operands, if any.
351  iterator findPartitionForPHIOrSelectOperand(Use *U) {
352    SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
353      = PHIOrSelectOpMap.find(U);
354    if (MapIt == PHIOrSelectOpMap.end())
355      return end();
356
357    return begin() + MapIt->second.first;
358  }
359
360  /// \brief Map from a PHI or select operand back to the specific use of
361  /// a partition.
362  ///
363  /// Similar to mapping these operands back to the partitions, this maps
364  /// directly to the use structure of that partition.
365  use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) {
366    SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
367      = PHIOrSelectOpMap.find(U);
368    assert(MapIt != PHIOrSelectOpMap.end());
369    return Uses[MapIt->second.first].begin() + MapIt->second.second;
370  }
371
372  /// \brief Compute a common type among the uses of a particular partition.
373  ///
374  /// This routines walks all of the uses of a particular partition and tries
375  /// to find a common type between them. Untyped operations such as memset and
376  /// memcpy are ignored.
377  Type *getCommonType(iterator I) const;
378
379#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
380  void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
381  void printUsers(raw_ostream &OS, const_iterator I,
382                  StringRef Indent = "  ") const;
383  void print(raw_ostream &OS) const;
384  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const;
385  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const;
386#endif
387
388private:
389  template <typename DerivedT, typename RetT = void> class BuilderBase;
390  class PartitionBuilder;
391  friend class AllocaPartitioning::PartitionBuilder;
392  class UseBuilder;
393  friend class AllocaPartitioning::UseBuilder;
394
395#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
396  /// \brief Handle to alloca instruction to simplify method interfaces.
397  AllocaInst &AI;
398#endif
399
400  /// \brief The instruction responsible for this alloca having no partitioning.
401  ///
402  /// When an instruction (potentially) escapes the pointer to the alloca, we
403  /// store a pointer to that here and abort trying to partition the alloca.
404  /// This will be null if the alloca is partitioned successfully.
405  Instruction *PointerEscapingInstr;
406
407  /// \brief The partitions of the alloca.
408  ///
409  /// We store a vector of the partitions over the alloca here. This vector is
410  /// sorted by increasing begin offset, and then by decreasing end offset. See
411  /// the Partition inner class for more details. Initially (during
412  /// construction) there are overlaps, but we form a disjoint sequence of
413  /// partitions while finishing construction and a fully constructed object is
414  /// expected to always have this as a disjoint space.
415  SmallVector<Partition, 8> Partitions;
416
417  /// \brief The uses of the partitions.
418  ///
419  /// This is essentially a mapping from each partition to a list of uses of
420  /// that partition. The mapping is done with a Uses vector that has the exact
421  /// same number of entries as the partition vector. Each entry is itself
422  /// a vector of the uses.
423  SmallVector<SmallVector<PartitionUse, 2>, 8> Uses;
424
425  /// \brief Instructions which will become dead if we rewrite the alloca.
426  ///
427  /// Note that these are not separated by partition. This is because we expect
428  /// a partitioned alloca to be completely rewritten or not rewritten at all.
429  /// If rewritten, all these instructions can simply be removed and replaced
430  /// with undef as they come from outside of the allocated space.
431  SmallVector<Instruction *, 8> DeadUsers;
432
433  /// \brief Operands which will become dead if we rewrite the alloca.
434  ///
435  /// These are operands that in their particular use can be replaced with
436  /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
437  /// to PHI nodes and the like. They aren't entirely dead (there might be
438  /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
439  /// want to swap this particular input for undef to simplify the use lists of
440  /// the alloca.
441  SmallVector<Use *, 8> DeadOperands;
442
443  /// \brief The underlying storage for auxiliary memcpy and memset info.
444  SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData;
445
446  /// \brief A side datastructure used when building up the partitions and uses.
447  ///
448  /// This mapping is only really used during the initial building of the
449  /// partitioning so that we can retain information about PHI and select nodes
450  /// processed.
451  SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes;
452
453  /// \brief Auxiliary information for particular PHI or select operands.
454  SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap;
455
456  /// \brief A utility routine called from the constructor.
457  ///
458  /// This does what it says on the tin. It is the key of the alloca partition
459  /// splitting and merging. After it is called we have the desired disjoint
460  /// collection of partitions.
461  void splitAndMergePartitions();
462};
463}
464
465static Value *foldSelectInst(SelectInst &SI) {
466  // If the condition being selected on is a constant or the same value is
467  // being selected between, fold the select. Yes this does (rarely) happen
468  // early on.
469  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
470    return SI.getOperand(1+CI->isZero());
471  if (SI.getOperand(1) == SI.getOperand(2))
472    return SI.getOperand(1);
473
474  return 0;
475}
476
477/// \brief Builder for the alloca partitioning.
478///
479/// This class builds an alloca partitioning by recursively visiting the uses
480/// of an alloca and splitting the partitions for each load and store at each
481/// offset.
482class AllocaPartitioning::PartitionBuilder
483    : public PtrUseVisitor<PartitionBuilder> {
484  friend class PtrUseVisitor<PartitionBuilder>;
485  friend class InstVisitor<PartitionBuilder>;
486  typedef PtrUseVisitor<PartitionBuilder> Base;
487
488  const uint64_t AllocSize;
489  AllocaPartitioning &P;
490
491  SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap;
492
493public:
494  PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P)
495      : PtrUseVisitor<PartitionBuilder>(DL),
496        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())),
497        P(P) {}
498
499private:
500  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
501                 bool IsSplittable = false) {
502    // Completely skip uses which have a zero size or start either before or
503    // past the end of the allocation.
504    if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) {
505      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
506                   << " which has zero size or starts outside of the "
507                   << AllocSize << " byte alloca:\n"
508                   << "    alloca: " << P.AI << "\n"
509                   << "       use: " << I << "\n");
510      return;
511    }
512
513    uint64_t BeginOffset = Offset.getZExtValue();
514    uint64_t EndOffset = BeginOffset + Size;
515
516    // Clamp the end offset to the end of the allocation. Note that this is
517    // formulated to handle even the case where "BeginOffset + Size" overflows.
518    // This may appear superficially to be something we could ignore entirely,
519    // but that is not so! There may be widened loads or PHI-node uses where
520    // some instructions are dead but not others. We can't completely ignore
521    // them, and so have to record at least the information here.
522    assert(AllocSize >= BeginOffset); // Established above.
523    if (Size > AllocSize - BeginOffset) {
524      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
525                   << " to remain within the " << AllocSize << " byte alloca:\n"
526                   << "    alloca: " << P.AI << "\n"
527                   << "       use: " << I << "\n");
528      EndOffset = AllocSize;
529    }
530
531    Partition New(BeginOffset, EndOffset, IsSplittable);
532    P.Partitions.push_back(New);
533  }
534
535  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
536                         uint64_t Size, bool IsVolatile) {
537    // We allow splitting of loads and stores where the type is an integer type
538    // and cover the entire alloca. This prevents us from splitting over
539    // eagerly.
540    // FIXME: In the great blue eventually, we should eagerly split all integer
541    // loads and stores, and then have a separate step that merges adjacent
542    // alloca partitions into a single partition suitable for integer widening.
543    // Or we should skip the merge step and rely on GVN and other passes to
544    // merge adjacent loads and stores that survive mem2reg.
545    bool IsSplittable =
546        Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;
547
548    insertUse(I, Offset, Size, IsSplittable);
549  }
550
551  void visitLoadInst(LoadInst &LI) {
552    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
553           "All simple FCA loads should have been pre-split");
554
555    if (!IsOffsetKnown)
556      return PI.setAborted(&LI);
557
558    uint64_t Size = DL.getTypeStoreSize(LI.getType());
559    return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
560  }
561
562  void visitStoreInst(StoreInst &SI) {
563    Value *ValOp = SI.getValueOperand();
564    if (ValOp == *U)
565      return PI.setEscapedAndAborted(&SI);
566    if (!IsOffsetKnown)
567      return PI.setAborted(&SI);
568
569    uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
570
571    // If this memory access can be shown to *statically* extend outside the
572    // bounds of of the allocation, it's behavior is undefined, so simply
573    // ignore it. Note that this is more strict than the generic clamping
574    // behavior of insertUse. We also try to handle cases which might run the
575    // risk of overflow.
576    // FIXME: We should instead consider the pointer to have escaped if this
577    // function is being instrumented for addressing bugs or race conditions.
578    if (Offset.isNegative() || Size > AllocSize ||
579        Offset.ugt(AllocSize - Size)) {
580      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
581                   << " which extends past the end of the " << AllocSize
582                   << " byte alloca:\n"
583                   << "    alloca: " << P.AI << "\n"
584                   << "       use: " << SI << "\n");
585      return;
586    }
587
588    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
589           "All simple FCA stores should have been pre-split");
590    handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
591  }
592
593
594  void visitMemSetInst(MemSetInst &II) {
595    assert(II.getRawDest() == *U && "Pointer use is not the destination?");
596    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
597    if ((Length && Length->getValue() == 0) ||
598        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
599      // Zero-length mem transfer intrinsics can be ignored entirely.
600      return;
601
602    if (!IsOffsetKnown)
603      return PI.setAborted(&II);
604
605    insertUse(II, Offset,
606              Length ? Length->getLimitedValue()
607                     : AllocSize - Offset.getLimitedValue(),
608              (bool)Length);
609  }
610
611  void visitMemTransferInst(MemTransferInst &II) {
612    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
613    if ((Length && Length->getValue() == 0) ||
614        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
615      // Zero-length mem transfer intrinsics can be ignored entirely.
616      return;
617
618    if (!IsOffsetKnown)
619      return PI.setAborted(&II);
620
621    uint64_t RawOffset = Offset.getLimitedValue();
622    uint64_t Size = Length ? Length->getLimitedValue()
623                           : AllocSize - RawOffset;
624
625    MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
626
627    // Only intrinsics with a constant length can be split.
628    Offsets.IsSplittable = Length;
629
630    if (*U == II.getRawDest()) {
631      Offsets.DestBegin = RawOffset;
632      Offsets.DestEnd = RawOffset + Size;
633    }
634    if (*U == II.getRawSource()) {
635      Offsets.SourceBegin = RawOffset;
636      Offsets.SourceEnd = RawOffset + Size;
637    }
638
639    // If we have set up end offsets for both the source and the destination,
640    // we have found both sides of this transfer pointing at the same alloca.
641    bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd;
642    if (SeenBothEnds && II.getRawDest() != II.getRawSource()) {
643      unsigned PrevIdx = MemTransferPartitionMap[&II];
644
645      // Check if the begin offsets match and this is a non-volatile transfer.
646      // In that case, we can completely elide the transfer.
647      if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) {
648        P.Partitions[PrevIdx].kill();
649        return;
650      }
651
652      // Otherwise we have an offset transfer within the same alloca. We can't
653      // split those.
654      P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false;
655    } else if (SeenBothEnds) {
656      // Handle the case where this exact use provides both ends of the
657      // operation.
658      assert(II.getRawDest() == II.getRawSource());
659
660      // For non-volatile transfers this is a no-op.
661      if (!II.isVolatile())
662        return;
663
664      // Otherwise just suppress splitting.
665      Offsets.IsSplittable = false;
666    }
667
668
669    // Insert the use now that we've fixed up the splittable nature.
670    insertUse(II, Offset, Size, Offsets.IsSplittable);
671
672    // Setup the mapping from intrinsic to partition of we've not seen both
673    // ends of this transfer.
674    if (!SeenBothEnds) {
675      unsigned NewIdx = P.Partitions.size() - 1;
676      bool Inserted
677        = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second;
678      assert(Inserted &&
679             "Already have intrinsic in map but haven't seen both ends");
680      (void)Inserted;
681    }
682  }
683
684  // Disable SRoA for any intrinsics except for lifetime invariants.
685  // FIXME: What about debug intrinsics? This matches old behavior, but
686  // doesn't make sense.
687  void visitIntrinsicInst(IntrinsicInst &II) {
688    if (!IsOffsetKnown)
689      return PI.setAborted(&II);
690
691    if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
692        II.getIntrinsicID() == Intrinsic::lifetime_end) {
693      ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
694      uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
695                               Length->getLimitedValue());
696      insertUse(II, Offset, Size, true);
697      return;
698    }
699
700    Base::visitIntrinsicInst(II);
701  }
702
703  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
704    // We consider any PHI or select that results in a direct load or store of
705    // the same offset to be a viable use for partitioning purposes. These uses
706    // are considered unsplittable and the size is the maximum loaded or stored
707    // size.
708    SmallPtrSet<Instruction *, 4> Visited;
709    SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
710    Visited.insert(Root);
711    Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
712    // If there are no loads or stores, the access is dead. We mark that as
713    // a size zero access.
714    Size = 0;
715    do {
716      Instruction *I, *UsedI;
717      llvm::tie(UsedI, I) = Uses.pop_back_val();
718
719      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
720        Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
721        continue;
722      }
723      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
724        Value *Op = SI->getOperand(0);
725        if (Op == UsedI)
726          return SI;
727        Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
728        continue;
729      }
730
731      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
732        if (!GEP->hasAllZeroIndices())
733          return GEP;
734      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
735                 !isa<SelectInst>(I)) {
736        return I;
737      }
738
739      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
740           ++UI)
741        if (Visited.insert(cast<Instruction>(*UI)))
742          Uses.push_back(std::make_pair(I, cast<Instruction>(*UI)));
743    } while (!Uses.empty());
744
745    return 0;
746  }
747
748  void visitPHINode(PHINode &PN) {
749    if (PN.use_empty())
750      return;
751    if (!IsOffsetKnown)
752      return PI.setAborted(&PN);
753
754    // See if we already have computed info on this node.
755    std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
756    if (PHIInfo.first) {
757      PHIInfo.second = true;
758      insertUse(PN, Offset, PHIInfo.first);
759      return;
760    }
761
762    // Check for an unsafe use of the PHI node.
763    if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
764      return PI.setAborted(UnsafeI);
765
766    insertUse(PN, Offset, PHIInfo.first);
767  }
768
769  void visitSelectInst(SelectInst &SI) {
770    if (SI.use_empty())
771      return;
772    if (Value *Result = foldSelectInst(SI)) {
773      if (Result == *U)
774        // If the result of the constant fold will be the pointer, recurse
775        // through the select as if we had RAUW'ed it.
776        enqueueUsers(SI);
777
778      return;
779    }
780    if (!IsOffsetKnown)
781      return PI.setAborted(&SI);
782
783    // See if we already have computed info on this node.
784    std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
785    if (SelectInfo.first) {
786      SelectInfo.second = true;
787      insertUse(SI, Offset, SelectInfo.first);
788      return;
789    }
790
791    // Check for an unsafe use of the PHI node.
792    if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
793      return PI.setAborted(UnsafeI);
794
795    insertUse(SI, Offset, SelectInfo.first);
796  }
797
798  /// \brief Disable SROA entirely if there are unhandled users of the alloca.
799  void visitInstruction(Instruction &I) {
800    PI.setAborted(&I);
801  }
802};
803
804/// \brief Use adder for the alloca partitioning.
805///
806/// This class adds the uses of an alloca to all of the partitions which they
807/// use. For splittable partitions, this can end up doing essentially a linear
808/// walk of the partitions, but the number of steps remains bounded by the
809/// total result instruction size:
810/// - The number of partitions is a result of the number unsplittable
811///   instructions using the alloca.
812/// - The number of users of each partition is at worst the total number of
813///   splittable instructions using the alloca.
814/// Thus we will produce N * M instructions in the end, where N are the number
815/// of unsplittable uses and M are the number of splittable. This visitor does
816/// the exact same number of updates to the partitioning.
817///
818/// In the more common case, this visitor will leverage the fact that the
819/// partition space is pre-sorted, and do a logarithmic search for the
820/// partition needed, making the total visit a classical ((N + M) * log(N))
821/// complexity operation.
822class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> {
823  friend class PtrUseVisitor<UseBuilder>;
824  friend class InstVisitor<UseBuilder>;
825  typedef PtrUseVisitor<UseBuilder> Base;
826
827  const uint64_t AllocSize;
828  AllocaPartitioning &P;
829
830  /// \brief Set to de-duplicate dead instructions found in the use walk.
831  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
832
833public:
834  UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
835      : PtrUseVisitor<UseBuilder>(TD),
836        AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
837        P(P) {}
838
839private:
840  void markAsDead(Instruction &I) {
841    if (VisitedDeadInsts.insert(&I))
842      P.DeadUsers.push_back(&I);
843  }
844
845  void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) {
846    // If the use has a zero size or extends outside of the allocation, record
847    // it as a dead use for elimination later.
848    if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize))
849      return markAsDead(User);
850
851    uint64_t BeginOffset = Offset.getZExtValue();
852    uint64_t EndOffset = BeginOffset + Size;
853
854    // Clamp the end offset to the end of the allocation. Note that this is
855    // formulated to handle even the case where "BeginOffset + Size" overflows.
856    assert(AllocSize >= BeginOffset); // Established above.
857    if (Size > AllocSize - BeginOffset)
858      EndOffset = AllocSize;
859
860    // NB: This only works if we have zero overlapping partitions.
861    iterator I = std::lower_bound(P.begin(), P.end(), BeginOffset);
862    if (I != P.begin() && llvm::prior(I)->EndOffset > BeginOffset)
863      I = llvm::prior(I);
864    iterator E = P.end();
865    bool IsSplit = llvm::next(I) != E && llvm::next(I)->BeginOffset < EndOffset;
866    for (; I != E && I->BeginOffset < EndOffset; ++I) {
867      PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset),
868                         std::min(I->EndOffset, EndOffset), U, IsSplit);
869      P.use_push_back(I, NewPU);
870      if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser()))
871        P.PHIOrSelectOpMap[U]
872          = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1);
873    }
874  }
875
876  void visitBitCastInst(BitCastInst &BC) {
877    if (BC.use_empty())
878      return markAsDead(BC);
879
880    return Base::visitBitCastInst(BC);
881  }
882
883  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
884    if (GEPI.use_empty())
885      return markAsDead(GEPI);
886
887    return Base::visitGetElementPtrInst(GEPI);
888  }
889
890  void visitLoadInst(LoadInst &LI) {
891    assert(IsOffsetKnown);
892    uint64_t Size = DL.getTypeStoreSize(LI.getType());
893    insertUse(LI, Offset, Size);
894  }
895
896  void visitStoreInst(StoreInst &SI) {
897    assert(IsOffsetKnown);
898    uint64_t Size = DL.getTypeStoreSize(SI.getOperand(0)->getType());
899
900    // If this memory access can be shown to *statically* extend outside the
901    // bounds of of the allocation, it's behavior is undefined, so simply
902    // ignore it. Note that this is more strict than the generic clamping
903    // behavior of insertUse.
904    if (Offset.isNegative() || Size > AllocSize ||
905        Offset.ugt(AllocSize - Size))
906      return markAsDead(SI);
907
908    insertUse(SI, Offset, Size);
909  }
910
911  void visitMemSetInst(MemSetInst &II) {
912    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
913    if ((Length && Length->getValue() == 0) ||
914        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
915      return markAsDead(II);
916
917    assert(IsOffsetKnown);
918    insertUse(II, Offset, Length ? Length->getLimitedValue()
919                                 : AllocSize - Offset.getLimitedValue());
920  }
921
922  void visitMemTransferInst(MemTransferInst &II) {
923    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
924    if ((Length && Length->getValue() == 0) ||
925        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
926      return markAsDead(II);
927
928    assert(IsOffsetKnown);
929    uint64_t Size = Length ? Length->getLimitedValue()
930                           : AllocSize - Offset.getLimitedValue();
931
932    const MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
933    if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd &&
934        Offsets.DestBegin == Offsets.SourceBegin)
935      return markAsDead(II); // Skip identity transfers without side-effects.
936
937    insertUse(II, Offset, Size);
938  }
939
940  void visitIntrinsicInst(IntrinsicInst &II) {
941    assert(IsOffsetKnown);
942    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
943           II.getIntrinsicID() == Intrinsic::lifetime_end);
944
945    ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
946    insertUse(II, Offset, std::min(Length->getLimitedValue(),
947                                   AllocSize - Offset.getLimitedValue()));
948  }
949
950  void insertPHIOrSelect(Instruction &User, const APInt &Offset) {
951    uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first;
952
953    // For PHI and select operands outside the alloca, we can't nuke the entire
954    // phi or select -- the other side might still be relevant, so we special
955    // case them here and use a separate structure to track the operands
956    // themselves which should be replaced with undef.
957    if ((Offset.isNegative() && Offset.uge(Size)) ||
958        (!Offset.isNegative() && Offset.uge(AllocSize))) {
959      P.DeadOperands.push_back(U);
960      return;
961    }
962
963    insertUse(User, Offset, Size);
964  }
965
966  void visitPHINode(PHINode &PN) {
967    if (PN.use_empty())
968      return markAsDead(PN);
969
970    assert(IsOffsetKnown);
971    insertPHIOrSelect(PN, Offset);
972  }
973
974  void visitSelectInst(SelectInst &SI) {
975    if (SI.use_empty())
976      return markAsDead(SI);
977
978    if (Value *Result = foldSelectInst(SI)) {
979      if (Result == *U)
980        // If the result of the constant fold will be the pointer, recurse
981        // through the select as if we had RAUW'ed it.
982        enqueueUsers(SI);
983      else
984        // Otherwise the operand to the select is dead, and we can replace it
985        // with undef.
986        P.DeadOperands.push_back(U);
987
988      return;
989    }
990
991    assert(IsOffsetKnown);
992    insertPHIOrSelect(SI, Offset);
993  }
994
995  /// \brief Unreachable, we've already visited the alloca once.
996  void visitInstruction(Instruction &I) {
997    llvm_unreachable("Unhandled instruction in use builder.");
998  }
999};
1000
1001void AllocaPartitioning::splitAndMergePartitions() {
1002  size_t NumDeadPartitions = 0;
1003
1004  // Track the range of splittable partitions that we pass when accumulating
1005  // overlapping unsplittable partitions.
1006  uint64_t SplitEndOffset = 0ull;
1007
1008  Partition New(0ull, 0ull, false);
1009
1010  for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) {
1011    ++j;
1012
1013    if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) {
1014      assert(New.BeginOffset == New.EndOffset);
1015      New = Partitions[i];
1016    } else {
1017      assert(New.IsSplittable);
1018      New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset);
1019    }
1020    assert(New.BeginOffset != New.EndOffset);
1021
1022    // Scan the overlapping partitions.
1023    while (j != e && New.EndOffset > Partitions[j].BeginOffset) {
1024      // If the new partition we are forming is splittable, stop at the first
1025      // unsplittable partition.
1026      if (New.IsSplittable && !Partitions[j].IsSplittable)
1027        break;
1028
1029      // Grow the new partition to include any equally splittable range. 'j' is
1030      // always equally splittable when New is splittable, but when New is not
1031      // splittable, we may subsume some (or part of some) splitable partition
1032      // without growing the new one.
1033      if (New.IsSplittable == Partitions[j].IsSplittable) {
1034        New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset);
1035      } else {
1036        assert(!New.IsSplittable);
1037        assert(Partitions[j].IsSplittable);
1038        SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset);
1039      }
1040
1041      Partitions[j].kill();
1042      ++NumDeadPartitions;
1043      ++j;
1044    }
1045
1046    // If the new partition is splittable, chop off the end as soon as the
1047    // unsplittable subsequent partition starts and ensure we eventually cover
1048    // the splittable area.
1049    if (j != e && New.IsSplittable) {
1050      SplitEndOffset = std::max(SplitEndOffset, New.EndOffset);
1051      New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
1052    }
1053
1054    // Add the new partition if it differs from the original one and is
1055    // non-empty. We can end up with an empty partition here if it was
1056    // splittable but there is an unsplittable one that starts at the same
1057    // offset.
1058    if (New != Partitions[i]) {
1059      if (New.BeginOffset != New.EndOffset)
1060        Partitions.push_back(New);
1061      // Mark the old one for removal.
1062      Partitions[i].kill();
1063      ++NumDeadPartitions;
1064    }
1065
1066    New.BeginOffset = New.EndOffset;
1067    if (!New.IsSplittable) {
1068      New.EndOffset = std::max(New.EndOffset, SplitEndOffset);
1069      if (j != e && !Partitions[j].IsSplittable)
1070        New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
1071      New.IsSplittable = true;
1072      // If there is a trailing splittable partition which won't be fused into
1073      // the next splittable partition go ahead and add it onto the partitions
1074      // list.
1075      if (New.BeginOffset < New.EndOffset &&
1076          (j == e || !Partitions[j].IsSplittable ||
1077           New.EndOffset < Partitions[j].BeginOffset)) {
1078        Partitions.push_back(New);
1079        New.BeginOffset = New.EndOffset = 0ull;
1080      }
1081    }
1082  }
1083
1084  // Re-sort the partitions now that they have been split and merged into
1085  // disjoint set of partitions. Also remove any of the dead partitions we've
1086  // replaced in the process.
1087  std::sort(Partitions.begin(), Partitions.end());
1088  if (NumDeadPartitions) {
1089    assert(Partitions.back().isDead());
1090    assert((ptrdiff_t)NumDeadPartitions ==
1091           std::count(Partitions.begin(), Partitions.end(), Partitions.back()));
1092  }
1093  Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end());
1094}
1095
1096AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
1097    :
1098#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1099      AI(AI),
1100#endif
1101      PointerEscapingInstr(0) {
1102  PartitionBuilder PB(TD, AI, *this);
1103  PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1104  if (PtrI.isEscaped() || PtrI.isAborted()) {
1105    // FIXME: We should sink the escape vs. abort info into the caller nicely,
1106    // possibly by just storing the PtrInfo in the AllocaPartitioning.
1107    PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1108                                                  : PtrI.getAbortingInst();
1109    assert(PointerEscapingInstr && "Did not track a bad instruction");
1110    return;
1111  }
1112
1113  // Sort the uses. This arranges for the offsets to be in ascending order,
1114  // and the sizes to be in descending order.
1115  std::sort(Partitions.begin(), Partitions.end());
1116
1117  // Remove any partitions from the back which are marked as dead.
1118  while (!Partitions.empty() && Partitions.back().isDead())
1119    Partitions.pop_back();
1120
1121  if (Partitions.size() > 1) {
1122    // Intersect splittability for all partitions with equal offsets and sizes.
1123    // Then remove all but the first so that we have a sequence of non-equal but
1124    // potentially overlapping partitions.
1125    for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E;
1126         I = J) {
1127      ++J;
1128      while (J != E && *I == *J) {
1129        I->IsSplittable &= J->IsSplittable;
1130        ++J;
1131      }
1132    }
1133    Partitions.erase(std::unique(Partitions.begin(), Partitions.end()),
1134                     Partitions.end());
1135
1136    // Split splittable and merge unsplittable partitions into a disjoint set
1137    // of partitions over the used space of the allocation.
1138    splitAndMergePartitions();
1139  }
1140
1141  // Record how many partitions we end up with.
1142  NumAllocaPartitions += Partitions.size();
1143  MaxPartitionsPerAlloca = std::max<unsigned>(Partitions.size(), MaxPartitionsPerAlloca);
1144
1145  // Now build up the user lists for each of these disjoint partitions by
1146  // re-walking the recursive users of the alloca.
1147  Uses.resize(Partitions.size());
1148  UseBuilder UB(TD, AI, *this);
1149  PtrI = UB.visitPtr(AI);
1150  assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!");
1151  assert(!PtrI.isAborted() && "Early aborted the visit of the pointer.");
1152
1153  unsigned NumUses = 0;
1154#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS)
1155  for (unsigned Idx = 0, Size = Uses.size(); Idx != Size; ++Idx)
1156    NumUses += Uses[Idx].size();
1157#endif
1158  NumAllocaPartitionUses += NumUses;
1159  MaxPartitionUsesPerAlloca = std::max<unsigned>(NumUses, MaxPartitionUsesPerAlloca);
1160}
1161
1162Type *AllocaPartitioning::getCommonType(iterator I) const {
1163  Type *Ty = 0;
1164  for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
1165    Use *U = UI->getUse();
1166    if (!U)
1167      continue; // Skip dead uses.
1168    if (isa<IntrinsicInst>(*U->getUser()))
1169      continue;
1170    if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
1171      continue;
1172
1173    Type *UserTy = 0;
1174    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser()))
1175      UserTy = LI->getType();
1176    else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser()))
1177      UserTy = SI->getValueOperand()->getType();
1178    else
1179      return 0; // Bail if we have weird uses.
1180
1181    if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) {
1182      // If the type is larger than the partition, skip it. We only encounter
1183      // this for split integer operations where we want to use the type of the
1184      // entity causing the split.
1185      if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8)
1186        continue;
1187
1188      // If we have found an integer type use covering the alloca, use that
1189      // regardless of the other types, as integers are often used for a "bucket
1190      // of bits" type.
1191      return ITy;
1192    }
1193
1194    if (Ty && Ty != UserTy)
1195      return 0;
1196
1197    Ty = UserTy;
1198  }
1199  return Ty;
1200}
1201
1202#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1203
1204void AllocaPartitioning::print(raw_ostream &OS, const_iterator I,
1205                               StringRef Indent) const {
1206  OS << Indent << "partition #" << (I - begin())
1207     << " [" << I->BeginOffset << "," << I->EndOffset << ")"
1208     << (I->IsSplittable ? " (splittable)" : "")
1209     << (Uses[I - begin()].empty() ? " (zero uses)" : "")
1210     << "\n";
1211}
1212
1213void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I,
1214                                    StringRef Indent) const {
1215  for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
1216    if (!UI->getUse())
1217      continue; // Skip dead uses.
1218    OS << Indent << "  [" << UI->BeginOffset << "," << UI->EndOffset << ") "
1219       << "used by: " << *UI->getUse()->getUser() << "\n";
1220    if (MemTransferInst *II =
1221            dyn_cast<MemTransferInst>(UI->getUse()->getUser())) {
1222      const MemTransferOffsets &MTO = MemTransferInstData.lookup(II);
1223      bool IsDest;
1224      if (!MTO.IsSplittable)
1225        IsDest = UI->BeginOffset == MTO.DestBegin;
1226      else
1227        IsDest = MTO.DestBegin != 0u;
1228      OS << Indent << "    (original " << (IsDest ? "dest" : "source") << ": "
1229         << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin)
1230         << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n";
1231    }
1232  }
1233}
1234
1235void AllocaPartitioning::print(raw_ostream &OS) const {
1236  if (PointerEscapingInstr) {
1237    OS << "No partitioning for alloca: " << AI << "\n"
1238       << "  A pointer to this alloca escaped by:\n"
1239       << "  " << *PointerEscapingInstr << "\n";
1240    return;
1241  }
1242
1243  OS << "Partitioning of alloca: " << AI << "\n";
1244  for (const_iterator I = begin(), E = end(); I != E; ++I) {
1245    print(OS, I);
1246    printUsers(OS, I);
1247  }
1248}
1249
1250void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); }
1251void AllocaPartitioning::dump() const { print(dbgs()); }
1252
1253#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1254
1255
1256namespace {
1257/// \brief Implementation of LoadAndStorePromoter for promoting allocas.
1258///
1259/// This subclass of LoadAndStorePromoter adds overrides to handle promoting
1260/// the loads and stores of an alloca instruction, as well as updating its
1261/// debug information. This is used when a domtree is unavailable and thus
1262/// mem2reg in its full form can't be used to handle promotion of allocas to
1263/// scalar values.
1264class AllocaPromoter : public LoadAndStorePromoter {
1265  AllocaInst &AI;
1266  DIBuilder &DIB;
1267
1268  SmallVector<DbgDeclareInst *, 4> DDIs;
1269  SmallVector<DbgValueInst *, 4> DVIs;
1270
1271public:
1272  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
1273                 AllocaInst &AI, DIBuilder &DIB)
1274    : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
1275
1276  void run(const SmallVectorImpl<Instruction*> &Insts) {
1277    // Remember which alloca we're promoting (for isInstInList).
1278    if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) {
1279      for (Value::use_iterator UI = DebugNode->use_begin(),
1280                               UE = DebugNode->use_end();
1281           UI != UE; ++UI)
1282        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
1283          DDIs.push_back(DDI);
1284        else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI))
1285          DVIs.push_back(DVI);
1286    }
1287
1288    LoadAndStorePromoter::run(Insts);
1289    AI.eraseFromParent();
1290    while (!DDIs.empty())
1291      DDIs.pop_back_val()->eraseFromParent();
1292    while (!DVIs.empty())
1293      DVIs.pop_back_val()->eraseFromParent();
1294  }
1295
1296  virtual bool isInstInList(Instruction *I,
1297                            const SmallVectorImpl<Instruction*> &Insts) const {
1298    if (LoadInst *LI = dyn_cast<LoadInst>(I))
1299      return LI->getOperand(0) == &AI;
1300    return cast<StoreInst>(I)->getPointerOperand() == &AI;
1301  }
1302
1303  virtual void updateDebugInfo(Instruction *Inst) const {
1304    for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(),
1305           E = DDIs.end(); I != E; ++I) {
1306      DbgDeclareInst *DDI = *I;
1307      if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
1308        ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
1309      else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
1310        ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
1311    }
1312    for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(),
1313           E = DVIs.end(); I != E; ++I) {
1314      DbgValueInst *DVI = *I;
1315      Value *Arg = 0;
1316      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1317        // If an argument is zero extended then use argument directly. The ZExt
1318        // may be zapped by an optimization pass in future.
1319        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
1320          Arg = dyn_cast<Argument>(ZExt->getOperand(0));
1321        else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
1322          Arg = dyn_cast<Argument>(SExt->getOperand(0));
1323        if (!Arg)
1324          Arg = SI->getValueOperand();
1325      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1326        Arg = LI->getPointerOperand();
1327      } else {
1328        continue;
1329      }
1330      Instruction *DbgVal =
1331        DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
1332                                     Inst);
1333      DbgVal->setDebugLoc(DVI->getDebugLoc());
1334    }
1335  }
1336};
1337} // end anon namespace
1338
1339
1340namespace {
1341/// \brief An optimization pass providing Scalar Replacement of Aggregates.
1342///
1343/// This pass takes allocations which can be completely analyzed (that is, they
1344/// don't escape) and tries to turn them into scalar SSA values. There are
1345/// a few steps to this process.
1346///
1347/// 1) It takes allocations of aggregates and analyzes the ways in which they
1348///    are used to try to split them into smaller allocations, ideally of
1349///    a single scalar data type. It will split up memcpy and memset accesses
1350///    as necessary and try to isolate individual scalar accesses.
1351/// 2) It will transform accesses into forms which are suitable for SSA value
1352///    promotion. This can be replacing a memset with a scalar store of an
1353///    integer value, or it can involve speculating operations on a PHI or
1354///    select to be a PHI or select of the results.
1355/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
1356///    onto insert and extract operations on a vector value, and convert them to
1357///    this form. By doing so, it will enable promotion of vector aggregates to
1358///    SSA vector values.
1359class SROA : public FunctionPass {
1360  const bool RequiresDomTree;
1361
1362  LLVMContext *C;
1363  const DataLayout *TD;
1364  DominatorTree *DT;
1365
1366  /// \brief Worklist of alloca instructions to simplify.
1367  ///
1368  /// Each alloca in the function is added to this. Each new alloca formed gets
1369  /// added to it as well to recursively simplify unless that alloca can be
1370  /// directly promoted. Finally, each time we rewrite a use of an alloca other
1371  /// the one being actively rewritten, we add it back onto the list if not
1372  /// already present to ensure it is re-visited.
1373  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist;
1374
1375  /// \brief A collection of instructions to delete.
1376  /// We try to batch deletions to simplify code and make things a bit more
1377  /// efficient.
1378  SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts;
1379
1380  /// \brief Post-promotion worklist.
1381  ///
1382  /// Sometimes we discover an alloca which has a high probability of becoming
1383  /// viable for SROA after a round of promotion takes place. In those cases,
1384  /// the alloca is enqueued here for re-processing.
1385  ///
1386  /// Note that we have to be very careful to clear allocas out of this list in
1387  /// the event they are deleted.
1388  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist;
1389
1390  /// \brief A collection of alloca instructions we can directly promote.
1391  std::vector<AllocaInst *> PromotableAllocas;
1392
1393public:
1394  SROA(bool RequiresDomTree = true)
1395      : FunctionPass(ID), RequiresDomTree(RequiresDomTree),
1396        C(0), TD(0), DT(0) {
1397    initializeSROAPass(*PassRegistry::getPassRegistry());
1398  }
1399  bool runOnFunction(Function &F);
1400  void getAnalysisUsage(AnalysisUsage &AU) const;
1401
1402  const char *getPassName() const { return "SROA"; }
1403  static char ID;
1404
1405private:
1406  friend class PHIOrSelectSpeculator;
1407  friend class AllocaPartitionRewriter;
1408  friend class AllocaPartitionVectorRewriter;
1409
1410  bool rewriteAllocaPartition(AllocaInst &AI,
1411                              AllocaPartitioning &P,
1412                              AllocaPartitioning::iterator PI);
1413  bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P);
1414  bool runOnAlloca(AllocaInst &AI);
1415  void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas);
1416  bool promoteAllocas(Function &F);
1417};
1418}
1419
1420char SROA::ID = 0;
1421
1422FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
1423  return new SROA(RequiresDomTree);
1424}
1425
1426INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
1427                      false, false)
1428INITIALIZE_PASS_DEPENDENCY(DominatorTree)
1429INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
1430                    false, false)
1431
1432namespace {
1433/// \brief Visitor to speculate PHIs and Selects where possible.
1434class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> {
1435  // Befriend the base class so it can delegate to private visit methods.
1436  friend class llvm::InstVisitor<PHIOrSelectSpeculator>;
1437
1438  const DataLayout &TD;
1439  AllocaPartitioning &P;
1440  SROA &Pass;
1441
1442public:
1443  PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass)
1444    : TD(TD), P(P), Pass(Pass) {}
1445
1446  /// \brief Visit the users of an alloca partition and rewrite them.
1447  void visitUsers(AllocaPartitioning::const_iterator PI) {
1448    // Note that we need to use an index here as the underlying vector of uses
1449    // may be grown during speculation. However, we never need to re-visit the
1450    // new uses, and so we can use the initial size bound.
1451    for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) {
1452      const PartitionUse &PU = P.getUse(PI, Idx);
1453      if (!PU.getUse())
1454        continue; // Skip dead use.
1455
1456      visit(cast<Instruction>(PU.getUse()->getUser()));
1457    }
1458  }
1459
1460private:
1461  // By default, skip this instruction.
1462  void visitInstruction(Instruction &I) {}
1463
1464  /// PHI instructions that use an alloca and are subsequently loaded can be
1465  /// rewritten to load both input pointers in the pred blocks and then PHI the
1466  /// results, allowing the load of the alloca to be promoted.
1467  /// From this:
1468  ///   %P2 = phi [i32* %Alloca, i32* %Other]
1469  ///   %V = load i32* %P2
1470  /// to:
1471  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
1472  ///   ...
1473  ///   %V2 = load i32* %Other
1474  ///   ...
1475  ///   %V = phi [i32 %V1, i32 %V2]
1476  ///
1477  /// We can do this to a select if its only uses are loads and if the operands
1478  /// to the select can be loaded unconditionally.
1479  ///
1480  /// FIXME: This should be hoisted into a generic utility, likely in
1481  /// Transforms/Util/Local.h
1482  bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
1483    // For now, we can only do this promotion if the load is in the same block
1484    // as the PHI, and if there are no stores between the phi and load.
1485    // TODO: Allow recursive phi users.
1486    // TODO: Allow stores.
1487    BasicBlock *BB = PN.getParent();
1488    unsigned MaxAlign = 0;
1489    for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
1490         UI != UE; ++UI) {
1491      LoadInst *LI = dyn_cast<LoadInst>(*UI);
1492      if (LI == 0 || !LI->isSimple()) return false;
1493
1494      // For now we only allow loads in the same block as the PHI.  This is
1495      // a common case that happens when instcombine merges two loads through
1496      // a PHI.
1497      if (LI->getParent() != BB) return false;
1498
1499      // Ensure that there are no instructions between the PHI and the load that
1500      // could store.
1501      for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
1502        if (BBI->mayWriteToMemory())
1503          return false;
1504
1505      MaxAlign = std::max(MaxAlign, LI->getAlignment());
1506      Loads.push_back(LI);
1507    }
1508
1509    // We can only transform this if it is safe to push the loads into the
1510    // predecessor blocks. The only thing to watch out for is that we can't put
1511    // a possibly trapping load in the predecessor if it is a critical edge.
1512    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1513      TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
1514      Value *InVal = PN.getIncomingValue(Idx);
1515
1516      // If the value is produced by the terminator of the predecessor (an
1517      // invoke) or it has side-effects, there is no valid place to put a load
1518      // in the predecessor.
1519      if (TI == InVal || TI->mayHaveSideEffects())
1520        return false;
1521
1522      // If the predecessor has a single successor, then the edge isn't
1523      // critical.
1524      if (TI->getNumSuccessors() == 1)
1525        continue;
1526
1527      // If this pointer is always safe to load, or if we can prove that there
1528      // is already a load in the block, then we can move the load to the pred
1529      // block.
1530      if (InVal->isDereferenceablePointer() ||
1531          isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
1532        continue;
1533
1534      return false;
1535    }
1536
1537    return true;
1538  }
1539
1540  void visitPHINode(PHINode &PN) {
1541    DEBUG(dbgs() << "    original: " << PN << "\n");
1542
1543    SmallVector<LoadInst *, 4> Loads;
1544    if (!isSafePHIToSpeculate(PN, Loads))
1545      return;
1546
1547    assert(!Loads.empty());
1548
1549    Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
1550    IRBuilderTy PHIBuilder(&PN);
1551    PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1552                                          PN.getName() + ".sroa.speculated");
1553
1554    // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
1555    // matter which one we get and if any differ.
1556    LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
1557    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
1558    unsigned Align = SomeLoad->getAlignment();
1559
1560    // Rewrite all loads of the PN to use the new PHI.
1561    do {
1562      LoadInst *LI = Loads.pop_back_val();
1563      LI->replaceAllUsesWith(NewPN);
1564      Pass.DeadInsts.insert(LI);
1565    } while (!Loads.empty());
1566
1567    // Inject loads into all of the pred blocks.
1568    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1569      BasicBlock *Pred = PN.getIncomingBlock(Idx);
1570      TerminatorInst *TI = Pred->getTerminator();
1571      Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx));
1572      Value *InVal = PN.getIncomingValue(Idx);
1573      IRBuilderTy PredBuilder(TI);
1574
1575      LoadInst *Load
1576        = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." +
1577                                         Pred->getName()));
1578      ++NumLoadsSpeculated;
1579      Load->setAlignment(Align);
1580      if (TBAATag)
1581        Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
1582      NewPN->addIncoming(Load, Pred);
1583
1584      Instruction *Ptr = dyn_cast<Instruction>(InVal);
1585      if (!Ptr)
1586        // No uses to rewrite.
1587        continue;
1588
1589      // Try to lookup and rewrite any partition uses corresponding to this phi
1590      // input.
1591      AllocaPartitioning::iterator PI
1592        = P.findPartitionForPHIOrSelectOperand(InUse);
1593      if (PI == P.end())
1594        continue;
1595
1596      // Replace the Use in the PartitionUse for this operand with the Use
1597      // inside the load.
1598      AllocaPartitioning::use_iterator UI
1599        = P.findPartitionUseForPHIOrSelectOperand(InUse);
1600      assert(isa<PHINode>(*UI->getUse()->getUser()));
1601      UI->setUse(&Load->getOperandUse(Load->getPointerOperandIndex()));
1602    }
1603    DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
1604  }
1605
1606  /// Select instructions that use an alloca and are subsequently loaded can be
1607  /// rewritten to load both input pointers and then select between the result,
1608  /// allowing the load of the alloca to be promoted.
1609  /// From this:
1610  ///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
1611  ///   %V = load i32* %P2
1612  /// to:
1613  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
1614  ///   %V2 = load i32* %Other
1615  ///   %V = select i1 %cond, i32 %V1, i32 %V2
1616  ///
1617  /// We can do this to a select if its only uses are loads and if the operand
1618  /// to the select can be loaded unconditionally.
1619  bool isSafeSelectToSpeculate(SelectInst &SI,
1620                               SmallVectorImpl<LoadInst *> &Loads) {
1621    Value *TValue = SI.getTrueValue();
1622    Value *FValue = SI.getFalseValue();
1623    bool TDerefable = TValue->isDereferenceablePointer();
1624    bool FDerefable = FValue->isDereferenceablePointer();
1625
1626    for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end();
1627         UI != UE; ++UI) {
1628      LoadInst *LI = dyn_cast<LoadInst>(*UI);
1629      if (LI == 0 || !LI->isSimple()) return false;
1630
1631      // Both operands to the select need to be dereferencable, either
1632      // absolutely (e.g. allocas) or at this point because we can see other
1633      // accesses to it.
1634      if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI,
1635                                                      LI->getAlignment(), &TD))
1636        return false;
1637      if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI,
1638                                                      LI->getAlignment(), &TD))
1639        return false;
1640      Loads.push_back(LI);
1641    }
1642
1643    return true;
1644  }
1645
1646  void visitSelectInst(SelectInst &SI) {
1647    DEBUG(dbgs() << "    original: " << SI << "\n");
1648
1649    // If the select isn't safe to speculate, just use simple logic to emit it.
1650    SmallVector<LoadInst *, 4> Loads;
1651    if (!isSafeSelectToSpeculate(SI, Loads))
1652      return;
1653
1654    IRBuilderTy IRB(&SI);
1655    Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) };
1656    AllocaPartitioning::iterator PIs[2];
1657    PartitionUse PUs[2];
1658    for (unsigned i = 0, e = 2; i != e; ++i) {
1659      PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]);
1660      if (PIs[i] != P.end()) {
1661        // If the pointer is within the partitioning, remove the select from
1662        // its uses. We'll add in the new loads below.
1663        AllocaPartitioning::use_iterator UI
1664          = P.findPartitionUseForPHIOrSelectOperand(Ops[i]);
1665        PUs[i] = *UI;
1666        // Clear out the use here so that the offsets into the use list remain
1667        // stable but this use is ignored when rewriting.
1668        UI->setUse(0);
1669      }
1670    }
1671
1672    Value *TV = SI.getTrueValue();
1673    Value *FV = SI.getFalseValue();
1674    // Replace the loads of the select with a select of two loads.
1675    while (!Loads.empty()) {
1676      LoadInst *LI = Loads.pop_back_val();
1677
1678      IRB.SetInsertPoint(LI);
1679      LoadInst *TL =
1680        IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
1681      LoadInst *FL =
1682        IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
1683      NumLoadsSpeculated += 2;
1684
1685      // Transfer alignment and TBAA info if present.
1686      TL->setAlignment(LI->getAlignment());
1687      FL->setAlignment(LI->getAlignment());
1688      if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
1689        TL->setMetadata(LLVMContext::MD_tbaa, Tag);
1690        FL->setMetadata(LLVMContext::MD_tbaa, Tag);
1691      }
1692
1693      Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1694                                  LI->getName() + ".sroa.speculated");
1695
1696      LoadInst *Loads[2] = { TL, FL };
1697      for (unsigned i = 0, e = 2; i != e; ++i) {
1698        if (PIs[i] != P.end()) {
1699          Use *LoadUse = &Loads[i]->getOperandUse(0);
1700          assert(PUs[i].getUse()->get() == LoadUse->get());
1701          PUs[i].setUse(LoadUse);
1702          P.use_push_back(PIs[i], PUs[i]);
1703        }
1704      }
1705
1706      DEBUG(dbgs() << "          speculated to: " << *V << "\n");
1707      LI->replaceAllUsesWith(V);
1708      Pass.DeadInsts.insert(LI);
1709    }
1710  }
1711};
1712}
1713
1714/// \brief Build a GEP out of a base pointer and indices.
1715///
1716/// This will return the BasePtr if that is valid, or build a new GEP
1717/// instruction using the IRBuilder if GEP-ing is needed.
1718static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
1719                       SmallVectorImpl<Value *> &Indices) {
1720  if (Indices.empty())
1721    return BasePtr;
1722
1723  // A single zero index is a no-op, so check for this and avoid building a GEP
1724  // in that case.
1725  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
1726    return BasePtr;
1727
1728  return IRB.CreateInBoundsGEP(BasePtr, Indices, "idx");
1729}
1730
1731/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
1732/// TargetTy without changing the offset of the pointer.
1733///
1734/// This routine assumes we've already established a properly offset GEP with
1735/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
1736/// zero-indices down through type layers until we find one the same as
1737/// TargetTy. If we can't find one with the same type, we at least try to use
1738/// one with the same size. If none of that works, we just produce the GEP as
1739/// indicated by Indices to have the correct offset.
1740static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD,
1741                                    Value *BasePtr, Type *Ty, Type *TargetTy,
1742                                    SmallVectorImpl<Value *> &Indices) {
1743  if (Ty == TargetTy)
1744    return buildGEP(IRB, BasePtr, Indices);
1745
1746  // See if we can descend into a struct and locate a field with the correct
1747  // type.
1748  unsigned NumLayers = 0;
1749  Type *ElementTy = Ty;
1750  do {
1751    if (ElementTy->isPointerTy())
1752      break;
1753    if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) {
1754      ElementTy = SeqTy->getElementType();
1755      // Note that we use the default address space as this index is over an
1756      // array or a vector, not a pointer.
1757      Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0)));
1758    } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
1759      if (STy->element_begin() == STy->element_end())
1760        break; // Nothing left to descend into.
1761      ElementTy = *STy->element_begin();
1762      Indices.push_back(IRB.getInt32(0));
1763    } else {
1764      break;
1765    }
1766    ++NumLayers;
1767  } while (ElementTy != TargetTy);
1768  if (ElementTy != TargetTy)
1769    Indices.erase(Indices.end() - NumLayers, Indices.end());
1770
1771  return buildGEP(IRB, BasePtr, Indices);
1772}
1773
1774/// \brief Recursively compute indices for a natural GEP.
1775///
1776/// This is the recursive step for getNaturalGEPWithOffset that walks down the
1777/// element types adding appropriate indices for the GEP.
1778static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD,
1779                                       Value *Ptr, Type *Ty, APInt &Offset,
1780                                       Type *TargetTy,
1781                                       SmallVectorImpl<Value *> &Indices) {
1782  if (Offset == 0)
1783    return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices);
1784
1785  // We can't recurse through pointer types.
1786  if (Ty->isPointerTy())
1787    return 0;
1788
1789  // We try to analyze GEPs over vectors here, but note that these GEPs are
1790  // extremely poorly defined currently. The long-term goal is to remove GEPing
1791  // over a vector from the IR completely.
1792  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
1793    unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType());
1794    if (ElementSizeInBits % 8)
1795      return 0; // GEPs over non-multiple of 8 size vector elements are invalid.
1796    APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
1797    APInt NumSkippedElements = Offset.sdiv(ElementSize);
1798    if (NumSkippedElements.ugt(VecTy->getNumElements()))
1799      return 0;
1800    Offset -= NumSkippedElements * ElementSize;
1801    Indices.push_back(IRB.getInt(NumSkippedElements));
1802    return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(),
1803                                    Offset, TargetTy, Indices);
1804  }
1805
1806  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
1807    Type *ElementTy = ArrTy->getElementType();
1808    APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
1809    APInt NumSkippedElements = Offset.sdiv(ElementSize);
1810    if (NumSkippedElements.ugt(ArrTy->getNumElements()))
1811      return 0;
1812
1813    Offset -= NumSkippedElements * ElementSize;
1814    Indices.push_back(IRB.getInt(NumSkippedElements));
1815    return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
1816                                    Indices);
1817  }
1818
1819  StructType *STy = dyn_cast<StructType>(Ty);
1820  if (!STy)
1821    return 0;
1822
1823  const StructLayout *SL = TD.getStructLayout(STy);
1824  uint64_t StructOffset = Offset.getZExtValue();
1825  if (StructOffset >= SL->getSizeInBytes())
1826    return 0;
1827  unsigned Index = SL->getElementContainingOffset(StructOffset);
1828  Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
1829  Type *ElementTy = STy->getElementType(Index);
1830  if (Offset.uge(TD.getTypeAllocSize(ElementTy)))
1831    return 0; // The offset points into alignment padding.
1832
1833  Indices.push_back(IRB.getInt32(Index));
1834  return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
1835                                  Indices);
1836}
1837
1838/// \brief Get a natural GEP from a base pointer to a particular offset and
1839/// resulting in a particular type.
1840///
1841/// The goal is to produce a "natural" looking GEP that works with the existing
1842/// composite types to arrive at the appropriate offset and element type for
1843/// a pointer. TargetTy is the element type the returned GEP should point-to if
1844/// possible. We recurse by decreasing Offset, adding the appropriate index to
1845/// Indices, and setting Ty to the result subtype.
1846///
1847/// If no natural GEP can be constructed, this function returns null.
1848static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD,
1849                                      Value *Ptr, APInt Offset, Type *TargetTy,
1850                                      SmallVectorImpl<Value *> &Indices) {
1851  PointerType *Ty = cast<PointerType>(Ptr->getType());
1852
1853  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
1854  // an i8.
1855  if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8))
1856    return 0;
1857
1858  Type *ElementTy = Ty->getElementType();
1859  if (!ElementTy->isSized())
1860    return 0; // We can't GEP through an unsized element.
1861  APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
1862  if (ElementSize == 0)
1863    return 0; // Zero-length arrays can't help us build a natural GEP.
1864  APInt NumSkippedElements = Offset.sdiv(ElementSize);
1865
1866  Offset -= NumSkippedElements * ElementSize;
1867  Indices.push_back(IRB.getInt(NumSkippedElements));
1868  return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
1869                                  Indices);
1870}
1871
1872/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
1873/// resulting pointer has PointerTy.
1874///
1875/// This tries very hard to compute a "natural" GEP which arrives at the offset
1876/// and produces the pointer type desired. Where it cannot, it will try to use
1877/// the natural GEP to arrive at the offset and bitcast to the type. Where that
1878/// fails, it will try to use an existing i8* and GEP to the byte offset and
1879/// bitcast to the type.
1880///
1881/// The strategy for finding the more natural GEPs is to peel off layers of the
1882/// pointer, walking back through bit casts and GEPs, searching for a base
1883/// pointer from which we can compute a natural GEP with the desired
1884/// properties. The algorithm tries to fold as many constant indices into
1885/// a single GEP as possible, thus making each GEP more independent of the
1886/// surrounding code.
1887static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD,
1888                             Value *Ptr, APInt Offset, Type *PointerTy) {
1889  // Even though we don't look through PHI nodes, we could be called on an
1890  // instruction in an unreachable block, which may be on a cycle.
1891  SmallPtrSet<Value *, 4> Visited;
1892  Visited.insert(Ptr);
1893  SmallVector<Value *, 4> Indices;
1894
1895  // We may end up computing an offset pointer that has the wrong type. If we
1896  // never are able to compute one directly that has the correct type, we'll
1897  // fall back to it, so keep it around here.
1898  Value *OffsetPtr = 0;
1899
1900  // Remember any i8 pointer we come across to re-use if we need to do a raw
1901  // byte offset.
1902  Value *Int8Ptr = 0;
1903  APInt Int8PtrOffset(Offset.getBitWidth(), 0);
1904
1905  Type *TargetTy = PointerTy->getPointerElementType();
1906
1907  do {
1908    // First fold any existing GEPs into the offset.
1909    while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
1910      APInt GEPOffset(Offset.getBitWidth(), 0);
1911      if (!GEP->accumulateConstantOffset(TD, GEPOffset))
1912        break;
1913      Offset += GEPOffset;
1914      Ptr = GEP->getPointerOperand();
1915      if (!Visited.insert(Ptr))
1916        break;
1917    }
1918
1919    // See if we can perform a natural GEP here.
1920    Indices.clear();
1921    if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy,
1922                                           Indices)) {
1923      if (P->getType() == PointerTy) {
1924        // Zap any offset pointer that we ended up computing in previous rounds.
1925        if (OffsetPtr && OffsetPtr->use_empty())
1926          if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
1927            I->eraseFromParent();
1928        return P;
1929      }
1930      if (!OffsetPtr) {
1931        OffsetPtr = P;
1932      }
1933    }
1934
1935    // Stash this pointer if we've found an i8*.
1936    if (Ptr->getType()->isIntegerTy(8)) {
1937      Int8Ptr = Ptr;
1938      Int8PtrOffset = Offset;
1939    }
1940
1941    // Peel off a layer of the pointer and update the offset appropriately.
1942    if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
1943      Ptr = cast<Operator>(Ptr)->getOperand(0);
1944    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
1945      if (GA->mayBeOverridden())
1946        break;
1947      Ptr = GA->getAliasee();
1948    } else {
1949      break;
1950    }
1951    assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
1952  } while (Visited.insert(Ptr));
1953
1954  if (!OffsetPtr) {
1955    if (!Int8Ptr) {
1956      Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
1957                                  "raw_cast");
1958      Int8PtrOffset = Offset;
1959    }
1960
1961    OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
1962      IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
1963                            "raw_idx");
1964  }
1965  Ptr = OffsetPtr;
1966
1967  // On the off chance we were targeting i8*, guard the bitcast here.
1968  if (Ptr->getType() != PointerTy)
1969    Ptr = IRB.CreateBitCast(Ptr, PointerTy, "cast");
1970
1971  return Ptr;
1972}
1973
1974/// \brief Test whether we can convert a value from the old to the new type.
1975///
1976/// This predicate should be used to guard calls to convertValue in order to
1977/// ensure that we only try to convert viable values. The strategy is that we
1978/// will peel off single element struct and array wrappings to get to an
1979/// underlying value, and convert that value.
1980static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
1981  if (OldTy == NewTy)
1982    return true;
1983  if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
1984    if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
1985      if (NewITy->getBitWidth() >= OldITy->getBitWidth())
1986        return true;
1987  if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
1988    return false;
1989  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1990    return false;
1991
1992  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1993    if (NewTy->isPointerTy() && OldTy->isPointerTy())
1994      return true;
1995    if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
1996      return true;
1997    return false;
1998  }
1999
2000  return true;
2001}
2002
2003/// \brief Generic routine to convert an SSA value to a value of a different
2004/// type.
2005///
2006/// This will try various different casting techniques, such as bitcasts,
2007/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
2008/// two types for viability with this routine.
2009static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2010                           Type *Ty) {
2011  assert(canConvertValue(DL, V->getType(), Ty) &&
2012         "Value not convertable to type");
2013  if (V->getType() == Ty)
2014    return V;
2015  if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType()))
2016    if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty))
2017      if (NewITy->getBitWidth() > OldITy->getBitWidth())
2018        return IRB.CreateZExt(V, NewITy);
2019  if (V->getType()->isIntegerTy() && Ty->isPointerTy())
2020    return IRB.CreateIntToPtr(V, Ty);
2021  if (V->getType()->isPointerTy() && Ty->isIntegerTy())
2022    return IRB.CreatePtrToInt(V, Ty);
2023
2024  return IRB.CreateBitCast(V, Ty);
2025}
2026
2027/// \brief Test whether the given alloca partition can be promoted to a vector.
2028///
2029/// This is a quick test to check whether we can rewrite a particular alloca
2030/// partition (and its newly formed alloca) into a vector alloca with only
2031/// whole-vector loads and stores such that it could be promoted to a vector
2032/// SSA value. We only can ensure this for a limited set of operations, and we
2033/// don't want to do the rewrites unless we are confident that the result will
2034/// be promotable, so we have an early test here.
2035static bool isVectorPromotionViable(const DataLayout &TD,
2036                                    Type *AllocaTy,
2037                                    AllocaPartitioning &P,
2038                                    uint64_t PartitionBeginOffset,
2039                                    uint64_t PartitionEndOffset,
2040                                    AllocaPartitioning::const_use_iterator I,
2041                                    AllocaPartitioning::const_use_iterator E) {
2042  VectorType *Ty = dyn_cast<VectorType>(AllocaTy);
2043  if (!Ty)
2044    return false;
2045
2046  uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType());
2047
2048  // While the definition of LLVM vectors is bitpacked, we don't support sizes
2049  // that aren't byte sized.
2050  if (ElementSize % 8)
2051    return false;
2052  assert((TD.getTypeSizeInBits(Ty) % 8) == 0 &&
2053         "vector size not a multiple of element size?");
2054  ElementSize /= 8;
2055
2056  for (; I != E; ++I) {
2057    Use *U = I->getUse();
2058    if (!U)
2059      continue; // Skip dead use.
2060
2061    uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset;
2062    uint64_t BeginIndex = BeginOffset / ElementSize;
2063    if (BeginIndex * ElementSize != BeginOffset ||
2064        BeginIndex >= Ty->getNumElements())
2065      return false;
2066    uint64_t EndOffset = I->EndOffset - PartitionBeginOffset;
2067    uint64_t EndIndex = EndOffset / ElementSize;
2068    if (EndIndex * ElementSize != EndOffset ||
2069        EndIndex > Ty->getNumElements())
2070      return false;
2071
2072    assert(EndIndex > BeginIndex && "Empty vector!");
2073    uint64_t NumElements = EndIndex - BeginIndex;
2074    Type *PartitionTy
2075      = (NumElements == 1) ? Ty->getElementType()
2076                           : VectorType::get(Ty->getElementType(), NumElements);
2077
2078    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2079      if (MI->isVolatile())
2080        return false;
2081      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) {
2082        const AllocaPartitioning::MemTransferOffsets &MTO
2083          = P.getMemTransferOffsets(*MTI);
2084        if (!MTO.IsSplittable)
2085          return false;
2086      }
2087    } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
2088      // Disable vector promotion when there are loads or stores of an FCA.
2089      return false;
2090    } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2091      if (LI->isVolatile())
2092        return false;
2093      if (!canConvertValue(TD, PartitionTy, LI->getType()))
2094        return false;
2095    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2096      if (SI->isVolatile())
2097        return false;
2098      if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy))
2099        return false;
2100    } else {
2101      return false;
2102    }
2103  }
2104  return true;
2105}
2106
2107/// \brief Test whether the given alloca partition's integer operations can be
2108/// widened to promotable ones.
2109///
2110/// This is a quick test to check whether we can rewrite the integer loads and
2111/// stores to a particular alloca into wider loads and stores and be able to
2112/// promote the resulting alloca.
2113static bool isIntegerWideningViable(const DataLayout &TD,
2114                                    Type *AllocaTy,
2115                                    uint64_t AllocBeginOffset,
2116                                    AllocaPartitioning &P,
2117                                    AllocaPartitioning::const_use_iterator I,
2118                                    AllocaPartitioning::const_use_iterator E) {
2119  uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy);
2120  // Don't create integer types larger than the maximum bitwidth.
2121  if (SizeInBits > IntegerType::MAX_INT_BITS)
2122    return false;
2123
2124  // Don't try to handle allocas with bit-padding.
2125  if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy))
2126    return false;
2127
2128  // We need to ensure that an integer type with the appropriate bitwidth can
2129  // be converted to the alloca type, whatever that is. We don't want to force
2130  // the alloca itself to have an integer type if there is a more suitable one.
2131  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2132  if (!canConvertValue(TD, AllocaTy, IntTy) ||
2133      !canConvertValue(TD, IntTy, AllocaTy))
2134    return false;
2135
2136  uint64_t Size = TD.getTypeStoreSize(AllocaTy);
2137
2138  // Check the uses to ensure the uses are (likely) promotable integer uses.
2139  // Also ensure that the alloca has a covering load or store. We don't want
2140  // to widen the integer operations only to fail to promote due to some other
2141  // unsplittable entry (which we may make splittable later).
2142  bool WholeAllocaOp = false;
2143  for (; I != E; ++I) {
2144    Use *U = I->getUse();
2145    if (!U)
2146      continue; // Skip dead use.
2147
2148    uint64_t RelBegin = I->BeginOffset - AllocBeginOffset;
2149    uint64_t RelEnd = I->EndOffset - AllocBeginOffset;
2150
2151    // We can't reasonably handle cases where the load or store extends past
2152    // the end of the aloca's type and into its padding.
2153    if (RelEnd > Size)
2154      return false;
2155
2156    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2157      if (LI->isVolatile())
2158        return false;
2159      if (RelBegin == 0 && RelEnd == Size)
2160        WholeAllocaOp = true;
2161      if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2162        if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
2163          return false;
2164        continue;
2165      }
2166      // Non-integer loads need to be convertible from the alloca type so that
2167      // they are promotable.
2168      if (RelBegin != 0 || RelEnd != Size ||
2169          !canConvertValue(TD, AllocaTy, LI->getType()))
2170        return false;
2171    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2172      Type *ValueTy = SI->getValueOperand()->getType();
2173      if (SI->isVolatile())
2174        return false;
2175      if (RelBegin == 0 && RelEnd == Size)
2176        WholeAllocaOp = true;
2177      if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2178        if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
2179          return false;
2180        continue;
2181      }
2182      // Non-integer stores need to be convertible to the alloca type so that
2183      // they are promotable.
2184      if (RelBegin != 0 || RelEnd != Size ||
2185          !canConvertValue(TD, ValueTy, AllocaTy))
2186        return false;
2187    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2188      if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2189        return false;
2190      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) {
2191        const AllocaPartitioning::MemTransferOffsets &MTO
2192          = P.getMemTransferOffsets(*MTI);
2193        if (!MTO.IsSplittable)
2194          return false;
2195      }
2196    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2197      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
2198          II->getIntrinsicID() != Intrinsic::lifetime_end)
2199        return false;
2200    } else {
2201      return false;
2202    }
2203  }
2204  return WholeAllocaOp;
2205}
2206
2207static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2208                             IntegerType *Ty, uint64_t Offset,
2209                             const Twine &Name) {
2210  DEBUG(dbgs() << "       start: " << *V << "\n");
2211  IntegerType *IntTy = cast<IntegerType>(V->getType());
2212  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
2213         "Element extends past full value");
2214  uint64_t ShAmt = 8*Offset;
2215  if (DL.isBigEndian())
2216    ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
2217  if (ShAmt) {
2218    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2219    DEBUG(dbgs() << "     shifted: " << *V << "\n");
2220  }
2221  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2222         "Cannot extract to a larger integer!");
2223  if (Ty != IntTy) {
2224    V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2225    DEBUG(dbgs() << "     trunced: " << *V << "\n");
2226  }
2227  return V;
2228}
2229
2230static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2231                            Value *V, uint64_t Offset, const Twine &Name) {
2232  IntegerType *IntTy = cast<IntegerType>(Old->getType());
2233  IntegerType *Ty = cast<IntegerType>(V->getType());
2234  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2235         "Cannot insert a larger integer!");
2236  DEBUG(dbgs() << "       start: " << *V << "\n");
2237  if (Ty != IntTy) {
2238    V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2239    DEBUG(dbgs() << "    extended: " << *V << "\n");
2240  }
2241  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
2242         "Element store outside of alloca store");
2243  uint64_t ShAmt = 8*Offset;
2244  if (DL.isBigEndian())
2245    ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
2246  if (ShAmt) {
2247    V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2248    DEBUG(dbgs() << "     shifted: " << *V << "\n");
2249  }
2250
2251  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2252    APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2253    Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2254    DEBUG(dbgs() << "      masked: " << *Old << "\n");
2255    V = IRB.CreateOr(Old, V, Name + ".insert");
2256    DEBUG(dbgs() << "    inserted: " << *V << "\n");
2257  }
2258  return V;
2259}
2260
2261static Value *extractVector(IRBuilderTy &IRB, Value *V,
2262                            unsigned BeginIndex, unsigned EndIndex,
2263                            const Twine &Name) {
2264  VectorType *VecTy = cast<VectorType>(V->getType());
2265  unsigned NumElements = EndIndex - BeginIndex;
2266  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2267
2268  if (NumElements == VecTy->getNumElements())
2269    return V;
2270
2271  if (NumElements == 1) {
2272    V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2273                                 Name + ".extract");
2274    DEBUG(dbgs() << "     extract: " << *V << "\n");
2275    return V;
2276  }
2277
2278  SmallVector<Constant*, 8> Mask;
2279  Mask.reserve(NumElements);
2280  for (unsigned i = BeginIndex; i != EndIndex; ++i)
2281    Mask.push_back(IRB.getInt32(i));
2282  V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
2283                              ConstantVector::get(Mask),
2284                              Name + ".extract");
2285  DEBUG(dbgs() << "     shuffle: " << *V << "\n");
2286  return V;
2287}
2288
2289static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2290                           unsigned BeginIndex, const Twine &Name) {
2291  VectorType *VecTy = cast<VectorType>(Old->getType());
2292  assert(VecTy && "Can only insert a vector into a vector");
2293
2294  VectorType *Ty = dyn_cast<VectorType>(V->getType());
2295  if (!Ty) {
2296    // Single element to insert.
2297    V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2298                                Name + ".insert");
2299    DEBUG(dbgs() <<  "     insert: " << *V << "\n");
2300    return V;
2301  }
2302
2303  assert(Ty->getNumElements() <= VecTy->getNumElements() &&
2304         "Too many elements!");
2305  if (Ty->getNumElements() == VecTy->getNumElements()) {
2306    assert(V->getType() == VecTy && "Vector type mismatch");
2307    return V;
2308  }
2309  unsigned EndIndex = BeginIndex + Ty->getNumElements();
2310
2311  // When inserting a smaller vector into the larger to store, we first
2312  // use a shuffle vector to widen it with undef elements, and then
2313  // a second shuffle vector to select between the loaded vector and the
2314  // incoming vector.
2315  SmallVector<Constant*, 8> Mask;
2316  Mask.reserve(VecTy->getNumElements());
2317  for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
2318    if (i >= BeginIndex && i < EndIndex)
2319      Mask.push_back(IRB.getInt32(i - BeginIndex));
2320    else
2321      Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
2322  V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
2323                              ConstantVector::get(Mask),
2324                              Name + ".expand");
2325  DEBUG(dbgs() << "    shuffle: " << *V << "\n");
2326
2327  Mask.clear();
2328  for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
2329    Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2330
2331  V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
2332
2333  DEBUG(dbgs() << "    blend: " << *V << "\n");
2334  return V;
2335}
2336
2337namespace {
2338/// \brief Visitor to rewrite instructions using a partition of an alloca to
2339/// use a new alloca.
2340///
2341/// Also implements the rewriting to vector-based accesses when the partition
2342/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2343/// lives here.
2344class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
2345                                                   bool> {
2346  // Befriend the base class so it can delegate to private visit methods.
2347  friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>;
2348
2349  const DataLayout &TD;
2350  AllocaPartitioning &P;
2351  SROA &Pass;
2352  AllocaInst &OldAI, &NewAI;
2353  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2354  Type *NewAllocaTy;
2355
2356  // If we are rewriting an alloca partition which can be written as pure
2357  // vector operations, we stash extra information here. When VecTy is
2358  // non-null, we have some strict guarantees about the rewritten alloca:
2359  //   - The new alloca is exactly the size of the vector type here.
2360  //   - The accesses all either map to the entire vector or to a single
2361  //     element.
2362  //   - The set of accessing instructions is only one of those handled above
2363  //     in isVectorPromotionViable. Generally these are the same access kinds
2364  //     which are promotable via mem2reg.
2365  VectorType *VecTy;
2366  Type *ElementTy;
2367  uint64_t ElementSize;
2368
2369  // This is a convenience and flag variable that will be null unless the new
2370  // alloca's integer operations should be widened to this integer type due to
2371  // passing isIntegerWideningViable above. If it is non-null, the desired
2372  // integer type will be stored here for easy access during rewriting.
2373  IntegerType *IntTy;
2374
2375  // The offset of the partition user currently being rewritten.
2376  uint64_t BeginOffset, EndOffset;
2377  bool IsSplit;
2378  Use *OldUse;
2379  Instruction *OldPtr;
2380
2381  // Utility IR builder, whose name prefix is setup for each visited use, and
2382  // the insertion point is set to point to the user.
2383  IRBuilderTy IRB;
2384
2385public:
2386  AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P,
2387                          AllocaPartitioning::iterator PI,
2388                          SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI,
2389                          uint64_t NewBeginOffset, uint64_t NewEndOffset)
2390    : TD(TD), P(P), Pass(Pass),
2391      OldAI(OldAI), NewAI(NewAI),
2392      NewAllocaBeginOffset(NewBeginOffset),
2393      NewAllocaEndOffset(NewEndOffset),
2394      NewAllocaTy(NewAI.getAllocatedType()),
2395      VecTy(), ElementTy(), ElementSize(), IntTy(),
2396      BeginOffset(), EndOffset(), IsSplit(), OldUse(), OldPtr(),
2397      IRB(NewAI.getContext(), ConstantFolder()) {
2398  }
2399
2400  /// \brief Visit the users of the alloca partition and rewrite them.
2401  bool visitUsers(AllocaPartitioning::const_use_iterator I,
2402                  AllocaPartitioning::const_use_iterator E) {
2403    if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P,
2404                                NewAllocaBeginOffset, NewAllocaEndOffset,
2405                                I, E)) {
2406      ++NumVectorized;
2407      VecTy = cast<VectorType>(NewAI.getAllocatedType());
2408      ElementTy = VecTy->getElementType();
2409      assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 &&
2410             "Only multiple-of-8 sized vector elements are viable");
2411      ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8;
2412    } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(),
2413                                       NewAllocaBeginOffset, P, I, E)) {
2414      IntTy = Type::getIntNTy(NewAI.getContext(),
2415                              TD.getTypeSizeInBits(NewAI.getAllocatedType()));
2416    }
2417    bool CanSROA = true;
2418    for (; I != E; ++I) {
2419      if (!I->getUse())
2420        continue; // Skip dead uses.
2421      BeginOffset = I->BeginOffset;
2422      EndOffset = I->EndOffset;
2423      IsSplit = I->isSplit();
2424      OldUse = I->getUse();
2425      OldPtr = cast<Instruction>(OldUse->get());
2426
2427      Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2428      IRB.SetInsertPoint(OldUserI);
2429      IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2430      IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
2431                        ".");
2432
2433      CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2434    }
2435    if (VecTy) {
2436      assert(CanSROA);
2437      VecTy = 0;
2438      ElementTy = 0;
2439      ElementSize = 0;
2440    }
2441    if (IntTy) {
2442      assert(CanSROA);
2443      IntTy = 0;
2444    }
2445    return CanSROA;
2446  }
2447
2448private:
2449  // Every instruction which can end up as a user must have a rewrite rule.
2450  bool visitInstruction(Instruction &I) {
2451    DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
2452    llvm_unreachable("No rewrite rule for this instruction!");
2453  }
2454
2455  Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, Type *PointerTy) {
2456    assert(BeginOffset >= NewAllocaBeginOffset);
2457    APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset);
2458    return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy);
2459  }
2460
2461  /// \brief Compute suitable alignment to access an offset into the new alloca.
2462  unsigned getOffsetAlign(uint64_t Offset) {
2463    unsigned NewAIAlign = NewAI.getAlignment();
2464    if (!NewAIAlign)
2465      NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType());
2466    return MinAlign(NewAIAlign, Offset);
2467  }
2468
2469  /// \brief Compute suitable alignment to access this partition of the new
2470  /// alloca.
2471  unsigned getPartitionAlign() {
2472    return getOffsetAlign(BeginOffset - NewAllocaBeginOffset);
2473  }
2474
2475  /// \brief Compute suitable alignment to access a type at an offset of the
2476  /// new alloca.
2477  ///
2478  /// \returns zero if the type's ABI alignment is a suitable alignment,
2479  /// otherwise returns the maximal suitable alignment.
2480  unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) {
2481    unsigned Align = getOffsetAlign(Offset);
2482    return Align == TD.getABITypeAlignment(Ty) ? 0 : Align;
2483  }
2484
2485  /// \brief Compute suitable alignment to access a type at the beginning of
2486  /// this partition of the new alloca.
2487  ///
2488  /// See \c getOffsetTypeAlign for details; this routine delegates to it.
2489  unsigned getPartitionTypeAlign(Type *Ty) {
2490    return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset);
2491  }
2492
2493  unsigned getIndex(uint64_t Offset) {
2494    assert(VecTy && "Can only call getIndex when rewriting a vector");
2495    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
2496    assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
2497    uint32_t Index = RelOffset / ElementSize;
2498    assert(Index * ElementSize == RelOffset);
2499    return Index;
2500  }
2501
2502  void deleteIfTriviallyDead(Value *V) {
2503    Instruction *I = cast<Instruction>(V);
2504    if (isInstructionTriviallyDead(I))
2505      Pass.DeadInsts.insert(I);
2506  }
2507
2508  Value *rewriteVectorizedLoadInst() {
2509    unsigned BeginIndex = getIndex(BeginOffset);
2510    unsigned EndIndex = getIndex(EndOffset);
2511    assert(EndIndex > BeginIndex && "Empty vector!");
2512
2513    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2514                                     "load");
2515    return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
2516  }
2517
2518  Value *rewriteIntegerLoad(LoadInst &LI) {
2519    assert(IntTy && "We cannot insert an integer to the alloca");
2520    assert(!LI.isVolatile());
2521    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2522                                     "load");
2523    V = convertValue(TD, IRB, V, IntTy);
2524    assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2525    uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
2526    if (Offset > 0 || EndOffset < NewAllocaEndOffset)
2527      V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset,
2528                         "extract");
2529    return V;
2530  }
2531
2532  bool visitLoadInst(LoadInst &LI) {
2533    DEBUG(dbgs() << "    original: " << LI << "\n");
2534    Value *OldOp = LI.getOperand(0);
2535    assert(OldOp == OldPtr);
2536
2537    uint64_t Size = EndOffset - BeginOffset;
2538
2539    Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), Size * 8)
2540                             : LI.getType();
2541    bool IsPtrAdjusted = false;
2542    Value *V;
2543    if (VecTy) {
2544      V = rewriteVectorizedLoadInst();
2545    } else if (IntTy && LI.getType()->isIntegerTy()) {
2546      V = rewriteIntegerLoad(LI);
2547    } else if (BeginOffset == NewAllocaBeginOffset &&
2548               canConvertValue(TD, NewAllocaTy, LI.getType())) {
2549      V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2550                                LI.isVolatile(), "load");
2551    } else {
2552      Type *LTy = TargetTy->getPointerTo();
2553      V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy),
2554                                getPartitionTypeAlign(TargetTy),
2555                                LI.isVolatile(), "load");
2556      IsPtrAdjusted = true;
2557    }
2558    V = convertValue(TD, IRB, V, TargetTy);
2559
2560    if (IsSplit) {
2561      assert(!LI.isVolatile());
2562      assert(LI.getType()->isIntegerTy() &&
2563             "Only integer type loads and stores are split");
2564      assert(Size < TD.getTypeStoreSize(LI.getType()) &&
2565             "Split load isn't smaller than original load");
2566      assert(LI.getType()->getIntegerBitWidth() ==
2567             TD.getTypeStoreSizeInBits(LI.getType()) &&
2568             "Non-byte-multiple bit width");
2569      // Move the insertion point just past the load so that we can refer to it.
2570      IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI)));
2571      // Create a placeholder value with the same type as LI to use as the
2572      // basis for the new value. This allows us to replace the uses of LI with
2573      // the computed value, and then replace the placeholder with LI, leaving
2574      // LI only used for this computation.
2575      Value *Placeholder
2576        = new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
2577      V = insertInteger(TD, IRB, Placeholder, V, BeginOffset,
2578                        "insert");
2579      LI.replaceAllUsesWith(V);
2580      Placeholder->replaceAllUsesWith(&LI);
2581      delete Placeholder;
2582    } else {
2583      LI.replaceAllUsesWith(V);
2584    }
2585
2586    Pass.DeadInsts.insert(&LI);
2587    deleteIfTriviallyDead(OldOp);
2588    DEBUG(dbgs() << "          to: " << *V << "\n");
2589    return !LI.isVolatile() && !IsPtrAdjusted;
2590  }
2591
2592  bool rewriteVectorizedStoreInst(Value *V,
2593                                  StoreInst &SI, Value *OldOp) {
2594    unsigned BeginIndex = getIndex(BeginOffset);
2595    unsigned EndIndex = getIndex(EndOffset);
2596    assert(EndIndex > BeginIndex && "Empty vector!");
2597    unsigned NumElements = EndIndex - BeginIndex;
2598    assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2599    Type *PartitionTy
2600      = (NumElements == 1) ? ElementTy
2601                           : VectorType::get(ElementTy, NumElements);
2602    if (V->getType() != PartitionTy)
2603      V = convertValue(TD, IRB, V, PartitionTy);
2604
2605    // Mix in the existing elements.
2606    Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2607                                       "load");
2608    V = insertVector(IRB, Old, V, BeginIndex, "vec");
2609
2610    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
2611    Pass.DeadInsts.insert(&SI);
2612
2613    (void)Store;
2614    DEBUG(dbgs() << "          to: " << *Store << "\n");
2615    return true;
2616  }
2617
2618  bool rewriteIntegerStore(Value *V, StoreInst &SI) {
2619    assert(IntTy && "We cannot extract an integer from the alloca");
2620    assert(!SI.isVolatile());
2621    if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
2622      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2623                                         "oldload");
2624      Old = convertValue(TD, IRB, Old, IntTy);
2625      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2626      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
2627      V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset,
2628                        "insert");
2629    }
2630    V = convertValue(TD, IRB, V, NewAllocaTy);
2631    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
2632    Pass.DeadInsts.insert(&SI);
2633    (void)Store;
2634    DEBUG(dbgs() << "          to: " << *Store << "\n");
2635    return true;
2636  }
2637
2638  bool visitStoreInst(StoreInst &SI) {
2639    DEBUG(dbgs() << "    original: " << SI << "\n");
2640    Value *OldOp = SI.getOperand(1);
2641    assert(OldOp == OldPtr);
2642
2643    Value *V = SI.getValueOperand();
2644
2645    // Strip all inbounds GEPs and pointer casts to try to dig out any root
2646    // alloca that should be re-examined after promoting this alloca.
2647    if (V->getType()->isPointerTy())
2648      if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
2649        Pass.PostPromotionWorklist.insert(AI);
2650
2651    uint64_t Size = EndOffset - BeginOffset;
2652    if (Size < TD.getTypeStoreSize(V->getType())) {
2653      assert(!SI.isVolatile());
2654      assert(IsSplit && "A seemingly split store isn't splittable");
2655      assert(V->getType()->isIntegerTy() &&
2656             "Only integer type loads and stores are split");
2657      assert(V->getType()->getIntegerBitWidth() ==
2658             TD.getTypeStoreSizeInBits(V->getType()) &&
2659             "Non-byte-multiple bit width");
2660      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8);
2661      V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset,
2662                         "extract");
2663    }
2664
2665    if (VecTy)
2666      return rewriteVectorizedStoreInst(V, SI, OldOp);
2667    if (IntTy && V->getType()->isIntegerTy())
2668      return rewriteIntegerStore(V, SI);
2669
2670    StoreInst *NewSI;
2671    if (BeginOffset == NewAllocaBeginOffset &&
2672        EndOffset == NewAllocaEndOffset &&
2673        canConvertValue(TD, V->getType(), NewAllocaTy)) {
2674      V = convertValue(TD, IRB, V, NewAllocaTy);
2675      NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
2676                                     SI.isVolatile());
2677    } else {
2678      Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo());
2679      NewSI = IRB.CreateAlignedStore(V, NewPtr,
2680                                     getPartitionTypeAlign(V->getType()),
2681                                     SI.isVolatile());
2682    }
2683    (void)NewSI;
2684    Pass.DeadInsts.insert(&SI);
2685    deleteIfTriviallyDead(OldOp);
2686
2687    DEBUG(dbgs() << "          to: " << *NewSI << "\n");
2688    return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
2689  }
2690
2691  /// \brief Compute an integer value from splatting an i8 across the given
2692  /// number of bytes.
2693  ///
2694  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
2695  /// call this routine.
2696  /// FIXME: Heed the advice above.
2697  ///
2698  /// \param V The i8 value to splat.
2699  /// \param Size The number of bytes in the output (assuming i8 is one byte)
2700  Value *getIntegerSplat(Value *V, unsigned Size) {
2701    assert(Size > 0 && "Expected a positive number of bytes.");
2702    IntegerType *VTy = cast<IntegerType>(V->getType());
2703    assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
2704    if (Size == 1)
2705      return V;
2706
2707    Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8);
2708    V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"),
2709                      ConstantExpr::getUDiv(
2710                        Constant::getAllOnesValue(SplatIntTy),
2711                        ConstantExpr::getZExt(
2712                          Constant::getAllOnesValue(V->getType()),
2713                          SplatIntTy)),
2714                      "isplat");
2715    return V;
2716  }
2717
2718  /// \brief Compute a vector splat for a given element value.
2719  Value *getVectorSplat(Value *V, unsigned NumElements) {
2720    V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
2721    DEBUG(dbgs() << "       splat: " << *V << "\n");
2722    return V;
2723  }
2724
2725  bool visitMemSetInst(MemSetInst &II) {
2726    DEBUG(dbgs() << "    original: " << II << "\n");
2727    assert(II.getRawDest() == OldPtr);
2728
2729    // If the memset has a variable size, it cannot be split, just adjust the
2730    // pointer to the new alloca.
2731    if (!isa<Constant>(II.getLength())) {
2732      II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
2733      Type *CstTy = II.getAlignmentCst()->getType();
2734      II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign()));
2735
2736      deleteIfTriviallyDead(OldPtr);
2737      return false;
2738    }
2739
2740    // Record this instruction for deletion.
2741    Pass.DeadInsts.insert(&II);
2742
2743    Type *AllocaTy = NewAI.getAllocatedType();
2744    Type *ScalarTy = AllocaTy->getScalarType();
2745
2746    // If this doesn't map cleanly onto the alloca type, and that type isn't
2747    // a single value type, just emit a memset.
2748    if (!VecTy && !IntTy &&
2749        (BeginOffset != NewAllocaBeginOffset ||
2750         EndOffset != NewAllocaEndOffset ||
2751         !AllocaTy->isSingleValueType() ||
2752         !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) ||
2753         TD.getTypeSizeInBits(ScalarTy)%8 != 0)) {
2754      Type *SizeTy = II.getLength()->getType();
2755      Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
2756      CallInst *New
2757        = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB,
2758                                                II.getRawDest()->getType()),
2759                           II.getValue(), Size, getPartitionAlign(),
2760                           II.isVolatile());
2761      (void)New;
2762      DEBUG(dbgs() << "          to: " << *New << "\n");
2763      return false;
2764    }
2765
2766    // If we can represent this as a simple value, we have to build the actual
2767    // value to store, which requires expanding the byte present in memset to
2768    // a sensible representation for the alloca type. This is essentially
2769    // splatting the byte to a sufficiently wide integer, splatting it across
2770    // any desired vector width, and bitcasting to the final type.
2771    Value *V;
2772
2773    if (VecTy) {
2774      // If this is a memset of a vectorized alloca, insert it.
2775      assert(ElementTy == ScalarTy);
2776
2777      unsigned BeginIndex = getIndex(BeginOffset);
2778      unsigned EndIndex = getIndex(EndOffset);
2779      assert(EndIndex > BeginIndex && "Empty vector!");
2780      unsigned NumElements = EndIndex - BeginIndex;
2781      assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2782
2783      Value *Splat =
2784          getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ElementTy) / 8);
2785      Splat = convertValue(TD, IRB, Splat, ElementTy);
2786      if (NumElements > 1)
2787        Splat = getVectorSplat(Splat, NumElements);
2788
2789      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2790                                         "oldload");
2791      V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
2792    } else if (IntTy) {
2793      // If this is a memset on an alloca where we can widen stores, insert the
2794      // set integer.
2795      assert(!II.isVolatile());
2796
2797      uint64_t Size = EndOffset - BeginOffset;
2798      V = getIntegerSplat(II.getValue(), Size);
2799
2800      if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
2801                    EndOffset != NewAllocaBeginOffset)) {
2802        Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2803                                           "oldload");
2804        Old = convertValue(TD, IRB, Old, IntTy);
2805        assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2806        uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
2807        V = insertInteger(TD, IRB, Old, V, Offset, "insert");
2808      } else {
2809        assert(V->getType() == IntTy &&
2810               "Wrong type for an alloca wide integer!");
2811      }
2812      V = convertValue(TD, IRB, V, AllocaTy);
2813    } else {
2814      // Established these invariants above.
2815      assert(BeginOffset == NewAllocaBeginOffset);
2816      assert(EndOffset == NewAllocaEndOffset);
2817
2818      V = getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ScalarTy) / 8);
2819      if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
2820        V = getVectorSplat(V, AllocaVecTy->getNumElements());
2821
2822      V = convertValue(TD, IRB, V, AllocaTy);
2823    }
2824
2825    Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
2826                                        II.isVolatile());
2827    (void)New;
2828    DEBUG(dbgs() << "          to: " << *New << "\n");
2829    return !II.isVolatile();
2830  }
2831
2832  bool visitMemTransferInst(MemTransferInst &II) {
2833    // Rewriting of memory transfer instructions can be a bit tricky. We break
2834    // them into two categories: split intrinsics and unsplit intrinsics.
2835
2836    DEBUG(dbgs() << "    original: " << II << "\n");
2837
2838    assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
2839    bool IsDest = II.getRawDest() == OldPtr;
2840
2841    const AllocaPartitioning::MemTransferOffsets &MTO
2842      = P.getMemTransferOffsets(II);
2843
2844    // Compute the relative offset within the transfer.
2845    unsigned IntPtrWidth = TD.getPointerSizeInBits();
2846    APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
2847                                                       : MTO.SourceBegin));
2848
2849    unsigned Align = II.getAlignment();
2850    if (Align > 1)
2851      Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(),
2852                       MinAlign(II.getAlignment(), getPartitionAlign()));
2853
2854    // For unsplit intrinsics, we simply modify the source and destination
2855    // pointers in place. This isn't just an optimization, it is a matter of
2856    // correctness. With unsplit intrinsics we may be dealing with transfers
2857    // within a single alloca before SROA ran, or with transfers that have
2858    // a variable length. We may also be dealing with memmove instead of
2859    // memcpy, and so simply updating the pointers is the necessary for us to
2860    // update both source and dest of a single call.
2861    if (!MTO.IsSplittable) {
2862      Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource();
2863      if (IsDest)
2864        II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
2865      else
2866        II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType()));
2867
2868      Type *CstTy = II.getAlignmentCst()->getType();
2869      II.setAlignment(ConstantInt::get(CstTy, Align));
2870
2871      DEBUG(dbgs() << "          to: " << II << "\n");
2872      deleteIfTriviallyDead(OldOp);
2873      return false;
2874    }
2875    // For split transfer intrinsics we have an incredibly useful assurance:
2876    // the source and destination do not reside within the same alloca, and at
2877    // least one of them does not escape. This means that we can replace
2878    // memmove with memcpy, and we don't need to worry about all manner of
2879    // downsides to splitting and transforming the operations.
2880
2881    // If this doesn't map cleanly onto the alloca type, and that type isn't
2882    // a single value type, just emit a memcpy.
2883    bool EmitMemCpy
2884      = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset ||
2885                             EndOffset != NewAllocaEndOffset ||
2886                             !NewAI.getAllocatedType()->isSingleValueType());
2887
2888    // If we're just going to emit a memcpy, the alloca hasn't changed, and the
2889    // size hasn't been shrunk based on analysis of the viable range, this is
2890    // a no-op.
2891    if (EmitMemCpy && &OldAI == &NewAI) {
2892      uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin;
2893      uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd;
2894      // Ensure the start lines up.
2895      assert(BeginOffset == OrigBegin);
2896      (void)OrigBegin;
2897
2898      // Rewrite the size as needed.
2899      if (EndOffset != OrigEnd)
2900        II.setLength(ConstantInt::get(II.getLength()->getType(),
2901                                      EndOffset - BeginOffset));
2902      return false;
2903    }
2904    // Record this instruction for deletion.
2905    Pass.DeadInsts.insert(&II);
2906
2907    // Strip all inbounds GEPs and pointer casts to try to dig out any root
2908    // alloca that should be re-examined after rewriting this instruction.
2909    Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
2910    if (AllocaInst *AI
2911          = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets()))
2912      Pass.Worklist.insert(AI);
2913
2914    if (EmitMemCpy) {
2915      Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
2916                                : II.getRawDest()->getType();
2917
2918      // Compute the other pointer, folding as much as possible to produce
2919      // a single, simple GEP in most cases.
2920      OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy);
2921
2922      Value *OurPtr
2923        = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType()
2924                                           : II.getRawSource()->getType());
2925      Type *SizeTy = II.getLength()->getType();
2926      Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
2927
2928      CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
2929                                       IsDest ? OtherPtr : OurPtr,
2930                                       Size, Align, II.isVolatile());
2931      (void)New;
2932      DEBUG(dbgs() << "          to: " << *New << "\n");
2933      return false;
2934    }
2935
2936    // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy
2937    // is equivalent to 1, but that isn't true if we end up rewriting this as
2938    // a load or store.
2939    if (!Align)
2940      Align = 1;
2941
2942    bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset &&
2943                         EndOffset == NewAllocaEndOffset;
2944    uint64_t Size = EndOffset - BeginOffset;
2945    unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0;
2946    unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0;
2947    unsigned NumElements = EndIndex - BeginIndex;
2948    IntegerType *SubIntTy
2949      = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0;
2950
2951    Type *OtherPtrTy = NewAI.getType();
2952    if (VecTy && !IsWholeAlloca) {
2953      if (NumElements == 1)
2954        OtherPtrTy = VecTy->getElementType();
2955      else
2956        OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements);
2957
2958      OtherPtrTy = OtherPtrTy->getPointerTo();
2959    } else if (IntTy && !IsWholeAlloca) {
2960      OtherPtrTy = SubIntTy->getPointerTo();
2961    }
2962
2963    Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy);
2964    Value *DstPtr = &NewAI;
2965    if (!IsDest)
2966      std::swap(SrcPtr, DstPtr);
2967
2968    Value *Src;
2969    if (VecTy && !IsWholeAlloca && !IsDest) {
2970      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2971                                  "load");
2972      Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
2973    } else if (IntTy && !IsWholeAlloca && !IsDest) {
2974      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2975                                  "load");
2976      Src = convertValue(TD, IRB, Src, IntTy);
2977      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2978      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
2979      Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, "extract");
2980    } else {
2981      Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(),
2982                                  "copyload");
2983    }
2984
2985    if (VecTy && !IsWholeAlloca && IsDest) {
2986      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2987                                         "oldload");
2988      Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
2989    } else if (IntTy && !IsWholeAlloca && IsDest) {
2990      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
2991                                         "oldload");
2992      Old = convertValue(TD, IRB, Old, IntTy);
2993      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2994      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
2995      Src = insertInteger(TD, IRB, Old, Src, Offset, "insert");
2996      Src = convertValue(TD, IRB, Src, NewAllocaTy);
2997    }
2998
2999    StoreInst *Store = cast<StoreInst>(
3000      IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile()));
3001    (void)Store;
3002    DEBUG(dbgs() << "          to: " << *Store << "\n");
3003    return !II.isVolatile();
3004  }
3005
3006  bool visitIntrinsicInst(IntrinsicInst &II) {
3007    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
3008           II.getIntrinsicID() == Intrinsic::lifetime_end);
3009    DEBUG(dbgs() << "    original: " << II << "\n");
3010    assert(II.getArgOperand(1) == OldPtr);
3011
3012    // Record this instruction for deletion.
3013    Pass.DeadInsts.insert(&II);
3014
3015    ConstantInt *Size
3016      = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
3017                         EndOffset - BeginOffset);
3018    Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType());
3019    Value *New;
3020    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3021      New = IRB.CreateLifetimeStart(Ptr, Size);
3022    else
3023      New = IRB.CreateLifetimeEnd(Ptr, Size);
3024
3025    (void)New;
3026    DEBUG(dbgs() << "          to: " << *New << "\n");
3027    return true;
3028  }
3029
3030  bool visitPHINode(PHINode &PN) {
3031    DEBUG(dbgs() << "    original: " << PN << "\n");
3032
3033    // We would like to compute a new pointer in only one place, but have it be
3034    // as local as possible to the PHI. To do that, we re-use the location of
3035    // the old pointer, which necessarily must be in the right position to
3036    // dominate the PHI.
3037    IRBuilderTy PtrBuilder(cast<Instruction>(OldPtr));
3038    PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
3039                             ".");
3040
3041    Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
3042    // Replace the operands which were using the old pointer.
3043    std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3044
3045    DEBUG(dbgs() << "          to: " << PN << "\n");
3046    deleteIfTriviallyDead(OldPtr);
3047    return false;
3048  }
3049
3050  bool visitSelectInst(SelectInst &SI) {
3051    DEBUG(dbgs() << "    original: " << SI << "\n");
3052    assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3053           "Pointer isn't an operand!");
3054
3055    Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType());
3056    // Replace the operands which were using the old pointer.
3057    if (SI.getOperand(1) == OldPtr)
3058      SI.setOperand(1, NewPtr);
3059    if (SI.getOperand(2) == OldPtr)
3060      SI.setOperand(2, NewPtr);
3061
3062    DEBUG(dbgs() << "          to: " << SI << "\n");
3063    deleteIfTriviallyDead(OldPtr);
3064    return false;
3065  }
3066
3067};
3068}
3069
3070namespace {
3071/// \brief Visitor to rewrite aggregate loads and stores as scalar.
3072///
3073/// This pass aggressively rewrites all aggregate loads and stores on
3074/// a particular pointer (or any pointer derived from it which we can identify)
3075/// with scalar loads and stores.
3076class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3077  // Befriend the base class so it can delegate to private visit methods.
3078  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
3079
3080  const DataLayout &TD;
3081
3082  /// Queue of pointer uses to analyze and potentially rewrite.
3083  SmallVector<Use *, 8> Queue;
3084
3085  /// Set to prevent us from cycling with phi nodes and loops.
3086  SmallPtrSet<User *, 8> Visited;
3087
3088  /// The current pointer use being rewritten. This is used to dig up the used
3089  /// value (as opposed to the user).
3090  Use *U;
3091
3092public:
3093  AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {}
3094
3095  /// Rewrite loads and stores through a pointer and all pointers derived from
3096  /// it.
3097  bool rewrite(Instruction &I) {
3098    DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
3099    enqueueUsers(I);
3100    bool Changed = false;
3101    while (!Queue.empty()) {
3102      U = Queue.pop_back_val();
3103      Changed |= visit(cast<Instruction>(U->getUser()));
3104    }
3105    return Changed;
3106  }
3107
3108private:
3109  /// Enqueue all the users of the given instruction for further processing.
3110  /// This uses a set to de-duplicate users.
3111  void enqueueUsers(Instruction &I) {
3112    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
3113         ++UI)
3114      if (Visited.insert(*UI))
3115        Queue.push_back(&UI.getUse());
3116  }
3117
3118  // Conservative default is to not rewrite anything.
3119  bool visitInstruction(Instruction &I) { return false; }
3120
3121  /// \brief Generic recursive split emission class.
3122  template <typename Derived>
3123  class OpSplitter {
3124  protected:
3125    /// The builder used to form new instructions.
3126    IRBuilderTy IRB;
3127    /// The indices which to be used with insert- or extractvalue to select the
3128    /// appropriate value within the aggregate.
3129    SmallVector<unsigned, 4> Indices;
3130    /// The indices to a GEP instruction which will move Ptr to the correct slot
3131    /// within the aggregate.
3132    SmallVector<Value *, 4> GEPIndices;
3133    /// The base pointer of the original op, used as a base for GEPing the
3134    /// split operations.
3135    Value *Ptr;
3136
3137    /// Initialize the splitter with an insertion point, Ptr and start with a
3138    /// single zero GEP index.
3139    OpSplitter(Instruction *InsertionPoint, Value *Ptr)
3140      : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
3141
3142  public:
3143    /// \brief Generic recursive split emission routine.
3144    ///
3145    /// This method recursively splits an aggregate op (load or store) into
3146    /// scalar or vector ops. It splits recursively until it hits a single value
3147    /// and emits that single value operation via the template argument.
3148    ///
3149    /// The logic of this routine relies on GEPs and insertvalue and
3150    /// extractvalue all operating with the same fundamental index list, merely
3151    /// formatted differently (GEPs need actual values).
3152    ///
3153    /// \param Ty  The type being split recursively into smaller ops.
3154    /// \param Agg The aggregate value being built up or stored, depending on
3155    /// whether this is splitting a load or a store respectively.
3156    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
3157      if (Ty->isSingleValueType())
3158        return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
3159
3160      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
3161        unsigned OldSize = Indices.size();
3162        (void)OldSize;
3163        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
3164             ++Idx) {
3165          assert(Indices.size() == OldSize && "Did not return to the old size");
3166          Indices.push_back(Idx);
3167          GEPIndices.push_back(IRB.getInt32(Idx));
3168          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
3169          GEPIndices.pop_back();
3170          Indices.pop_back();
3171        }
3172        return;
3173      }
3174
3175      if (StructType *STy = dyn_cast<StructType>(Ty)) {
3176        unsigned OldSize = Indices.size();
3177        (void)OldSize;
3178        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
3179             ++Idx) {
3180          assert(Indices.size() == OldSize && "Did not return to the old size");
3181          Indices.push_back(Idx);
3182          GEPIndices.push_back(IRB.getInt32(Idx));
3183          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
3184          GEPIndices.pop_back();
3185          Indices.pop_back();
3186        }
3187        return;
3188      }
3189
3190      llvm_unreachable("Only arrays and structs are aggregate loadable types");
3191    }
3192  };
3193
3194  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3195    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
3196      : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
3197
3198    /// Emit a leaf load of a single value. This is called at the leaves of the
3199    /// recursive emission to actually load values.
3200    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
3201      assert(Ty->isSingleValueType());
3202      // Load the single value and insert it using the indices.
3203      Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep");
3204      Value *Load = IRB.CreateLoad(GEP, Name + ".load");
3205      Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
3206      DEBUG(dbgs() << "          to: " << *Load << "\n");
3207    }
3208  };
3209
3210  bool visitLoadInst(LoadInst &LI) {
3211    assert(LI.getPointerOperand() == *U);
3212    if (!LI.isSimple() || LI.getType()->isSingleValueType())
3213      return false;
3214
3215    // We have an aggregate being loaded, split it apart.
3216    DEBUG(dbgs() << "    original: " << LI << "\n");
3217    LoadOpSplitter Splitter(&LI, *U);
3218    Value *V = UndefValue::get(LI.getType());
3219    Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
3220    LI.replaceAllUsesWith(V);
3221    LI.eraseFromParent();
3222    return true;
3223  }
3224
3225  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
3226    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
3227      : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
3228
3229    /// Emit a leaf store of a single value. This is called at the leaves of the
3230    /// recursive emission to actually produce stores.
3231    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
3232      assert(Ty->isSingleValueType());
3233      // Extract the single value and store it using the indices.
3234      Value *Store = IRB.CreateStore(
3235        IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
3236        IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
3237      (void)Store;
3238      DEBUG(dbgs() << "          to: " << *Store << "\n");
3239    }
3240  };
3241
3242  bool visitStoreInst(StoreInst &SI) {
3243    if (!SI.isSimple() || SI.getPointerOperand() != *U)
3244      return false;
3245    Value *V = SI.getValueOperand();
3246    if (V->getType()->isSingleValueType())
3247      return false;
3248
3249    // We have an aggregate being stored, split it apart.
3250    DEBUG(dbgs() << "    original: " << SI << "\n");
3251    StoreOpSplitter Splitter(&SI, *U);
3252    Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
3253    SI.eraseFromParent();
3254    return true;
3255  }
3256
3257  bool visitBitCastInst(BitCastInst &BC) {
3258    enqueueUsers(BC);
3259    return false;
3260  }
3261
3262  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
3263    enqueueUsers(GEPI);
3264    return false;
3265  }
3266
3267  bool visitPHINode(PHINode &PN) {
3268    enqueueUsers(PN);
3269    return false;
3270  }
3271
3272  bool visitSelectInst(SelectInst &SI) {
3273    enqueueUsers(SI);
3274    return false;
3275  }
3276};
3277}
3278
3279/// \brief Strip aggregate type wrapping.
3280///
3281/// This removes no-op aggregate types wrapping an underlying type. It will
3282/// strip as many layers of types as it can without changing either the type
3283/// size or the allocated size.
3284static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
3285  if (Ty->isSingleValueType())
3286    return Ty;
3287
3288  uint64_t AllocSize = DL.getTypeAllocSize(Ty);
3289  uint64_t TypeSize = DL.getTypeSizeInBits(Ty);
3290
3291  Type *InnerTy;
3292  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
3293    InnerTy = ArrTy->getElementType();
3294  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
3295    const StructLayout *SL = DL.getStructLayout(STy);
3296    unsigned Index = SL->getElementContainingOffset(0);
3297    InnerTy = STy->getElementType(Index);
3298  } else {
3299    return Ty;
3300  }
3301
3302  if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
3303      TypeSize > DL.getTypeSizeInBits(InnerTy))
3304    return Ty;
3305
3306  return stripAggregateTypeWrapping(DL, InnerTy);
3307}
3308
3309/// \brief Try to find a partition of the aggregate type passed in for a given
3310/// offset and size.
3311///
3312/// This recurses through the aggregate type and tries to compute a subtype
3313/// based on the offset and size. When the offset and size span a sub-section
3314/// of an array, it will even compute a new array type for that sub-section,
3315/// and the same for structs.
3316///
3317/// Note that this routine is very strict and tries to find a partition of the
3318/// type which produces the *exact* right offset and size. It is not forgiving
3319/// when the size or offset cause either end of type-based partition to be off.
3320/// Also, this is a best-effort routine. It is reasonable to give up and not
3321/// return a type if necessary.
3322static Type *getTypePartition(const DataLayout &TD, Type *Ty,
3323                              uint64_t Offset, uint64_t Size) {
3324  if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
3325    return stripAggregateTypeWrapping(TD, Ty);
3326  if (Offset > TD.getTypeAllocSize(Ty) ||
3327      (TD.getTypeAllocSize(Ty) - Offset) < Size)
3328    return 0;
3329
3330  if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
3331    // We can't partition pointers...
3332    if (SeqTy->isPointerTy())
3333      return 0;
3334
3335    Type *ElementTy = SeqTy->getElementType();
3336    uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
3337    uint64_t NumSkippedElements = Offset / ElementSize;
3338    if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
3339      if (NumSkippedElements >= ArrTy->getNumElements())
3340        return 0;
3341    } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) {
3342      if (NumSkippedElements >= VecTy->getNumElements())
3343        return 0;
3344    }
3345    Offset -= NumSkippedElements * ElementSize;
3346
3347    // First check if we need to recurse.
3348    if (Offset > 0 || Size < ElementSize) {
3349      // Bail if the partition ends in a different array element.
3350      if ((Offset + Size) > ElementSize)
3351        return 0;
3352      // Recurse through the element type trying to peel off offset bytes.
3353      return getTypePartition(TD, ElementTy, Offset, Size);
3354    }
3355    assert(Offset == 0);
3356
3357    if (Size == ElementSize)
3358      return stripAggregateTypeWrapping(TD, ElementTy);
3359    assert(Size > ElementSize);
3360    uint64_t NumElements = Size / ElementSize;
3361    if (NumElements * ElementSize != Size)
3362      return 0;
3363    return ArrayType::get(ElementTy, NumElements);
3364  }
3365
3366  StructType *STy = dyn_cast<StructType>(Ty);
3367  if (!STy)
3368    return 0;
3369
3370  const StructLayout *SL = TD.getStructLayout(STy);
3371  if (Offset >= SL->getSizeInBytes())
3372    return 0;
3373  uint64_t EndOffset = Offset + Size;
3374  if (EndOffset > SL->getSizeInBytes())
3375    return 0;
3376
3377  unsigned Index = SL->getElementContainingOffset(Offset);
3378  Offset -= SL->getElementOffset(Index);
3379
3380  Type *ElementTy = STy->getElementType(Index);
3381  uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
3382  if (Offset >= ElementSize)
3383    return 0; // The offset points into alignment padding.
3384
3385  // See if any partition must be contained by the element.
3386  if (Offset > 0 || Size < ElementSize) {
3387    if ((Offset + Size) > ElementSize)
3388      return 0;
3389    return getTypePartition(TD, ElementTy, Offset, Size);
3390  }
3391  assert(Offset == 0);
3392
3393  if (Size == ElementSize)
3394    return stripAggregateTypeWrapping(TD, ElementTy);
3395
3396  StructType::element_iterator EI = STy->element_begin() + Index,
3397                               EE = STy->element_end();
3398  if (EndOffset < SL->getSizeInBytes()) {
3399    unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
3400    if (Index == EndIndex)
3401      return 0; // Within a single element and its padding.
3402
3403    // Don't try to form "natural" types if the elements don't line up with the
3404    // expected size.
3405    // FIXME: We could potentially recurse down through the last element in the
3406    // sub-struct to find a natural end point.
3407    if (SL->getElementOffset(EndIndex) != EndOffset)
3408      return 0;
3409
3410    assert(Index < EndIndex);
3411    EE = STy->element_begin() + EndIndex;
3412  }
3413
3414  // Try to build up a sub-structure.
3415  StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE),
3416                                      STy->isPacked());
3417  const StructLayout *SubSL = TD.getStructLayout(SubTy);
3418  if (Size != SubSL->getSizeInBytes())
3419    return 0; // The sub-struct doesn't have quite the size needed.
3420
3421  return SubTy;
3422}
3423
3424/// \brief Rewrite an alloca partition's users.
3425///
3426/// This routine drives both of the rewriting goals of the SROA pass. It tries
3427/// to rewrite uses of an alloca partition to be conducive for SSA value
3428/// promotion. If the partition needs a new, more refined alloca, this will
3429/// build that new alloca, preserving as much type information as possible, and
3430/// rewrite the uses of the old alloca to point at the new one and have the
3431/// appropriate new offsets. It also evaluates how successful the rewrite was
3432/// at enabling promotion and if it was successful queues the alloca to be
3433/// promoted.
3434bool SROA::rewriteAllocaPartition(AllocaInst &AI,
3435                                  AllocaPartitioning &P,
3436                                  AllocaPartitioning::iterator PI) {
3437  uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
3438  bool IsLive = false;
3439  for (AllocaPartitioning::use_iterator UI = P.use_begin(PI),
3440                                        UE = P.use_end(PI);
3441       UI != UE && !IsLive; ++UI)
3442    if (UI->getUse())
3443      IsLive = true;
3444  if (!IsLive)
3445    return false; // No live uses left of this partition.
3446
3447  DEBUG(dbgs() << "Speculating PHIs and selects in partition "
3448               << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n");
3449
3450  PHIOrSelectSpeculator Speculator(*TD, P, *this);
3451  DEBUG(dbgs() << "  speculating ");
3452  DEBUG(P.print(dbgs(), PI, ""));
3453  Speculator.visitUsers(PI);
3454
3455  // Try to compute a friendly type for this partition of the alloca. This
3456  // won't always succeed, in which case we fall back to a legal integer type
3457  // or an i8 array of an appropriate size.
3458  Type *AllocaTy = 0;
3459  if (Type *PartitionTy = P.getCommonType(PI))
3460    if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize)
3461      AllocaTy = PartitionTy;
3462  if (!AllocaTy)
3463    if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
3464                                             PI->BeginOffset, AllocaSize))
3465      AllocaTy = PartitionTy;
3466  if ((!AllocaTy ||
3467       (AllocaTy->isArrayTy() &&
3468        AllocaTy->getArrayElementType()->isIntegerTy())) &&
3469      TD->isLegalInteger(AllocaSize * 8))
3470    AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
3471  if (!AllocaTy)
3472    AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
3473  assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize);
3474
3475  // Check for the case where we're going to rewrite to a new alloca of the
3476  // exact same type as the original, and with the same access offsets. In that
3477  // case, re-use the existing alloca, but still run through the rewriter to
3478  // perform phi and select speculation.
3479  AllocaInst *NewAI;
3480  if (AllocaTy == AI.getAllocatedType()) {
3481    assert(PI->BeginOffset == 0 &&
3482           "Non-zero begin offset but same alloca type");
3483    assert(PI == P.begin() && "Begin offset is zero on later partition");
3484    NewAI = &AI;
3485  } else {
3486    unsigned Alignment = AI.getAlignment();
3487    if (!Alignment) {
3488      // The minimum alignment which users can rely on when the explicit
3489      // alignment is omitted or zero is that required by the ABI for this
3490      // type.
3491      Alignment = TD->getABITypeAlignment(AI.getAllocatedType());
3492    }
3493    Alignment = MinAlign(Alignment, PI->BeginOffset);
3494    // If we will get at least this much alignment from the type alone, leave
3495    // the alloca's alignment unconstrained.
3496    if (Alignment <= TD->getABITypeAlignment(AllocaTy))
3497      Alignment = 0;
3498    NewAI = new AllocaInst(AllocaTy, 0, Alignment,
3499                           AI.getName() + ".sroa." + Twine(PI - P.begin()),
3500                           &AI);
3501    ++NumNewAllocas;
3502  }
3503
3504  DEBUG(dbgs() << "Rewriting alloca partition "
3505               << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
3506               << *NewAI << "\n");
3507
3508  // Track the high watermark of the post-promotion worklist. We will reset it
3509  // to this point if the alloca is not in fact scheduled for promotion.
3510  unsigned PPWOldSize = PostPromotionWorklist.size();
3511
3512  AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
3513                                   PI->BeginOffset, PI->EndOffset);
3514  DEBUG(dbgs() << "  rewriting ");
3515  DEBUG(P.print(dbgs(), PI, ""));
3516  bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI));
3517  if (Promotable) {
3518    DEBUG(dbgs() << "  and queuing for promotion\n");
3519    PromotableAllocas.push_back(NewAI);
3520  } else if (NewAI != &AI) {
3521    // If we can't promote the alloca, iterate on it to check for new
3522    // refinements exposed by splitting the current alloca. Don't iterate on an
3523    // alloca which didn't actually change and didn't get promoted.
3524    Worklist.insert(NewAI);
3525  }
3526
3527  // Drop any post-promotion work items if promotion didn't happen.
3528  if (!Promotable)
3529    while (PostPromotionWorklist.size() > PPWOldSize)
3530      PostPromotionWorklist.pop_back();
3531
3532  return true;
3533}
3534
3535/// \brief Walks the partitioning of an alloca rewriting uses of each partition.
3536bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
3537  bool Changed = false;
3538  for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
3539       ++PI)
3540    Changed |= rewriteAllocaPartition(AI, P, PI);
3541
3542  return Changed;
3543}
3544
3545/// \brief Analyze an alloca for SROA.
3546///
3547/// This analyzes the alloca to ensure we can reason about it, builds
3548/// a partitioning of the alloca, and then hands it off to be split and
3549/// rewritten as needed.
3550bool SROA::runOnAlloca(AllocaInst &AI) {
3551  DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
3552  ++NumAllocasAnalyzed;
3553
3554  // Special case dead allocas, as they're trivial.
3555  if (AI.use_empty()) {
3556    AI.eraseFromParent();
3557    return true;
3558  }
3559
3560  // Skip alloca forms that this analysis can't handle.
3561  if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
3562      TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
3563    return false;
3564
3565  bool Changed = false;
3566
3567  // First, split any FCA loads and stores touching this alloca to promote
3568  // better splitting and promotion opportunities.
3569  AggLoadStoreRewriter AggRewriter(*TD);
3570  Changed |= AggRewriter.rewrite(AI);
3571
3572  // Build the partition set using a recursive instruction-visiting builder.
3573  AllocaPartitioning P(*TD, AI);
3574  DEBUG(P.print(dbgs()));
3575  if (P.isEscaped())
3576    return Changed;
3577
3578  // Delete all the dead users of this alloca before splitting and rewriting it.
3579  for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
3580                                              DE = P.dead_user_end();
3581       DI != DE; ++DI) {
3582    Changed = true;
3583    (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
3584    DeadInsts.insert(*DI);
3585  }
3586  for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(),
3587                                            DE = P.dead_op_end();
3588       DO != DE; ++DO) {
3589    Value *OldV = **DO;
3590    // Clobber the use with an undef value.
3591    **DO = UndefValue::get(OldV->getType());
3592    if (Instruction *OldI = dyn_cast<Instruction>(OldV))
3593      if (isInstructionTriviallyDead(OldI)) {
3594        Changed = true;
3595        DeadInsts.insert(OldI);
3596      }
3597  }
3598
3599  // No partitions to split. Leave the dead alloca for a later pass to clean up.
3600  if (P.begin() == P.end())
3601    return Changed;
3602
3603  return splitAlloca(AI, P) || Changed;
3604}
3605
3606/// \brief Delete the dead instructions accumulated in this run.
3607///
3608/// Recursively deletes the dead instructions we've accumulated. This is done
3609/// at the very end to maximize locality of the recursive delete and to
3610/// minimize the problems of invalidated instruction pointers as such pointers
3611/// are used heavily in the intermediate stages of the algorithm.
3612///
3613/// We also record the alloca instructions deleted here so that they aren't
3614/// subsequently handed to mem2reg to promote.
3615void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
3616  while (!DeadInsts.empty()) {
3617    Instruction *I = DeadInsts.pop_back_val();
3618    DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
3619
3620    I->replaceAllUsesWith(UndefValue::get(I->getType()));
3621
3622    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
3623      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
3624        // Zero out the operand and see if it becomes trivially dead.
3625        *OI = 0;
3626        if (isInstructionTriviallyDead(U))
3627          DeadInsts.insert(U);
3628      }
3629
3630    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
3631      DeletedAllocas.insert(AI);
3632
3633    ++NumDeleted;
3634    I->eraseFromParent();
3635  }
3636}
3637
3638/// \brief Promote the allocas, using the best available technique.
3639///
3640/// This attempts to promote whatever allocas have been identified as viable in
3641/// the PromotableAllocas list. If that list is empty, there is nothing to do.
3642/// If there is a domtree available, we attempt to promote using the full power
3643/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
3644/// based on the SSAUpdater utilities. This function returns whether any
3645/// promotion occurred.
3646bool SROA::promoteAllocas(Function &F) {
3647  if (PromotableAllocas.empty())
3648    return false;
3649
3650  NumPromoted += PromotableAllocas.size();
3651
3652  if (DT && !ForceSSAUpdater) {
3653    DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
3654    PromoteMemToReg(PromotableAllocas, *DT);
3655    PromotableAllocas.clear();
3656    return true;
3657  }
3658
3659  DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
3660  SSAUpdater SSA;
3661  DIBuilder DIB(*F.getParent());
3662  SmallVector<Instruction*, 64> Insts;
3663
3664  for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
3665    AllocaInst *AI = PromotableAllocas[Idx];
3666    for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
3667         UI != UE;) {
3668      Instruction *I = cast<Instruction>(*UI++);
3669      // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
3670      // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
3671      // leading to them) here. Eventually it should use them to optimize the
3672      // scalar values produced.
3673      if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
3674        assert(onlyUsedByLifetimeMarkers(I) &&
3675               "Found a bitcast used outside of a lifetime marker.");
3676        while (!I->use_empty())
3677          cast<Instruction>(*I->use_begin())->eraseFromParent();
3678        I->eraseFromParent();
3679        continue;
3680      }
3681      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
3682        assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
3683               II->getIntrinsicID() == Intrinsic::lifetime_end);
3684        II->eraseFromParent();
3685        continue;
3686      }
3687
3688      Insts.push_back(I);
3689    }
3690    AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
3691    Insts.clear();
3692  }
3693
3694  PromotableAllocas.clear();
3695  return true;
3696}
3697
3698namespace {
3699  /// \brief A predicate to test whether an alloca belongs to a set.
3700  class IsAllocaInSet {
3701    typedef SmallPtrSet<AllocaInst *, 4> SetType;
3702    const SetType &Set;
3703
3704  public:
3705    typedef AllocaInst *argument_type;
3706
3707    IsAllocaInSet(const SetType &Set) : Set(Set) {}
3708    bool operator()(AllocaInst *AI) const { return Set.count(AI); }
3709  };
3710}
3711
3712bool SROA::runOnFunction(Function &F) {
3713  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
3714  C = &F.getContext();
3715  TD = getAnalysisIfAvailable<DataLayout>();
3716  if (!TD) {
3717    DEBUG(dbgs() << "  Skipping SROA -- no target data!\n");
3718    return false;
3719  }
3720  DT = getAnalysisIfAvailable<DominatorTree>();
3721
3722  BasicBlock &EntryBB = F.getEntryBlock();
3723  for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
3724       I != E; ++I)
3725    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
3726      Worklist.insert(AI);
3727
3728  bool Changed = false;
3729  // A set of deleted alloca instruction pointers which should be removed from
3730  // the list of promotable allocas.
3731  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
3732
3733  do {
3734    while (!Worklist.empty()) {
3735      Changed |= runOnAlloca(*Worklist.pop_back_val());
3736      deleteDeadInstructions(DeletedAllocas);
3737
3738      // Remove the deleted allocas from various lists so that we don't try to
3739      // continue processing them.
3740      if (!DeletedAllocas.empty()) {
3741        Worklist.remove_if(IsAllocaInSet(DeletedAllocas));
3742        PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas));
3743        PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
3744                                               PromotableAllocas.end(),
3745                                               IsAllocaInSet(DeletedAllocas)),
3746                                PromotableAllocas.end());
3747        DeletedAllocas.clear();
3748      }
3749    }
3750
3751    Changed |= promoteAllocas(F);
3752
3753    Worklist = PostPromotionWorklist;
3754    PostPromotionWorklist.clear();
3755  } while (!Worklist.empty());
3756
3757  return Changed;
3758}
3759
3760void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
3761  if (RequiresDomTree)
3762    AU.addRequired<DominatorTree>();
3763  AU.setPreservesCFG();
3764}
3765