TargetTransformInfo.h revision 360784
//===- TargetTransformInfo.h ------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This pass exposes codegen information to IR-level passes. Every
/// transformation that uses codegen information is broken into three parts:
/// 1. The IR-level analysis pass.
/// 2. The IR-level transformation interface which provides the needed
///    information.
/// 3. Codegen-level implementation which uses target-specific hooks.
///
/// This file defines #2, which is the interface that IR-level transformations
/// use for querying the codegen.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H

#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/DataTypes.h"
#include <functional>

namespace llvm {

namespace Intrinsic {
typedef unsigned ID;
}

class AssumptionCache;
class BlockFrequencyInfo;
class BranchInst;
class Function;
class GlobalValue;
class IntrinsicInst;
class LoadInst;
class LoopAccessInfo;
class Loop;
class ProfileSummaryInfo;
class SCEV;
class ScalarEvolution;
class StoreInst;
class SwitchInst;
class TargetLibraryInfo;
class Type;
class User;
class Value;

/// Information about a load/store intrinsic defined by the target.
struct MemIntrinsicInfo {
  /// This is the pointer that the intrinsic is loading from or storing to.
  /// If this is non-null, then analysis/optimization passes can assume that
  /// this intrinsic is functionally equivalent to a load/store from this
  /// pointer.
  Value *PtrVal = nullptr;

  // Ordering for atomic operations.
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;

  // Same Id is set by the target for corresponding load/store intrinsics.
  unsigned short MatchingId = 0;

  bool ReadMem = false;
  bool WriteMem = false;
  bool IsVolatile = false;

  bool isUnordered() const {
    return (Ordering == AtomicOrdering::NotAtomic ||
            Ordering == AtomicOrdering::Unordered) && !IsVolatile;
  }
};

/// Attributes of a target dependent hardware loop.
struct HardwareLoopInfo {
  HardwareLoopInfo() = delete;
  HardwareLoopInfo(Loop *L) : L(L) {}
  Loop *L = nullptr;
  BasicBlock *ExitBlock = nullptr;
  BranchInst *ExitBranch = nullptr;
  const SCEV *ExitCount = nullptr;
  IntegerType *CountType = nullptr;
  Value *LoopDecrement = nullptr; // Decrement the loop counter by this
                                  // value in every iteration.
  bool IsNestingLegal = false;    // Can a hardware loop be a parent to
                                  // another hardware loop?
  bool CounterInReg = false;      // Should loop counter be updated in
                                  // the loop via a phi?
  bool PerformEntryTest = false;  // Generate the intrinsic which also performs
                                  // icmp ne zero on the loop counter value and
                                  // produces an i1 to guard the loop entry.
  bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI,
                               DominatorTree &DT, bool ForceNestedLoop = false,
                               bool ForceHardwareLoopPHI = false);
  bool canAnalyze(LoopInfo &LI);
};
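
// Example (an illustrative sketch, not part of this header's API): how a
// transform pass might use HardwareLoopInfo together with the analyses it
// already has. The wrapper function `mayUseHardwareLoop` is hypothetical.
//
// \code
//   bool mayUseHardwareLoop(Loop *L, ScalarEvolution &SE, LoopInfo &LI,
//                           DominatorTree &DT) {
//     HardwareLoopInfo HWLoopInfo(L);
//     if (!HWLoopInfo.canAnalyze(LI))
//       return false;
//     return HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT);
//   }
// \endcode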

/// This pass provides access to the codegen interfaces that are needed
/// for IR-level transformations.
class TargetTransformInfo {
public:
  /// Construct a TTI object using a type implementing the \c Concept
  /// API below.
  ///
  /// This is used by targets to construct a TTI wrapping their target-specific
  /// implementation that encodes appropriate costs for their target.
  template <typename T> TargetTransformInfo(T Impl);

  /// Construct a baseline TTI object using a minimal implementation of
  /// the \c Concept API below.
  ///
  /// The TTI implementation will reflect the information in the DataLayout
  /// provided if non-null.
  explicit TargetTransformInfo(const DataLayout &DL);

  // Provide move semantics.
  TargetTransformInfo(TargetTransformInfo &&Arg);
  TargetTransformInfo &operator=(TargetTransformInfo &&RHS);

  // We need to define the destructor out-of-line to define our sub-classes
  // out-of-line.
  ~TargetTransformInfo();

  /// Handle the invalidation of this information.
  ///
  /// When used as a result of \c TargetIRAnalysis this method will be called
  /// when the function this was computed for changes. When it returns false,
  /// the information is preserved across those changes.
  bool invalidate(Function &, const PreservedAnalyses &,
                  FunctionAnalysisManager::Invalidator &) {
    // FIXME: We should probably in some way ensure that the subtarget
    // information for a function hasn't changed.
    return false;
  }

  /// \name Generic Target Information
  /// @{

  /// The kind of cost model.
  ///
  /// There are several different cost models that can be customized by the
  /// target. The normalization of each cost model may be target specific.
  enum TargetCostKind {
    TCK_RecipThroughput, ///< Reciprocal throughput.
    TCK_Latency,         ///< The latency of the instruction.
    TCK_CodeSize         ///< Instruction code size.
  };

  /// Query the cost of a specified instruction.
  ///
  /// Clients should use this interface to query the cost of an existing
  /// instruction. The instruction must have a valid parent (basic block).
  ///
  /// Note, this method does not cache the cost calculation and it
  /// can be expensive in some cases.
  int getInstructionCost(const Instruction *I, enum TargetCostKind kind) const {
    switch (kind) {
    case TCK_RecipThroughput:
      return getInstructionThroughput(I);

    case TCK_Latency:
      return getInstructionLatency(I);

    case TCK_CodeSize:
      return getUserCost(I);
    }
    llvm_unreachable("Unknown instruction cost kind");
  }
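
  // Example (illustrative sketch only): summing the reciprocal-throughput
  // cost of every instruction in a function. The helper name `getBodyCost`
  // is hypothetical and not part of this interface.
  //
  // \code
  //   int getBodyCost(const Function &F, const TargetTransformInfo &TTI) {
  //     int Cost = 0;
  //     for (const BasicBlock &BB : F)
  //       for (const Instruction &I : BB)
  //         Cost += TTI.getInstructionCost(
  //             &I, TargetTransformInfo::TCK_RecipThroughput);
  //     return Cost;
  //   }
  // \endcode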

  /// Underlying constants for 'cost' values in this interface.
  ///
  /// Many APIs in this interface return a cost. This enum defines the
  /// fundamental values that should be used to interpret (and produce) those
  /// costs. The costs are returned as an int rather than a member of this
  /// enumeration because it is expected that the cost of one IR instruction
  /// may have a multiplicative factor to it or otherwise won't fit directly
  /// into the enum. Moreover, it is common to sum or average costs which works
  /// better as simple integral values. Thus this enum only provides constants.
  /// Also note that the returned costs are signed integers to make it natural
  /// to add, subtract, and test with zero (a common boundary condition). It is
  /// not expected that 2^32 is a realistic cost to be modeling at any point.
  ///
  /// Note that these costs should usually reflect the intersection of code-size
  /// cost and execution cost. A free instruction is typically one that folds
  /// into another instruction. For example, reg-to-reg moves can often be
  /// skipped by renaming the registers in the CPU, but they still are encoded
  /// and thus wouldn't be considered 'free' here.
  enum TargetCostConstants {
    TCC_Free = 0,     ///< Expected to fold away in lowering.
    TCC_Basic = 1,    ///< The cost of a typical 'add' instruction.
    TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86.
  };
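
  // Example (illustrative sketch): these constants are meant to be compared
  // against and combined with other costs. A typical "is this user cheap?"
  // style check (the helper name is hypothetical):
  //
  // \code
  //   bool isCheapUser(const User *U, const TargetTransformInfo &TTI) {
  //     return TTI.getUserCost(U) <= TargetTransformInfo::TCC_Basic;
  //   }
  // \endcode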

  /// Estimate the cost of a specific operation when lowered.
  ///
  /// Note that this is designed to work on an arbitrary synthetic opcode, and
  /// thus work for hypothetical queries before an instruction has even been
  /// formed. However, this does *not* work for GEPs, and must not be called
  /// for a GEP instruction. Instead, use the dedicated getGEPCost interface as
  /// analyzing a GEP's cost requires more information.
  ///
  /// Typically only the result type is required, and the operand type can be
  /// omitted. However, if the opcode is one of the cast instructions, the
  /// operand type is required.
  ///
  /// The returned cost is defined in terms of \c TargetCostConstants, see its
  /// comments for a detailed explanation of the cost values.
  int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy = nullptr) const;

  /// Estimate the cost of a GEP operation when lowered.
  ///
  /// The contract for this function is the same as \c getOperationCost except
  /// that it supports an interface that provides extra information specific to
  /// the GEP operation.
  int getGEPCost(Type *PointeeType, const Value *Ptr,
                 ArrayRef<const Value *> Operands) const;

  /// Estimate the cost of an EXT operation when lowered.
  ///
  /// The contract for this function is the same as \c getOperationCost except
  /// that it supports an interface that provides extra information specific to
  /// the EXT operation.
  int getExtCost(const Instruction *I, const Value *Src) const;

  /// Estimate the cost of a function call when lowered.
  ///
  /// The contract for this is the same as \c getOperationCost except that it
  /// supports an interface that provides extra information specific to call
  /// instructions.
  ///
  /// This is the most basic query for estimating call cost: it only knows the
  /// function type and (potentially) the number of arguments at the call site.
  /// The latter is only interesting for varargs function types.
  int getCallCost(FunctionType *FTy, int NumArgs = -1,
                  const User *U = nullptr) const;

  /// Estimate the cost of calling a specific function when lowered.
  ///
  /// This overload adds the ability to reason about the particular function
  /// being called in the event it is a library call with special lowering.
  int getCallCost(const Function *F, int NumArgs = -1,
                  const User *U = nullptr) const;

  /// Estimate the cost of calling a specific function when lowered.
  ///
  /// This overload allows specifying a set of candidate argument values.
  int getCallCost(const Function *F, ArrayRef<const Value *> Arguments,
                  const User *U = nullptr) const;

  /// \returns A value by which our inlining threshold should be multiplied.
  /// This is primarily used to bump up the inlining threshold wholesale on
  /// targets where calls are unusually expensive.
  ///
  /// TODO: This is a rather blunt instrument.  Perhaps altering the costs of
  /// individual classes of instructions would be better.
  unsigned getInliningThresholdMultiplier() const;

  /// \returns Vector bonus in percent.
  ///
  /// Vector bonuses: We want to more aggressively inline vector-dense kernels
  /// and apply this bonus based on the percentage of vector instructions. A
  /// bonus is applied if the vector instructions exceed 50% and half that amount
  /// is applied if it exceeds 10%. Note that these bonuses are somewhat
  /// arbitrary and evolved over time by accident as much as because they are
  /// principled bonuses.
  /// FIXME: It would be nice to base the bonus values on something more
  /// scientific. A target may have no bonus on vector instructions.
  int getInlinerVectorBonusPercent() const;

  /// Estimate the cost of an intrinsic when lowered.
  ///
  /// Mirrors the \c getCallCost method but uses an intrinsic identifier.
  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                       ArrayRef<Type *> ParamTys,
                       const User *U = nullptr) const;

  /// Estimate the cost of an intrinsic when lowered.
  ///
  /// Mirrors the \c getCallCost method but uses an intrinsic identifier.
  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                       ArrayRef<const Value *> Arguments,
                       const User *U = nullptr) const;

  /// \return the expected cost of a memcpy, which could e.g. depend on the
  /// source/destination type and alignment and the number of bytes copied.
  int getMemcpyCost(const Instruction *I) const;

  /// \return The estimated number of case clusters when lowering \p 'SI'.
  /// \p JTSize is set to the jump table size only when \p SI is suitable for a
  /// jump table.
  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JTSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) const;

  /// Estimate the cost of a given IR user when lowered.
  ///
  /// This can estimate the cost of either a ConstantExpr or Instruction when
  /// lowered. It has two primary advantages over the \c getOperationCost and
  /// \c getGEPCost above, and one significant disadvantage: it can only be
  /// used when the IR construct has already been formed.
  ///
  /// The advantages are that it can inspect the SSA use graph to reason more
  /// accurately about the cost. For example, all-constant-GEPs can often be
  /// folded into a load or other instruction, but if they are used in some
  /// other context they may not be folded. This routine can distinguish such
  /// cases.
  ///
  /// \p Operands is a list of operands which can be a result of transformations
  /// of the current operands. The number of the operands on the list must equal
  /// the number of the current operands the IR user has. Their order on the
  /// list must be the same as the order of the current operands the IR user
  /// has.
  ///
  /// The returned cost is defined in terms of \c TargetCostConstants, see its
  /// comments for a detailed explanation of the cost values.
  int getUserCost(const User *U, ArrayRef<const Value *> Operands) const;

  /// This is a helper function which calls the two-argument getUserCost
  /// with \p Operands which are the current operands U has.
  int getUserCost(const User *U) const {
    SmallVector<const Value *, 4> Operands(U->value_op_begin(),
                                           U->value_op_end());
    return getUserCost(U, Operands);
  }
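
  // Example (illustrative sketch): estimating what a GEP would cost if its
  // operands were replaced, without actually rewriting the IR. `NewOperands`
  // must contain one value per current operand, in operand order.
  //
  // \code
  //   int costAfterReplacement(const GEPOperator *GEP,
  //                            ArrayRef<const Value *> NewOperands,
  //                            const TargetTransformInfo &TTI) {
  //     return TTI.getUserCost(GEP, NewOperands);
  //   }
  // \endcode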

  /// Return true if branch divergence exists.
  ///
  /// Branch divergence has a significantly negative impact on GPU performance
  /// when threads in the same wavefront take different paths due to conditional
  /// branches.
  bool hasBranchDivergence() const;

  /// Returns whether V is a source of divergence.
  ///
  /// This function provides the target-dependent information for
  /// the target-independent LegacyDivergenceAnalysis. LegacyDivergenceAnalysis
  /// first builds the dependency graph, and then runs the reachability
  /// algorithm starting with the sources of divergence.
  bool isSourceOfDivergence(const Value *V) const;

  /// Returns true if \p V belongs to the target-specific set of operations
  /// that produce a uniform result even when given non-uniform arguments.
  bool isAlwaysUniform(const Value *V) const;

  /// Returns the address space ID for a target's 'flat' address space. Note
  /// this is not necessarily the same as addrspace(0), which LLVM sometimes
  /// refers to as the generic address space. The flat address space is a
  /// generic address space that can be used to access multiple segments of
  /// memory with different address spaces. Access of a memory location through
  /// a pointer with this address space is expected to be legal but slower
  /// compared to the same memory location accessed through a pointer with a
  /// different address space.
  ///
  /// This is for targets with different pointer representations which can
  /// be converted with the addrspacecast instruction. If a pointer is converted
  /// to this address space, optimizations should attempt to replace the access
  /// with the source address space.
  ///
  /// \returns ~0u if the target does not have such a flat address space to
  /// optimize away.
  unsigned getFlatAddressSpace() const;

  /// Return any intrinsic address operand indexes which may be rewritten if
  /// they use a flat address space pointer.
  ///
  /// \returns true if the intrinsic was handled.
  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;

  /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
  /// NewV, which has a different address space. This should happen for every
  /// operand index that collectFlatAddressOperands returned for the intrinsic.
  /// \returns true if the intrinsic was handled.
  bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                        Value *OldV, Value *NewV) const;
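
  // Example (illustrative sketch of the intended calling convention between
  // these two hooks): an address-space inference pass first asks which
  // operands of the intrinsic may be rewritten, and later asks the target to
  // perform the rewrite once a non-flat pointer `NewV` for `OldV` is known.
  //
  // \code
  //   SmallVector<int, 4> OpIndexes;
  //   if (TTI.collectFlatAddressOperands(OpIndexes, II->getIntrinsicID())) {
  //     // ... infer a better address space and build NewV for OldV ...
  //     TTI.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
  //   }
  // \endcode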

  /// Test whether calls to a function lower to actual program function
  /// calls.
  ///
  /// The idea is to test whether the program is likely to require a 'call'
  /// instruction or equivalent in order to call the given function.
  ///
  /// FIXME: It's not clear that this is a good or useful query API. Clients
  /// should probably move to simpler cost metrics using the above.
  /// Alternatively, we could split the cost interface into distinct code-size
  /// and execution-speed costs. This would allow modelling the core of this
  /// query more accurately as a call is a single small instruction, but
  /// incurs significant execution cost.
  bool isLoweredToCall(const Function *F) const;

  struct LSRCost {
    /// TODO: Some of these could be merged. Also, a lexical ordering
    /// isn't always optimal.
    unsigned Insns;
    unsigned NumRegs;
    unsigned AddRecCost;
    unsigned NumIVMuls;
    unsigned NumBaseAdds;
    unsigned ImmCost;
    unsigned SetupCost;
    unsigned ScaleCost;
  };

  /// Parameters that control the generic loop unrolling transformation.
  struct UnrollingPreferences {
    /// The cost threshold for the unrolled loop. Should be relative to the
    /// getUserCost values returned by this API, and the expectation is that
    /// the unrolled loop's instructions when run through that interface should
    /// not exceed this cost. However, this is only an estimate. Also, specific
    /// loops may be unrolled even with a cost above this threshold if deemed
    /// profitable. Set this to UINT_MAX to disable the loop body cost
    /// restriction.
    unsigned Threshold;
    /// If complete unrolling will reduce the cost of the loop, we will boost
    /// the Threshold by a certain percent to allow more aggressive complete
    /// unrolling. This value provides the maximum boost percentage that we
    /// can apply to Threshold (the value should be no less than 100).
    /// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost,
    ///                                    MaxPercentThresholdBoost / 100)
    /// E.g. if complete unrolling reduces the loop execution time by 50%
    /// then we boost the threshold by the factor of 2x. If unrolling is not
    /// expected to reduce the running time, then we do not increase the
    /// threshold. (A worked example follows this struct.)
    unsigned MaxPercentThresholdBoost;
    /// The cost threshold for the unrolled loop when optimizing for size (set
    /// to UINT_MAX to disable).
    unsigned OptSizeThreshold;
    /// The cost threshold for the unrolled loop, like Threshold, but used
    /// for partial/runtime unrolling (set to UINT_MAX to disable).
    unsigned PartialThreshold;
    /// The cost threshold for the unrolled loop when optimizing for size, like
    /// OptSizeThreshold, but used for partial/runtime unrolling (set to
    /// UINT_MAX to disable).
    unsigned PartialOptSizeThreshold;
    /// A forced unrolling factor (the number of concatenated bodies of the
    /// original loop in the unrolled loop body). When set to 0, the unrolling
    /// transformation will select an unrolling factor based on the current cost
    /// threshold and other factors.
    unsigned Count;
    /// A forced peeling factor (the number of bodies of the original loop
    /// that should be peeled off before the loop body). When set to 0, the
    /// unrolling transformation will select a peeling factor based on profile
    /// information and other factors.
    unsigned PeelCount;
    /// Default unroll count for loops with run-time trip count.
    unsigned DefaultUnrollRuntimeCount;
    /// Set the maximum unrolling factor. The unrolling factor may be selected
    /// using the appropriate cost threshold, but may not exceed this number
    /// (set to UINT_MAX to disable). This does not apply in cases where the
    /// loop is being fully unrolled.
    unsigned MaxCount;
    /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
    /// applies even if full unrolling is selected. This allows a target to fall
    /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount.
    unsigned FullUnrollMaxCount;
    /// Represents the number of instructions optimized when a "back edge"
    /// becomes a "fall through" in the unrolled loop.
    /// For now we count a conditional branch on a backedge and a comparison
    /// feeding it.
    unsigned BEInsns;
    /// Allow partial unrolling (unrolling of loops to expand the size of the
    /// loop body, not only to eliminate small constant-trip-count loops).
    bool Partial;
    /// Allow runtime unrolling (unrolling of loops to expand the size of the
    /// loop body even when the number of loop iterations is not known at
    /// compile time).
    bool Runtime;
    /// Allow generation of a loop remainder (extra iterations after unroll).
    bool AllowRemainder;
    /// Allow emitting expensive instructions (such as divisions) when computing
    /// the trip count of a loop for runtime unrolling.
    bool AllowExpensiveTripCount;
    /// Apply loop unroll on any kind of loop
    /// (mainly to loops that fail runtime unrolling).
    bool Force;
    /// Allow using the trip count upper bound to unroll loops.
    bool UpperBound;
    /// Allow peeling off loop iterations.
    bool AllowPeeling;
    /// Allow unrolling of all the iterations of the runtime loop remainder.
    bool UnrollRemainder;
    /// Allow unroll and jam. Used to enable unroll and jam for the target.
    bool UnrollAndJam;
    /// Allow peeling based on profile. Used to enable peeling off all
    /// iterations based on the provided profile.
    /// If the value is true the peeling cost model can decide to peel only
    /// some iterations and in this case it will set this to false.
    bool PeelProfiledIterations;
    /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
    /// value above is used during unroll and jam for the outer loop size.
    /// This value is used in the same manner to limit the size of the inner
    /// loop.
    unsigned UnrollAndJamInnerLoopThreshold;
  };
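
  // Worked example for MaxPercentThresholdBoost (illustrative numbers, not
  // target defaults): with Threshold = 150 and MaxPercentThresholdBoost = 400,
  // a loop whose complete unrolling is expected to halve its cost
  // (RolledCost / UnrolledCost = 2) gets
  //   BoostedThreshold = 150 * min(2, 400 / 100) = 150 * 2 = 300,
  // while a loop with no expected saving (ratio 1) keeps Threshold = 150.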

  /// Get target-customized preferences for the generic loop unrolling
  /// transformation. The caller will initialize UP with the current
  /// target-independent defaults.
  void getUnrollingPreferences(Loop *L, ScalarEvolution &,
                               UnrollingPreferences &UP) const;

  /// Query the target whether it would be profitable to convert the given loop
  /// into a hardware loop.
  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) const;

  /// Query the target whether it would be preferred to create a predicated
  /// vector loop, which can avoid the need to emit a scalar epilogue loop.
  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) const;

  /// @}

  /// \name Scalar Target Information
  /// @{

  /// Flags indicating the kind of support for population count.
  ///
  /// Compared to the SW implementation, HW support is supposed to
  /// significantly boost the performance when the population is dense, and it
  /// may or may not degrade performance if the population is sparse. A HW
  /// support is considered as "Fast" if it can outperform, or is on a par
  /// with, SW implementation when the population is sparse; otherwise, it is
  /// considered as "Slow".
  enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };

  /// Return true if the specified immediate is a legal add immediate, that
  /// is the target has add instructions which can add a register with the
  /// immediate without having to materialize the immediate into a register.
  bool isLegalAddImmediate(int64_t Imm) const;

  /// Return true if the specified immediate is a legal icmp immediate,
  /// that is the target has icmp instructions which can compare a register
  /// against the immediate without having to materialize the immediate into a
  /// register.
  bool isLegalICmpImmediate(int64_t Imm) const;

  /// Return true if the addressing mode represented by AM is legal for
  /// this target, for a load/store of the specified type.
  /// The type may be VoidTy, in which case only return true if the addressing
  /// mode is legal for a load/store of any legal type.
  /// If target returns true in LSRWithInstrQueries(), I may be valid.
  /// TODO: Handle pre/postinc as well.
  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace = 0,
                             Instruction *I = nullptr) const;
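
  // Example (illustrative sketch): asking whether a simple reg+offset form,
  // e.g. "load i32, [BaseReg + 16]", is legal in address space 0. `Int32Ty`
  // is assumed to be an i32 Type obtained from the LLVMContext.
  //
  // \code
  //   bool OK = TTI.isLegalAddressingMode(Int32Ty, /*BaseGV=*/nullptr,
  //                                       /*BaseOffset=*/16, /*HasBaseReg=*/true,
  //                                       /*Scale=*/0);
  // \endcode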

  /// Return true if the LSR cost of C1 is lower than the LSR cost of C2.
  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                     TargetTransformInfo::LSRCost &C2) const;

  /// Return true if the target can fuse a compare and branch.
  /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
  /// calculation for the instructions in a loop.
  bool canMacroFuseCmp() const;

  /// Return true if the target can save a compare for loop count, for example
  /// hardware loop saves a compare.
  bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
                  DominatorTree *DT, AssumptionCache *AC,
                  TargetLibraryInfo *LibInfo) const;

  /// \return True if LSR should make efforts to create/preserve post-inc
  /// addressing mode expressions.
  bool shouldFavorPostInc() const;

  /// Return true if LSR should make efforts to generate indexed addressing
  /// modes that operate across loop iterations.
  bool shouldFavorBackedgeIndex(const Loop *L) const;

  /// Return true if the target supports masked store.
  bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) const;
  /// Return true if the target supports masked load.
  bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) const;

  /// Return true if the target supports nontemporal store.
  bool isLegalNTStore(Type *DataType, Align Alignment) const;
  /// Return true if the target supports nontemporal load.
  bool isLegalNTLoad(Type *DataType, Align Alignment) const;

  /// Return true if the target supports masked scatter.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) const;
  /// Return true if the target supports masked gather.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) const;

  /// Return true if the target supports masked compress store.
  bool isLegalMaskedCompressStore(Type *DataType) const;
  /// Return true if the target supports masked expand load.
  bool isLegalMaskedExpandLoad(Type *DataType) const;

  /// Return true if the target has a unified operation to calculate division
  /// and remainder. If so, the additional implicit multiplication and
  /// subtraction required to calculate a remainder from division are free. This
  /// can enable more aggressive transformations for division and remainder than
  /// would typically be allowed using throughput or size cost models.
  bool hasDivRemOp(Type *DataType, bool IsSigned) const;

  /// Return true if the given instruction (assumed to be a memory access
  /// instruction) has a volatile variant. If that's the case then we can avoid
  /// addrspacecast to generic AS for volatile loads/stores. Default
  /// implementation returns false, which prevents address space inference for
  /// volatile loads/stores.
  bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const;

  /// Return true if target doesn't mind addresses in vectors.
  bool prefersVectorizedAddressing() const;

  /// Return the cost of the scaling factor used in the addressing
  /// mode represented by AM for this target, for a load/store
  /// of the specified type.
  /// If the AM is supported, the return value must be >= 0.
  /// If the AM is not supported, it returns a negative value.
  /// TODO: Handle pre/postinc as well.
  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale,
                           unsigned AddrSpace = 0) const;

  /// Return true if the loop strength reduce pass should make
  /// Instruction* based TTI queries to isLegalAddressingMode(). This is
  /// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned
  /// immediate offset and no index register.
  bool LSRWithInstrQueries() const;

  /// Return true if it's free to truncate a value of type Ty1 to type
  /// Ty2. e.g. On x86 it's free to truncate an i32 value in register EAX to i16
  /// by referencing its sub-register AX.
  bool isTruncateFree(Type *Ty1, Type *Ty2) const;

  /// Return true if it is profitable to hoist instruction in the
  /// then/else to before if.
  bool isProfitableToHoist(Instruction *I) const;

  bool useAA() const;

  /// Return true if this type is legal.
  bool isTypeLegal(Type *Ty) const;

  /// Return true if switches should be turned into lookup tables for the
  /// target.
  bool shouldBuildLookupTables() const;

  /// Return true if switches should be turned into lookup tables
  /// containing this constant value for the target.
  bool shouldBuildLookupTablesForConstant(Constant *C) const;

  /// Return true if the input function, which is cold at all call sites,
  /// should use the coldcc calling convention.
  bool useColdCCForColdCall(Function &F) const;

  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) const;

  /// If target has efficient vector element load/store instructions, it can
  /// return true here so that insertion/extraction costs are not added to
  /// the scalarization cost of a load/store.
  bool supportsEfficientVectorElementLoadStore() const;

  /// Don't restrict interleaved unrolling to small loops.
  bool enableAggressiveInterleaving(bool LoopHasReductions) const;

  /// Returns options for expansion of memcmp. IsZeroCmp is
  /// true if this is the expansion of memcmp(p1, p2, s) == 0.
  struct MemCmpExpansionOptions {
    // Return true if memcmp expansion is enabled.
    operator bool() const { return MaxNumLoads > 0; }

    // Maximum number of load operations.
    unsigned MaxNumLoads = 0;

    // The list of available load sizes (in bytes), sorted in decreasing order.
    SmallVector<unsigned, 8> LoadSizes;

    // For memcmp expansion when the memcmp result is only compared equal or
    // not-equal to 0, allow up to this number of load pairs per block. As an
    // example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
    //   a0 = load2bytes &a[0]
    //   b0 = load2bytes &b[0]
    //   a2 = load1byte  &a[2]
    //   b2 = load1byte  &b[2]
    //   r  = cmp eq (a0 ^ b0 | a2 ^ b2), 0
    unsigned NumLoadsPerBlock = 1;

    // Set to true to allow overlapping loads. For example, 7-byte compares can
    // be done with two 4-byte compares instead of 4+2+1-byte compares. This
    // requires all loads in LoadSizes to be doable in an unaligned way.
    bool AllowOverlappingLoads = false;
  };
  MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                               bool IsZeroCmp) const;
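
  // Example (illustrative sketch): how a target implementation might populate
  // these options. The numbers are made up and are not any target's defaults.
  //
  // \code
  //   TargetTransformInfo::MemCmpExpansionOptions Options;
  //   Options.MaxNumLoads = OptSize ? 4 : 8;
  //   Options.LoadSizes.append({8, 4, 2, 1}); // decreasing order
  //   Options.NumLoadsPerBlock = IsZeroCmp ? 2 : 1;
  //   Options.AllowOverlappingLoads = true;
  //   return Options;
  // \endcode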

  /// Enable matching of interleaved access groups.
  bool enableInterleavedAccessVectorization() const;

  /// Enable matching of interleaved access groups that contain predicated
  /// accesses or gaps and therefore vectorized using masked
  /// vector loads/stores.
  bool enableMaskedInterleavedAccessVectorization() const;

  /// Indicate that it is potentially unsafe to automatically vectorize
  /// floating-point operations because the semantics of vector and scalar
  /// floating-point math may differ. For example, ARM NEON v7 SIMD math
  /// does not support IEEE-754 denormal numbers, while depending on the
  /// platform, scalar floating-point math does.
  /// This applies to floating-point math operations and calls, not memory
  /// operations, shuffles, or casts.
  bool isFPVectorizationPotentiallyUnsafe() const;

  /// Determine if the target supports unaligned memory accesses.
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                      unsigned BitWidth, unsigned AddressSpace = 0,
                                      unsigned Alignment = 1,
                                      bool *Fast = nullptr) const;

  /// Return hardware support for population count.
  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;

  /// Return true if the hardware has a fast square-root instruction.
  bool haveFastSqrt(Type *Ty) const;

  /// Return true if it is faster to check if a floating-point value is NaN
  /// (or not-NaN) versus a comparison against a constant FP zero value.
  /// Targets should override this if materializing a 0.0 for comparison is
  /// generally as cheap as checking for ordered/unordered.
  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;

  /// Return the expected cost of supporting the floating point operation
  /// of the specified type.
  int getFPOpCost(Type *Ty) const;

  /// Return the expected cost of materializing the given integer
  /// immediate of the specified type.
  int getIntImmCost(const APInt &Imm, Type *Ty) const;

  /// Return the expected cost of materialization for the given integer
  /// immediate of the specified type for a given instruction. The cost can be
  /// zero if the immediate can be folded into the specified instruction.
  int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm,
                        Type *Ty) const;
  int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                          Type *Ty) const;

  /// Return the expected cost for the given integer when optimising
  /// for size. This is different from the other integer immediate cost
  /// functions in that it is subtarget agnostic. This is useful when you e.g.
  /// target one ISA such as AArch32 but smaller encodings could be possible
  /// with another such as Thumb. This return value is used as a penalty when
  /// the total cost for a constant is calculated (the bigger the cost, the
  /// more beneficial constant hoisting is).
  int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
                            Type *Ty) const;
  /// @}

  /// \name Vector Target Information
  /// @{

  /// The various kinds of shuffle patterns for vector queries.
  enum ShuffleKind {
    SK_Broadcast,       ///< Broadcast element 0 to all other elements.
    SK_Reverse,         ///< Reverse the order of the vector.
    SK_Select,          ///< Selects elements from the corresponding lane of
                        ///< either source operand. This is equivalent to a
                        ///< vector select with a constant condition operand.
    SK_Transpose,       ///< Transpose two vectors.
    SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
    SK_ExtractSubvector,///< ExtractSubvector. Index indicates start offset.
    SK_PermuteTwoSrc,   ///< Merge elements from two source vectors into one
                        ///< with any shuffle mask.
    SK_PermuteSingleSrc ///< Shuffle elements of single source vector with any
                        ///< shuffle mask.
  };

  /// Additional information about an operand's possible values.
  enum OperandValueKind {
    OK_AnyValue,               // Operand can have any value.
    OK_UniformValue,           // Operand is uniform (splat of a value).
    OK_UniformConstantValue,   // Operand is uniform constant.
    OK_NonUniformConstantValue // Operand is a non uniform constant value.
  };

  /// Additional properties of an operand's values.
  enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };

  /// \return the number of registers in the target-provided register class.
  unsigned getNumberOfRegisters(unsigned ClassID) const;

  /// \return the target-provided register class ID for the provided type,
  /// accounting for type promotion and other type-legalization techniques that
  /// the target might apply. However, it specifically does not account for the
  /// scalarization or splitting of vector types. Should a vector type require
  /// scalarization or splitting into multiple underlying vector registers, that
  /// type should be mapped to a register class containing no registers.
  /// Specifically, this is designed to provide a simple, high-level view of the
  /// register allocation later performed by the backend. These register classes
  /// don't necessarily map onto the register classes used by the backend.
  /// FIXME: It's not currently possible to determine how many registers
  /// are used by the provided type.
  unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const;

  /// \return the target-provided register class name.
  const char* getRegisterClassName(unsigned ClassID) const;

  /// \return The width of the largest scalar or vector register type.
  unsigned getRegisterBitWidth(bool Vector) const;

  /// \return The width of the smallest vector register type.
  unsigned getMinVectorRegisterBitWidth() const;

  /// \return True if the vectorization factor should be chosen to
  /// make the vector of the smallest element type match the size of a
  /// vector register. For wider element types, this could result in
  /// creating vectors that span multiple vector registers.
  /// If false, the vectorization factor will be chosen based on the
  /// size of the widest element type.
  bool shouldMaximizeVectorBandwidth(bool OptSize) const;

  /// \return The minimum vectorization factor for types of given element
  /// bit width, or 0 if there is no minimum VF. The returned value only
  /// applies when shouldMaximizeVectorBandwidth returns true.
  unsigned getMinimumVF(unsigned ElemWidth) const;

  /// \return True if it should be considered for address type promotion.
  /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
  /// profitable without finding other extensions fed by the same input.
  bool shouldConsiderAddressTypePromotion(
      const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;

  /// \return The size of a cache line in bytes.
  unsigned getCacheLineSize() const;

  /// The possible cache levels
  enum class CacheLevel {
    L1D,   // The L1 data cache
    L2D,   // The L2 data cache

    // We currently do not model L3 caches, as their sizes differ widely between
    // microarchitectures. Also, we currently do not have a use for L3 cache
    // size modeling yet.
  };

  /// \return The size of the cache level in bytes, if available.
  llvm::Optional<unsigned> getCacheSize(CacheLevel Level) const;

  /// \return The associativity of the cache level, if available.
  llvm::Optional<unsigned> getCacheAssociativity(CacheLevel Level) const;

  /// \return How much before a load we should place the prefetch
  /// instruction.  This is currently measured in number of
  /// instructions.
  unsigned getPrefetchDistance() const;

  /// \return Some HW prefetchers can handle accesses up to a certain
  /// constant stride.  This is the minimum stride in bytes where it
  /// makes sense to start adding SW prefetches.  The default is 1,
  /// i.e. prefetch with any stride.
  unsigned getMinPrefetchStride() const;

  /// \return The maximum number of iterations to prefetch ahead.  If
  /// the required number of iterations is more than this number, no
  /// prefetching is performed.
  unsigned getMaxPrefetchIterationsAhead() const;

  /// \return The maximum interleave factor that any transform should try to
  /// perform for this target. This number depends on the level of parallelism
  /// and the number of execution units in the CPU.
  unsigned getMaxInterleaveFactor(unsigned VF) const;

  /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
  static OperandValueKind getOperandInfo(Value *V,
                                         OperandValueProperties &OpProps);

  /// This is an approximation of reciprocal throughput of a math/logic op.
  /// A higher cost indicates less expected throughput.
  /// From Agner Fog's guides, reciprocal throughput is "the average number of
  /// clock cycles per instruction when the instructions are not part of a
  /// limiting dependency chain."
  /// Therefore, costs should be scaled to account for multiple execution units
  /// on the target that can process this type of instruction. For example, if
  /// there are 5 scalar integer units and 2 vector integer units that can
  /// calculate an 'add' in a single cycle, this model should indicate that the
  /// cost of the vector add instruction is 2.5 times the cost of the scalar
  /// add instruction.
  /// \p Args is an optional argument which holds the instruction operands
  /// values so the TTI can analyze those values searching for special
  /// cases or optimizations based on those values.
  /// \p CxtI is the optional original context instruction, if one exists, to
  /// provide even more information.
  int getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
      OperandValueKind Opd2Info = OK_AnyValue,
      OperandValueProperties Opd1PropInfo = OP_None,
      OperandValueProperties Opd2PropInfo = OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr) const;
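
  // Example (illustrative sketch of the reciprocal-throughput scaling
  // described above): with 5 scalar integer units and 2 vector integer units,
  // a target would be expected to report something like a cost of 1 for the
  // scalar add and roughly 2.5x that for the vector add. `Int32Ty` and
  // `Vec4Int32Ty` are assumed to be types built from the LLVMContext.
  //
  // \code
  //   int ScalarAdd = TTI.getArithmeticInstrCost(Instruction::Add, Int32Ty);
  //   int VectorAdd = TTI.getArithmeticInstrCost(Instruction::Add, Vec4Int32Ty);
  // \endcode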

  /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
  /// The index and subtype parameters are used by the subvector insertion and
  /// extraction shuffle kinds to show the insert/extract point and the type of
  /// the subvector being inserted/extracted.
  /// NOTE: For subvector extractions Tp represents the source type.
  int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index = 0,
                     Type *SubTp = nullptr) const;
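
  // Example (illustrative sketch): querying the cost of extracting a
  // <4 x i32> subvector starting at element 4 of a <8 x i32> source.
  // `Vec8Int32Ty` and `Vec4Int32Ty` are assumed to be pre-built vector types.
  //
  // \code
  //   int Cost = TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
  //                                 Vec8Int32Ty, /*Index=*/4, Vec4Int32Ty);
  // \endcode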

  /// \return The expected cost of cast instructions, such as bitcast, trunc,
  /// zext, etc. If there is an existing instruction that holds Opcode, it
  /// may be passed in the 'I' parameter.
  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                       const Instruction *I = nullptr) const;

  /// \return The expected cost of a sign- or zero-extended vector extract. Use
  /// -1 to indicate that there is no information about the index value.
  int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                               unsigned Index = -1) const;

  /// \return The expected cost of control-flow related instructions such as
  /// Phi, Ret, Br.
  int getCFInstrCost(unsigned Opcode) const;

  /// \returns The expected cost of compare and select instructions. If there
  /// is an existing instruction that holds Opcode, it may be passed in the
  /// 'I' parameter.
  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy = nullptr,
                         const Instruction *I = nullptr) const;

  /// \return The expected cost of vector Insert and Extract.
  /// Use -1 to indicate that there is no information on the index value.
  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index = -1) const;

  /// \return The cost of Load and Store instructions.
  int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                      unsigned AddressSpace,
                      const Instruction *I = nullptr) const;

  /// \return The cost of masked Load and Store instructions.
  int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                            unsigned AddressSpace) const;

  /// \return The cost of Gather or Scatter operation
  /// \p Opcode - is a type of memory access Load or Store
  /// \p DataTy - a vector type of the data to be loaded or stored
  /// \p Ptr - pointer [or vector of pointers] - address[es] in memory
  /// \p VariableMask - true when the memory access is predicated with a mask
  ///                   that is not a compile-time constant
  /// \p Alignment - alignment of single element
  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
                             bool VariableMask, unsigned Alignment) const;

  /// \return The cost of the interleaved memory operation.
  /// \p Opcode is the memory operation code
  /// \p VecTy is the vector type of the interleaved access.
  /// \p Factor is the interleave factor
  /// \p Indices is the indices for interleaved load members (as interleaved
  ///    load allows gaps)
  /// \p Alignment is the alignment of the memory operation
  /// \p AddressSpace is address space of the pointer.
  /// \p UseMaskForCond indicates if the memory access is predicated.
  /// \p UseMaskForGaps indicates if gaps should be masked.
  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                 ArrayRef<unsigned> Indices, unsigned Alignment,
                                 unsigned AddressSpace,
                                 bool UseMaskForCond = false,
                                 bool UseMaskForGaps = false) const;

  /// Calculate the cost of performing a vector reduction.
  ///
  /// This is the cost of reducing the vector value of type \p Ty to a scalar
  /// value using the operation denoted by \p Opcode. The form of the reduction
  /// can either be a pairwise reduction or a reduction that splits the vector
  /// at every reduction level.
  ///
  /// Pairwise:
  ///  (v0, v1, v2, v3)
  ///  ((v0+v1), (v2+v3), undef, undef)
  /// Split:
  ///  (v0, v1, v2, v3)
  ///  ((v0+v2), (v1+v3), undef, undef)
  int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                 bool IsPairwiseForm) const;
  int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm,
                             bool IsUnsigned) const;

  /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
  /// Three cases are handled: 1. scalar instruction 2. vector instruction
  /// 3. scalar instruction which is to be vectorized with VF.
  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                            ArrayRef<Value *> Args, FastMathFlags FMF,
                            unsigned VF = 1) const;

  /// \returns The cost of Intrinsic instructions. Types analysis only.
  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
  /// arguments and the return value will be computed based on types.
  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                            ArrayRef<Type *> Tys, FastMathFlags FMF,
                            unsigned ScalarizationCostPassed = UINT_MAX) const;

  /// \returns The cost of Call instructions.
  int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;

  /// \returns The number of pieces into which the provided type must be
  /// split during legalization. Zero is returned when the answer is unknown.
  unsigned getNumberOfParts(Type *Tp) const;

  /// \returns The cost of the address computation. For most targets this can be
  /// merged into the instruction indexing mode. Some targets might want to
  /// distinguish between address computation for memory operations on vector
  /// types and scalar types. Such targets should override this function.
  /// The 'SE' parameter holds a pointer to the scalar evolution object which
  /// is used in order to get the Ptr step value in case of constant stride.
  /// The 'Ptr' parameter holds the SCEV of the access pointer.
  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE = nullptr,
                                const SCEV *Ptr = nullptr) const;

  /// \returns The cost, if any, of keeping values of the given types alive
  /// over a callsite.
  ///
  /// Some types may require the use of register classes that do not have
  /// any callee-saved registers, so would require a spill and fill.
  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const;

  /// \returns True if the intrinsic is a supported memory intrinsic.  Info
  /// will contain additional information - whether the intrinsic may write
  /// or read to memory, volatility and the pointer.  Info is undefined
  /// if false is returned.
  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
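
  // Example (illustrative sketch): treating a target load-like intrinsic as an
  // ordinary load, in the spirit of passes such as EarlyCSE.
  //
  // \code
  //   MemIntrinsicInfo Info;
  //   if (TTI.getTgtMemIntrinsic(II, Info) && Info.ReadMem && !Info.WriteMem &&
  //       Info.PtrVal && Info.isUnordered()) {
  //     // II behaves like a load from Info.PtrVal.
  //   }
  // \endcode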

  /// \returns The maximum element size, in bytes, for an element
  /// unordered-atomic memory intrinsic.
  unsigned getAtomicMemIntrinsicMaxElementSize() const;

  /// \returns A value which is the result of the given memory intrinsic.  New
  /// instructions may be created to extract the result from the given intrinsic
  /// memory operation.  Returns nullptr if the target cannot create a result
  /// from the given intrinsic.
  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                           Type *ExpectedType) const;

  /// \returns The type to use in a loop expansion of a memcpy call.
  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                  unsigned SrcAlign, unsigned DestAlign) const;

  /// \param[out] OpsOut The operand types to copy RemainingBytes of memory.
  /// \param RemainingBytes The number of bytes to copy.
  ///
  /// Calculates the operand types to use when copying \p RemainingBytes of
  /// memory, where source and destination alignments are \p SrcAlign and
  /// \p DestAlign respectively.
  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
                                         LLVMContext &Context,
                                         unsigned RemainingBytes,
                                         unsigned SrcAlign,
                                         unsigned DestAlign) const;

  /// \returns True if the two functions have compatible attributes for inlining
  /// purposes.
  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

  /// \returns True if the caller and callee agree on how \p Args will be passed
  /// to the callee.
  /// \param[out] Args The list of compatible arguments.  The implementation may
  /// filter out any incompatible args from this list.
  bool areFunctionArgsABICompatible(const Function *Caller,
                                    const Function *Callee,
                                    SmallPtrSetImpl<Argument *> &Args) const;
1083
1084  /// The type of load/store indexing.
1085  enum MemIndexedMode {
1086    MIM_Unindexed,  ///< No indexing.
1087    MIM_PreInc,     ///< Pre-incrementing.
1088    MIM_PreDec,     ///< Pre-decrementing.
1089    MIM_PostInc,    ///< Post-incrementing.
1090    MIM_PostDec     ///< Post-decrementing.
1091  };
1092
1093  /// \returns True if the specified indexed load for the given type is legal.
1094  bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const;
1095
1096  /// \returns True if the specified indexed store for the given type is legal.
1097  bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const;
1098
1099  /// \returns The bitwidth of the largest vector type that should be used to
1100  /// load/store in the given address space.
1101  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
1102
1103  /// \returns True if the load instruction is legal to vectorize.
1104  bool isLegalToVectorizeLoad(LoadInst *LI) const;
1105
1106  /// \returns True if the store instruction is legal to vectorize.
1107  bool isLegalToVectorizeStore(StoreInst *SI) const;
1108
1109  /// \returns True if it is legal to vectorize the given load chain.
1110  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1111                                   unsigned Alignment,
1112                                   unsigned AddrSpace) const;
1113
1114  /// \returns True if it is legal to vectorize the given store chain.
1115  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1116                                    unsigned Alignment,
1117                                    unsigned AddrSpace) const;
1118
1119  /// \returns The new vector factor value if the target doesn't support \p
1120  /// SizeInBytes loads or has a better vector factor.
1121  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1122                               unsigned ChainSizeInBytes,
1123                               VectorType *VecTy) const;
1124
1125  /// \returns The new vector factor value if the target doesn't support \p
1126  /// ChainSizeInBytes stores or has a better vector factor.
1127  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1128                                unsigned ChainSizeInBytes,
1129                                VectorType *VecTy) const;
1130
1131  /// Flags describing the kind of vector reduction.
1132  struct ReductionFlags {
1133    ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {}
1134    bool IsMaxOp;  ///< If the op is a min/max kind, true if it's a max operation.
1135    bool IsSigned; ///< Whether the operation is a signed int reduction.
1136    bool NoNaN;    ///< If op is an fp min/max, whether NaNs may be present.
1137  };
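  // Illustrative sketch: a vectorizer describing a signed integer
  // max-reduction to the target before choosing between the intrinsic and
  // shuffle lowerings.  Opcode and VecTy stand for the caller's reduction
  // opcode and vector type.
  //
  //   TargetTransformInfo::ReductionFlags Flags;
  //   Flags.IsMaxOp = true;   // the min/max reduction computes a maximum
  //   Flags.IsSigned = true;  // ...over signed integers
  //   if (TTI.useReductionIntrinsic(Opcode, VecTy, Flags))
  //     ReduceUsingIntrinsic();  // hypothetical helper; otherwise use shuffles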
1138
1139  /// \returns True if the target wants to handle the given reduction idiom in
1140  /// the intrinsics form instead of the shuffle form.
1141  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
1142                             ReductionFlags Flags) const;
1143
1144  /// \returns True if the target wants to expand the given reduction intrinsic
1145  /// into a shuffle sequence.
1146  bool shouldExpandReduction(const IntrinsicInst *II) const;
1147
1148  /// \returns The size cost of rematerializing a GlobalValue address relative
1149  /// to a stack reload.
1150  unsigned getGISelRematGlobalCost() const;
1151
1152  /// @}
1153
1154private:
1155  /// Estimate the latency of the specified instruction.
1156  /// Returns 1 as the default value.
1157  int getInstructionLatency(const Instruction *I) const;
1158
1159  /// Returns the expected throughput cost of the instruction.
1160  /// Returns -1 if the cost is unknown.
1161  int getInstructionThroughput(const Instruction *I) const;
1162
1163  /// The abstract base class used to type erase specific TTI
1164  /// implementations.
1165  class Concept;
1166
1167  /// The template model for the base class which wraps a concrete
1168  /// implementation in a type-erased interface.
1169  template <typename T> class Model;
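  // A minimal sketch of the same idiom (names hypothetical): Concept is an
  // abstract interface, Model<T> forwards each virtual call to a concrete
  // implementation T, and the enclosing class stores only a Concept pointer.
  //
  //   struct AnyCost {
  //     struct Concept { virtual ~Concept() = default; virtual int cost() = 0; };
  //     template <typename T> struct Model final : Concept {
  //       T Impl;
  //       Model(T I) : Impl(std::move(I)) {}
  //       int cost() override { return Impl.cost(); }
  //     };
  //     std::unique_ptr<Concept> Ptr;
  //   };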
1170
1171  std::unique_ptr<Concept> TTIImpl;
1172};
1173
1174class TargetTransformInfo::Concept {
1175public:
1176  virtual ~Concept() = 0;
1177  virtual const DataLayout &getDataLayout() const = 0;
1178  virtual int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) = 0;
1179  virtual int getGEPCost(Type *PointeeType, const Value *Ptr,
1180                         ArrayRef<const Value *> Operands) = 0;
1181  virtual int getExtCost(const Instruction *I, const Value *Src) = 0;
1182  virtual int getCallCost(FunctionType *FTy, int NumArgs, const User *U) = 0;
1183  virtual int getCallCost(const Function *F, int NumArgs, const User *U) = 0;
1184  virtual int getCallCost(const Function *F,
1185                          ArrayRef<const Value *> Arguments, const User *U) = 0;
1186  virtual unsigned getInliningThresholdMultiplier() = 0;
1187  virtual int getInlinerVectorBonusPercent() = 0;
1188  virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1189                               ArrayRef<Type *> ParamTys, const User *U) = 0;
1190  virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1191                               ArrayRef<const Value *> Arguments,
1192                               const User *U) = 0;
1193  virtual int getMemcpyCost(const Instruction *I) = 0;
1194  virtual unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
1195                                                    unsigned &JTSize,
1196                                                    ProfileSummaryInfo *PSI,
1197                                                    BlockFrequencyInfo *BFI) = 0;
1198  virtual int
1199  getUserCost(const User *U, ArrayRef<const Value *> Operands) = 0;
1200  virtual bool hasBranchDivergence() = 0;
1201  virtual bool isSourceOfDivergence(const Value *V) = 0;
1202  virtual bool isAlwaysUniform(const Value *V) = 0;
1203  virtual unsigned getFlatAddressSpace() = 0;
1204  virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1205                                          Intrinsic::ID IID) const = 0;
1206  virtual bool rewriteIntrinsicWithAddressSpace(
1207    IntrinsicInst *II, Value *OldV, Value *NewV) const = 0;
1208  virtual bool isLoweredToCall(const Function *F) = 0;
1209  virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
1210                                       UnrollingPreferences &UP) = 0;
1211  virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1212                                        AssumptionCache &AC,
1213                                        TargetLibraryInfo *LibInfo,
1214                                        HardwareLoopInfo &HWLoopInfo) = 0;
1215  virtual bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
1216                                           ScalarEvolution &SE,
1217                                           AssumptionCache &AC,
1218                                           TargetLibraryInfo *TLI,
1219                                           DominatorTree *DT,
1220                                           const LoopAccessInfo *LAI) = 0;
1221  virtual bool isLegalAddImmediate(int64_t Imm) = 0;
1222  virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
1223  virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
1224                                     int64_t BaseOffset, bool HasBaseReg,
1225                                     int64_t Scale,
1226                                     unsigned AddrSpace,
1227                                     Instruction *I) = 0;
1228  virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
1229                             TargetTransformInfo::LSRCost &C2) = 0;
1230  virtual bool canMacroFuseCmp() = 0;
1231  virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
1232                          LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
1233                          TargetLibraryInfo *LibInfo) = 0;
1234  virtual bool shouldFavorPostInc() const = 0;
1235  virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0;
1236  virtual bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) = 0;
1237  virtual bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) = 0;
1238  virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0;
1239  virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
1240  virtual bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) = 0;
1241  virtual bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) = 0;
1242  virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
1243  virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
1244  virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
1245  virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
1246  virtual bool prefersVectorizedAddressing() = 0;
1247  virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
1248                                   int64_t BaseOffset, bool HasBaseReg,
1249                                   int64_t Scale, unsigned AddrSpace) = 0;
1250  virtual bool LSRWithInstrQueries() = 0;
1251  virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
1252  virtual bool isProfitableToHoist(Instruction *I) = 0;
1253  virtual bool useAA() = 0;
1254  virtual bool isTypeLegal(Type *Ty) = 0;
1255  virtual bool shouldBuildLookupTables() = 0;
1256  virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
1257  virtual bool useColdCCForColdCall(Function &F) = 0;
1258  virtual unsigned
1259  getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
1260  virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
1261                                                    unsigned VF) = 0;
1262  virtual bool supportsEfficientVectorElementLoadStore() = 0;
1263  virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
1264  virtual MemCmpExpansionOptions
1265  enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0;
1266  virtual bool enableInterleavedAccessVectorization() = 0;
1267  virtual bool enableMaskedInterleavedAccessVectorization() = 0;
1268  virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
1269  virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
1270                                              unsigned BitWidth,
1271                                              unsigned AddressSpace,
1272                                              unsigned Alignment,
1273                                              bool *Fast) = 0;
1274  virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
1275  virtual bool haveFastSqrt(Type *Ty) = 0;
1276  virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
1277  virtual int getFPOpCost(Type *Ty) = 0;
1278  virtual int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
1279                                    Type *Ty) = 0;
1280  virtual int getIntImmCost(const APInt &Imm, Type *Ty) = 0;
1281  virtual int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm,
1282                                Type *Ty) = 0;
1283  virtual int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
1284                                  const APInt &Imm, Type *Ty) = 0;
1285  virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0;
1286  virtual unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const = 0;
1287  virtual const char* getRegisterClassName(unsigned ClassID) const = 0;
1288  virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
1289  virtual unsigned getMinVectorRegisterBitWidth() = 0;
1290  virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
1291  virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
1292  virtual bool shouldConsiderAddressTypePromotion(
1293      const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
1294  virtual unsigned getCacheLineSize() const = 0;
1295  virtual llvm::Optional<unsigned> getCacheSize(CacheLevel Level) const = 0;
1296  virtual llvm::Optional<unsigned> getCacheAssociativity(CacheLevel Level) const = 0;
1297
1298  /// \return How far ahead of a load the corresponding prefetch instruction
1299  /// should be placed.  This distance is currently measured in number of
1300  /// instructions.
1301  virtual unsigned getPrefetchDistance() const = 0;
1302
1303  /// \return Some HW prefetchers can handle accesses up to a certain
1304  /// constant stride.  This is the minimum stride in bytes where it
1305  /// makes sense to start adding SW prefetches.  The default is 1,
1306  /// i.e. prefetch with any stride.
1307  virtual unsigned getMinPrefetchStride() const = 0;
1308
1309  /// \return The maximum number of iterations to prefetch ahead.  If
1310  /// the required number of iterations is more than this number, no
1311  /// prefetching is performed.
1312  virtual unsigned getMaxPrefetchIterationsAhead() const = 0;
1313
1314  virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
1315  virtual unsigned getArithmeticInstrCost(
1316      unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
1317      OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
1318      OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1319      const Instruction *CxtI = nullptr) = 0;
1320  virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
1321                             Type *SubTp) = 0;
1322  virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1323                               const Instruction *I) = 0;
1324  virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1325                                       VectorType *VecTy, unsigned Index) = 0;
1326  virtual int getCFInstrCost(unsigned Opcode) = 0;
1327  virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1328                                Type *CondTy, const Instruction *I) = 0;
1329  virtual int getVectorInstrCost(unsigned Opcode, Type *Val,
1330                                 unsigned Index) = 0;
1331  virtual int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1332                              unsigned AddressSpace, const Instruction *I) = 0;
1333  virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1334                                    unsigned Alignment,
1335                                    unsigned AddressSpace) = 0;
1336  virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1337                                     Value *Ptr, bool VariableMask,
1338                                     unsigned Alignment) = 0;
1339  virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
1340                                         unsigned Factor,
1341                                         ArrayRef<unsigned> Indices,
1342                                         unsigned Alignment,
1343                                         unsigned AddressSpace,
1344                                         bool UseMaskForCond = false,
1345                                         bool UseMaskForGaps = false) = 0;
1346  virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
1347                                         bool IsPairwiseForm) = 0;
1348  virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
1349                                     bool IsPairwiseForm, bool IsUnsigned) = 0;
1350  virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1351                      ArrayRef<Type *> Tys, FastMathFlags FMF,
1352                      unsigned ScalarizationCostPassed) = 0;
1353  virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1354         ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) = 0;
1355  virtual int getCallInstrCost(Function *F, Type *RetTy,
1356                               ArrayRef<Type *> Tys) = 0;
1357  virtual unsigned getNumberOfParts(Type *Tp) = 0;
1358  virtual int getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1359                                        const SCEV *Ptr) = 0;
1360  virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
1361  virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
1362                                  MemIntrinsicInfo &Info) = 0;
1363  virtual unsigned getAtomicMemIntrinsicMaxElementSize() const = 0;
1364  virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1365                                                   Type *ExpectedType) = 0;
1366  virtual Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
1367                                          unsigned SrcAlign,
1368                                          unsigned DestAlign) const = 0;
1369  virtual void getMemcpyLoopResidualLoweringType(
1370      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
1371      unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const = 0;
1372  virtual bool areInlineCompatible(const Function *Caller,
1373                                   const Function *Callee) const = 0;
1374  virtual bool
1375  areFunctionArgsABICompatible(const Function *Caller, const Function *Callee,
1376                               SmallPtrSetImpl<Argument *> &Args) const = 0;
1377  virtual bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const = 0;
1378  virtual bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const = 0;
1379  virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0;
1380  virtual bool isLegalToVectorizeLoad(LoadInst *LI) const = 0;
1381  virtual bool isLegalToVectorizeStore(StoreInst *SI) const = 0;
1382  virtual bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1383                                           unsigned Alignment,
1384                                           unsigned AddrSpace) const = 0;
1385  virtual bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1386                                            unsigned Alignment,
1387                                            unsigned AddrSpace) const = 0;
1388  virtual unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1389                                       unsigned ChainSizeInBytes,
1390                                       VectorType *VecTy) const = 0;
1391  virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1392                                        unsigned ChainSizeInBytes,
1393                                        VectorType *VecTy) const = 0;
1394  virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
1395                                     ReductionFlags) const = 0;
1396  virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
1397  virtual unsigned getGISelRematGlobalCost() const = 0;
1398  virtual int getInstructionLatency(const Instruction *I) = 0;
1399};
1400
1401template <typename T>
1402class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
1403  T Impl;
1404
1405public:
1406  Model(T Impl) : Impl(std::move(Impl)) {}
1407  ~Model() override {}
1408
1409  const DataLayout &getDataLayout() const override {
1410    return Impl.getDataLayout();
1411  }
1412
1413  int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) override {
1414    return Impl.getOperationCost(Opcode, Ty, OpTy);
1415  }
1416  int getGEPCost(Type *PointeeType, const Value *Ptr,
1417                 ArrayRef<const Value *> Operands) override {
1418    return Impl.getGEPCost(PointeeType, Ptr, Operands);
1419  }
1420  int getExtCost(const Instruction *I, const Value *Src) override {
1421    return Impl.getExtCost(I, Src);
1422  }
1423  int getCallCost(FunctionType *FTy, int NumArgs, const User *U) override {
1424    return Impl.getCallCost(FTy, NumArgs, U);
1425  }
1426  int getCallCost(const Function *F, int NumArgs, const User *U) override {
1427    return Impl.getCallCost(F, NumArgs, U);
1428  }
1429  int getCallCost(const Function *F,
1430                  ArrayRef<const Value *> Arguments, const User *U) override {
1431    return Impl.getCallCost(F, Arguments, U);
1432  }
1433  unsigned getInliningThresholdMultiplier() override {
1434    return Impl.getInliningThresholdMultiplier();
1435  }
1436  int getInlinerVectorBonusPercent() override {
1437    return Impl.getInlinerVectorBonusPercent();
1438  }
1439  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1440                       ArrayRef<Type *> ParamTys, const User *U = nullptr) override {
1441    return Impl.getIntrinsicCost(IID, RetTy, ParamTys, U);
1442  }
1443  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1444                       ArrayRef<const Value *> Arguments,
1445                       const User *U = nullptr) override {
1446    return Impl.getIntrinsicCost(IID, RetTy, Arguments, U);
1447  }
1448  int getMemcpyCost(const Instruction *I) override {
1449    return Impl.getMemcpyCost(I);
1450  }
1451  int getUserCost(const User *U, ArrayRef<const Value *> Operands) override {
1452    return Impl.getUserCost(U, Operands);
1453  }
1454  bool hasBranchDivergence() override { return Impl.hasBranchDivergence(); }
1455  bool isSourceOfDivergence(const Value *V) override {
1456    return Impl.isSourceOfDivergence(V);
1457  }
1458
1459  bool isAlwaysUniform(const Value *V) override {
1460    return Impl.isAlwaysUniform(V);
1461  }
1462
1463  unsigned getFlatAddressSpace() override {
1464    return Impl.getFlatAddressSpace();
1465  }
1466
1467  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1468                                  Intrinsic::ID IID) const override {
1469    return Impl.collectFlatAddressOperands(OpIndexes, IID);
1470  }
1471
1472  bool rewriteIntrinsicWithAddressSpace(
1473    IntrinsicInst *II, Value *OldV, Value *NewV) const override {
1474    return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
1475  }
1476
1477  bool isLoweredToCall(const Function *F) override {
1478    return Impl.isLoweredToCall(F);
1479  }
1480  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1481                               UnrollingPreferences &UP) override {
1482    return Impl.getUnrollingPreferences(L, SE, UP);
1483  }
1484  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1485                                AssumptionCache &AC,
1486                                TargetLibraryInfo *LibInfo,
1487                                HardwareLoopInfo &HWLoopInfo) override {
1488    return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
1489  }
1490  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1491                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
1492                                   DominatorTree *DT,
1493                                   const LoopAccessInfo *LAI) override {
1494    return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
1495  }
1496  bool isLegalAddImmediate(int64_t Imm) override {
1497    return Impl.isLegalAddImmediate(Imm);
1498  }
1499  bool isLegalICmpImmediate(int64_t Imm) override {
1500    return Impl.isLegalICmpImmediate(Imm);
1501  }
1502  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
1503                             bool HasBaseReg, int64_t Scale,
1504                             unsigned AddrSpace,
1505                             Instruction *I) override {
1506    return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
1507                                      Scale, AddrSpace, I);
1508  }
1509  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
1510                     TargetTransformInfo::LSRCost &C2) override {
1511    return Impl.isLSRCostLess(C1, C2);
1512  }
1513  bool canMacroFuseCmp() override {
1514    return Impl.canMacroFuseCmp();
1515  }
1516  bool canSaveCmp(Loop *L, BranchInst **BI,
1517                        ScalarEvolution *SE,
1518                        LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
1519                        TargetLibraryInfo *LibInfo) override {
1520    return Impl.canSaveCmp(L, BI, SE, LI, DT, AC, LibInfo);
1521  }
1522  bool shouldFavorPostInc() const override {
1523    return Impl.shouldFavorPostInc();
1524  }
1525  bool shouldFavorBackedgeIndex(const Loop *L) const override {
1526    return Impl.shouldFavorBackedgeIndex(L);
1527  }
1528  bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) override {
1529    return Impl.isLegalMaskedStore(DataType, Alignment);
1530  }
1531  bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) override {
1532    return Impl.isLegalMaskedLoad(DataType, Alignment);
1533  }
1534  bool isLegalNTStore(Type *DataType, Align Alignment) override {
1535    return Impl.isLegalNTStore(DataType, Alignment);
1536  }
1537  bool isLegalNTLoad(Type *DataType, Align Alignment) override {
1538    return Impl.isLegalNTLoad(DataType, Alignment);
1539  }
1540  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) override {
1541    return Impl.isLegalMaskedScatter(DataType, Alignment);
1542  }
1543  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) override {
1544    return Impl.isLegalMaskedGather(DataType, Alignment);
1545  }
1546  bool isLegalMaskedCompressStore(Type *DataType) override {
1547    return Impl.isLegalMaskedCompressStore(DataType);
1548  }
1549  bool isLegalMaskedExpandLoad(Type *DataType) override {
1550    return Impl.isLegalMaskedExpandLoad(DataType);
1551  }
1552  bool hasDivRemOp(Type *DataType, bool IsSigned) override {
1553    return Impl.hasDivRemOp(DataType, IsSigned);
1554  }
1555  bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) override {
1556    return Impl.hasVolatileVariant(I, AddrSpace);
1557  }
1558  bool prefersVectorizedAddressing() override {
1559    return Impl.prefersVectorizedAddressing();
1560  }
1561  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
1562                           bool HasBaseReg, int64_t Scale,
1563                           unsigned AddrSpace) override {
1564    return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
1565                                     Scale, AddrSpace);
1566  }
1567  bool LSRWithInstrQueries() override {
1568    return Impl.LSRWithInstrQueries();
1569  }
1570  bool isTruncateFree(Type *Ty1, Type *Ty2) override {
1571    return Impl.isTruncateFree(Ty1, Ty2);
1572  }
1573  bool isProfitableToHoist(Instruction *I) override {
1574    return Impl.isProfitableToHoist(I);
1575  }
1576  bool useAA() override { return Impl.useAA(); }
1577  bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
1578  bool shouldBuildLookupTables() override {
1579    return Impl.shouldBuildLookupTables();
1580  }
1581  bool shouldBuildLookupTablesForConstant(Constant *C) override {
1582    return Impl.shouldBuildLookupTablesForConstant(C);
1583  }
1584  bool useColdCCForColdCall(Function &F) override {
1585    return Impl.useColdCCForColdCall(F);
1586  }
1587
1588  unsigned getScalarizationOverhead(Type *Ty, bool Insert,
1589                                    bool Extract) override {
1590    return Impl.getScalarizationOverhead(Ty, Insert, Extract);
1591  }
1592  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
1593                                            unsigned VF) override {
1594    return Impl.getOperandsScalarizationOverhead(Args, VF);
1595  }
1596
1597  bool supportsEfficientVectorElementLoadStore() override {
1598    return Impl.supportsEfficientVectorElementLoadStore();
1599  }
1600
1601  bool enableAggressiveInterleaving(bool LoopHasReductions) override {
1602    return Impl.enableAggressiveInterleaving(LoopHasReductions);
1603  }
1604  MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
1605                                               bool IsZeroCmp) const override {
1606    return Impl.enableMemCmpExpansion(OptSize, IsZeroCmp);
1607  }
1608  bool enableInterleavedAccessVectorization() override {
1609    return Impl.enableInterleavedAccessVectorization();
1610  }
1611  bool enableMaskedInterleavedAccessVectorization() override {
1612    return Impl.enableMaskedInterleavedAccessVectorization();
1613  }
1614  bool isFPVectorizationPotentiallyUnsafe() override {
1615    return Impl.isFPVectorizationPotentiallyUnsafe();
1616  }
1617  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
1618                                      unsigned BitWidth, unsigned AddressSpace,
1619                                      unsigned Alignment, bool *Fast) override {
1620    return Impl.allowsMisalignedMemoryAccesses(Context, BitWidth, AddressSpace,
1621                                               Alignment, Fast);
1622  }
1623  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
1624    return Impl.getPopcntSupport(IntTyWidthInBit);
1625  }
1626  bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
1627
1628  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) override {
1629    return Impl.isFCmpOrdCheaperThanFCmpZero(Ty);
1630  }
1631
1632  int getFPOpCost(Type *Ty) override { return Impl.getFPOpCost(Ty); }
1633
1634  int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
1635                            Type *Ty) override {
1636    return Impl.getIntImmCodeSizeCost(Opc, Idx, Imm, Ty);
1637  }
1638  int getIntImmCost(const APInt &Imm, Type *Ty) override {
1639    return Impl.getIntImmCost(Imm, Ty);
1640  }
1641  int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm,
1642                        Type *Ty) override {
1643    return Impl.getIntImmCostInst(Opc, Idx, Imm, Ty);
1644  }
1645  int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
1646                          Type *Ty) override {
1647    return Impl.getIntImmCostIntrin(IID, Idx, Imm, Ty);
1648  }
1649  unsigned getNumberOfRegisters(unsigned ClassID) const override {
1650    return Impl.getNumberOfRegisters(ClassID);
1651  }
1652  unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const override {
1653    return Impl.getRegisterClassForType(Vector, Ty);
1654  }
1655  const char* getRegisterClassName(unsigned ClassID) const override {
1656    return Impl.getRegisterClassName(ClassID);
1657  }
1658  unsigned getRegisterBitWidth(bool Vector) const override {
1659    return Impl.getRegisterBitWidth(Vector);
1660  }
1661  unsigned getMinVectorRegisterBitWidth() override {
1662    return Impl.getMinVectorRegisterBitWidth();
1663  }
1664  bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
1665    return Impl.shouldMaximizeVectorBandwidth(OptSize);
1666  }
1667  unsigned getMinimumVF(unsigned ElemWidth) const override {
1668    return Impl.getMinimumVF(ElemWidth);
1669  }
1670  bool shouldConsiderAddressTypePromotion(
1671      const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
1672    return Impl.shouldConsiderAddressTypePromotion(
1673        I, AllowPromotionWithoutCommonHeader);
1674  }
1675  unsigned getCacheLineSize() const override {
1676    return Impl.getCacheLineSize();
1677  }
1678  llvm::Optional<unsigned> getCacheSize(CacheLevel Level) const override {
1679    return Impl.getCacheSize(Level);
1680  }
1681  llvm::Optional<unsigned> getCacheAssociativity(CacheLevel Level) const override {
1682    return Impl.getCacheAssociativity(Level);
1683  }
1684
1685  /// Return the preferred prefetch distance in terms of instructions.
1686  ///
1687  unsigned getPrefetchDistance() const override {
1688    return Impl.getPrefetchDistance();
1689  }
1690
1691  /// Return the minimum stride necessary to trigger software
1692  /// prefetching.
1693  ///
1694  unsigned getMinPrefetchStride() const override {
1695    return Impl.getMinPrefetchStride();
1696  }
1697
1698  /// Return the maximum prefetch distance in terms of loop
1699  /// iterations.
1700  ///
1701  unsigned getMaxPrefetchIterationsAhead() const override {
1702    return Impl.getMaxPrefetchIterationsAhead();
1703  }
1704
1705  unsigned getMaxInterleaveFactor(unsigned VF) override {
1706    return Impl.getMaxInterleaveFactor(VF);
1707  }
1708  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
1709                                            unsigned &JTSize,
1710                                            ProfileSummaryInfo *PSI,
1711                                            BlockFrequencyInfo *BFI) override {
1712    return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI);
1713  }
1714  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
1715                                  OperandValueKind Opd1Info,
1716                                  OperandValueKind Opd2Info,
1717                                  OperandValueProperties Opd1PropInfo,
1718                                  OperandValueProperties Opd2PropInfo,
1719                                  ArrayRef<const Value *> Args,
1720                                  const Instruction *CxtI = nullptr) override {
1721    return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
1722                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
1723  }
1724  int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
1725                     Type *SubTp) override {
1726    return Impl.getShuffleCost(Kind, Tp, Index, SubTp);
1727  }
1728  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1729                       const Instruction *I) override {
1730    return Impl.getCastInstrCost(Opcode, Dst, Src, I);
1731  }
1732  int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
1733                               unsigned Index) override {
1734    return Impl.getExtractWithExtendCost(Opcode, Dst, VecTy, Index);
1735  }
1736  int getCFInstrCost(unsigned Opcode) override {
1737    return Impl.getCFInstrCost(Opcode);
1738  }
1739  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1740                         const Instruction *I) override {
1741    return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1742  }
1743  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) override {
1744    return Impl.getVectorInstrCost(Opcode, Val, Index);
1745  }
1746  int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1747                      unsigned AddressSpace, const Instruction *I) override {
1748    return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
1749  }
1750  int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1751                            unsigned AddressSpace) override {
1752    return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
1753  }
1754  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1755                             Value *Ptr, bool VariableMask,
1756                             unsigned Alignment) override {
1757    return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1758                                       Alignment);
1759  }
1760  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
1761                                 ArrayRef<unsigned> Indices, unsigned Alignment,
1762                                 unsigned AddressSpace, bool UseMaskForCond,
1763                                 bool UseMaskForGaps) override {
1764    return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1765                                           Alignment, AddressSpace,
1766                                           UseMaskForCond, UseMaskForGaps);
1767  }
1768  int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
1769                                 bool IsPairwiseForm) override {
1770    return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm);
1771  }
1772  int getMinMaxReductionCost(Type *Ty, Type *CondTy,
1773                             bool IsPairwiseForm, bool IsUnsigned) override {
1774    return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned);
1775  }
1776  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
1777               FastMathFlags FMF, unsigned ScalarizationCostPassed) override {
1778    return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
1779                                      ScalarizationCostPassed);
1780  }
1781  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1782       ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) override {
1783    return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
1784  }
1785  int getCallInstrCost(Function *F, Type *RetTy,
1786                       ArrayRef<Type *> Tys) override {
1787    return Impl.getCallInstrCost(F, RetTy, Tys);
1788  }
1789  unsigned getNumberOfParts(Type *Tp) override {
1790    return Impl.getNumberOfParts(Tp);
1791  }
1792  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1793                                const SCEV *Ptr) override {
1794    return Impl.getAddressComputationCost(Ty, SE, Ptr);
1795  }
1796  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override {
1797    return Impl.getCostOfKeepingLiveOverCall(Tys);
1798  }
1799  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
1800                          MemIntrinsicInfo &Info) override {
1801    return Impl.getTgtMemIntrinsic(Inst, Info);
1802  }
1803  unsigned getAtomicMemIntrinsicMaxElementSize() const override {
1804    return Impl.getAtomicMemIntrinsicMaxElementSize();
1805  }
1806  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1807                                           Type *ExpectedType) override {
1808    return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
1809  }
1810  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
1811                                  unsigned SrcAlign,
1812                                  unsigned DestAlign) const override {
1813    return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAlign, DestAlign);
1814  }
1815  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
1816                                         LLVMContext &Context,
1817                                         unsigned RemainingBytes,
1818                                         unsigned SrcAlign,
1819                                         unsigned DestAlign) const override {
1820    Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
1821                                           SrcAlign, DestAlign);
1822  }
1823  bool areInlineCompatible(const Function *Caller,
1824                           const Function *Callee) const override {
1825    return Impl.areInlineCompatible(Caller, Callee);
1826  }
1827  bool areFunctionArgsABICompatible(
1828      const Function *Caller, const Function *Callee,
1829      SmallPtrSetImpl<Argument *> &Args) const override {
1830    return Impl.areFunctionArgsABICompatible(Caller, Callee, Args);
1831  }
1832  bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const override {
1833    return Impl.isIndexedLoadLegal(Mode, Ty, getDataLayout());
1834  }
1835  bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const override {
1836    return Impl.isIndexedStoreLegal(Mode, Ty, getDataLayout());
1837  }
1838  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override {
1839    return Impl.getLoadStoreVecRegBitWidth(AddrSpace);
1840  }
1841  bool isLegalToVectorizeLoad(LoadInst *LI) const override {
1842    return Impl.isLegalToVectorizeLoad(LI);
1843  }
1844  bool isLegalToVectorizeStore(StoreInst *SI) const override {
1845    return Impl.isLegalToVectorizeStore(SI);
1846  }
1847  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1848                                   unsigned Alignment,
1849                                   unsigned AddrSpace) const override {
1850    return Impl.isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment,
1851                                            AddrSpace);
1852  }
1853  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1854                                    unsigned Alignment,
1855                                    unsigned AddrSpace) const override {
1856    return Impl.isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,
1857                                             AddrSpace);
1858  }
1859  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1860                               unsigned ChainSizeInBytes,
1861                               VectorType *VecTy) const override {
1862    return Impl.getLoadVectorFactor(VF, LoadSize, ChainSizeInBytes, VecTy);
1863  }
1864  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1865                                unsigned ChainSizeInBytes,
1866                                VectorType *VecTy) const override {
1867    return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
1868  }
1869  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
1870                             ReductionFlags Flags) const override {
1871    return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
1872  }
1873  bool shouldExpandReduction(const IntrinsicInst *II) const override {
1874    return Impl.shouldExpandReduction(II);
1875  }
1876
1877  unsigned getGISelRematGlobalCost() const override {
1878    return Impl.getGISelRematGlobalCost();
1879  }
1880
1881  int getInstructionLatency(const Instruction *I) override {
1882    return Impl.getInstructionLatency(I);
1883  }
1884};
1885
1886template <typename T>
1887TargetTransformInfo::TargetTransformInfo(T Impl)
1888    : TTIImpl(new Model<T>(Impl)) {}
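// A concrete target implementation object is moved into a Model<T> here; for
// illustration only (the implementation type is hypothetical):
//
//   SomeTargetTTIImpl Impl(TM, F);
//   TargetTransformInfo TTI(std::move(Impl));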
1889
1890/// Analysis pass providing the \c TargetTransformInfo.
1891///
1892/// The core idea of the TargetIRAnalysis is to expose an interface through
1893/// which LLVM targets can analyze and provide information about the middle
1894/// end's target-independent IR. This supports use cases such as target-aware
1895/// cost modeling of IR constructs.
1896///
1897/// This is a function analysis because much of the cost modeling for targets
1898  /// is done in a subtarget-specific way and LLVM supports compiling different
1899/// functions targeting different subtargets in order to support runtime
1900/// dispatch according to the observed subtarget.
1901class TargetIRAnalysis : public AnalysisInfoMixin<TargetIRAnalysis> {
1902public:
1903  typedef TargetTransformInfo Result;
1904
1905  /// Default construct a target IR analysis.
1906  ///
1907  /// This will use the module's datalayout to construct a baseline
1908  /// conservative TTI result.
1909  TargetIRAnalysis();
1910
1911  /// Construct an IR analysis pass around a target-provided callback.
1912  ///
1913  /// The callback will be called with a particular function for which the TTI
1914  /// is needed and must return a TTI object for that function.
1915  TargetIRAnalysis(std::function<Result(const Function &)> TTICallback);
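  // For example (a sketch; TM is a hypothetical TargetMachine pointer), a
  // target-aware build can wire its own TTI into this analysis like so:
  //
  //   TargetIRAnalysis TIRA([TM](const Function &F) {
  //     return TM->getTargetTransformInfo(F);
  //   });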
1916
1917  // Value semantics. We spell out the constructors for MSVC.
1918  TargetIRAnalysis(const TargetIRAnalysis &Arg)
1919      : TTICallback(Arg.TTICallback) {}
1920  TargetIRAnalysis(TargetIRAnalysis &&Arg)
1921      : TTICallback(std::move(Arg.TTICallback)) {}
1922  TargetIRAnalysis &operator=(const TargetIRAnalysis &RHS) {
1923    TTICallback = RHS.TTICallback;
1924    return *this;
1925  }
1926  TargetIRAnalysis &operator=(TargetIRAnalysis &&RHS) {
1927    TTICallback = std::move(RHS.TTICallback);
1928    return *this;
1929  }
1930
1931  Result run(const Function &F, FunctionAnalysisManager &);
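  // With the new pass manager the per-function result is obtained as usual
  // for an analysis, e.g. given a FunctionAnalysisManager FAM:
  //
  //   TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);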
1932
1933private:
1934  friend AnalysisInfoMixin<TargetIRAnalysis>;
1935  static AnalysisKey Key;
1936
1937  /// The callback used to produce a result.
1938  ///
1939  /// We use a completely opaque callback so that targets can provide whatever
1940  /// mechanism they desire for constructing the TTI for a given function.
1941  ///
1942  /// FIXME: Should we really use std::function? It's relatively inefficient.
1943  /// It might be possible to arrange for even stateful callbacks to outlive
1944  /// the analysis and thus use a function_ref which would be lighter weight.
1945  /// This may also be less error prone as the callback is likely to reference
1946  /// the external TargetMachine, and that reference needs to never dangle.
1947  std::function<Result(const Function &)> TTICallback;
1948
1949  /// Helper function used as the callback in the default constructor.
1950  static Result getDefaultTTI(const Function &F);
1951};
1952
1953/// Wrapper pass for TargetTransformInfo.
1954///
1955  /// This pass can be constructed from a TTI object, which it stores
1956  /// internally and makes available to the passes that query it.
1957class TargetTransformInfoWrapperPass : public ImmutablePass {
1958  TargetIRAnalysis TIRA;
1959  Optional<TargetTransformInfo> TTI;
1960
1961  virtual void anchor();
1962
1963public:
1964  static char ID;
1965
1966  /// We must provide a default constructor for the pass, but it should
1967  /// never be used.
1968  ///
1969  /// Use the constructor below or call one of the creation routines.
1970  TargetTransformInfoWrapperPass();
1971
1972  explicit TargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
1973
1974  TargetTransformInfo &getTTI(const Function &F);
1975};
1976
1977/// Create an analysis pass wrapper around a TTI object.
1978///
1979/// This analysis pass just holds the TTI instance and makes it available to
1980/// clients.
1981ImmutablePass *createTargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
1982
1983} // End llvm namespace
1984
1985#endif
1986