MachineCombiner.cpp revision 360784
//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure that the
// combined instructions do not lengthen the critical path or increase the
// resource length.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instructions combined");

static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions than this threshold."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instructions"),
                                cl::init(false));

static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
#ifdef EXPENSIVE_CHECKS
    cl::init(true));
#else
    cl::init(false));
#endif

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineLoopInfo *MLI; // Current MachineLoopInfo
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;
  MachineBlockFrequencyInfo *MBFI;
  ProfileSummaryInfo *PSI;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern, bool SlackIsAccurate);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<MachineCombinerPattern, 16> &Patterns);
};
} // end anonymous namespace

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth, so do not treat them as definitions.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
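///
/// The depth of each inserted instruction is computed as the maximum over its
/// use operands of (operand depth + operand latency); depths come from the
/// trace for registers that are already part of the trace and from InstrDepth
/// for registers defined earlier in \p InsInstrs.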
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on its
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each new instruction
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register that is not part of the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
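///
/// For each register that \p NewRoot defines, the operand latency to the
/// first in-trace user is used when such a user exists; otherwise the plain
/// instruction latency of \p NewRoot is used.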
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO: skip the head of the use-def
    // list, which is expected to be this definition itself.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth, // The data dependency chain must be improved.
  Default          // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return CombinerObjective::Default;
  }
}

/// Estimate the latency of the new and original instruction sequences by
/// summing up the latencies of the inserted and deleted instructions. This
/// assumes that the inserted and deleted instructions are dependent
/// instruction chains, which might not hold in all cases.
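///
/// Concretely: NewRootLatency is the sum of the latencies of all inserted
/// instructions except the last one (NewRoot), plus NewRoot's latency from
/// getLatency(); RootLatency is the sum of the latencies of all deleted
/// instructions.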
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
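///
/// In the default case the transform is accepted when, measured in cycles,
///   NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack
/// where the slack term is only included when it is known to be accurate.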
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern,
    bool SlackIsAccurate) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << "  Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t  and it does it\n"
                   : dbgs() << "\t  but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions by
  // summing up the latencies of each sequence.
  unsigned NewRootLatency, RootLatency;
  std::tie(NewRootLatency, RootLatency) =
      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount;);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t  It IMPROVES PathLen because"
                 : dbgs() << "\n\t  It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into scheduling classes.
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
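///
/// The resource length of a trace is the number of cycles it requires when
/// only execution-resource constraints are considered; the check below
/// compares the block's resource length before and after the substitution.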
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Work with scheduling classes (SC) rather than instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n";);
  LLVM_DEBUG(
      ResLenAfterCombine <= ResLenBeforeCombine
          ? dbgs() << "\t\t  As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t  As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <= ResLenBeforeCombine;
}

/// \returns true when the new instruction sequence should be generated
/// regardless of whether it lengthens the critical path or not.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize,
                                   bool OptForSize) {
  if (OptForSize && (NewSize < OldSize))
    return true;
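  // Without a scheduling model or itineraries the cost of the transform
  // cannot be evaluated, so substitute unconditionally.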
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}

/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param MinInstr is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
///                          otherwise invalidate the trace
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                                     SmallVector<MachineInstr *, 16> InsInstrs,
                                     SmallVector<MachineInstr *, 16> DelInstrs,
                                     MachineTraceMetrics::Ensemble *MinInstr,
                                     SparseSet<LiveRegUnit> &RegUnits,
                                     bool IncrementalUpdate) {
  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    MinInstr->invalidate(MBB);

  NumInstCombined++;
}

// Check that the difference between original and new latency is non-increasing
// for later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(
    MachineBasicBlock *MBB, MachineInstr &Root,
    SmallVector<MachineCombinerPattern, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.
  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!MinInstr)
    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);

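  // Live register units, maintained for incremental depth updates so that
  // MachineTraceMetrics can recompute instruction depths without invalidating
  // the whole trace.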
  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

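  // Optimize for size when the function carries the optsize attribute or when
  // profile data (PGSO) marks this block as one to optimize for size.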
  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size, it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.

    if (!TII->getMachineCombinerPatterns(MI, Patterns))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
      });

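      // Patterns the target declares as throughput-improving are always
      // applied inside loops, without consulting the critical-path heuristic.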
584      if (ML && TII->isThroughputPattern(P))
585        SubstituteAlways = true;
586
587      if (IncrementalUpdate) {
588        // Update depths since the last incremental update.
589        MinInstr->updateDepths(LastUpdate, BlockIter, RegUnits);
590        LastUpdate = BlockIter;
591      }
592
593      // Substitute when we optimize for codesize and the new sequence has
594      // fewer instructions OR
595      // the new sequence neither lengthens the critical path nor increases
596      // resource pressure.
597      if (SubstituteAlways ||
598          doSubstitute(NewInstCount, OldInstCount, OptForSize)) {
599        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
600                                 RegUnits, IncrementalUpdate);
601        // Eagerly stop after the first pattern fires.
602        Changed = true;
603        break;
604      } else {
605        // For big basic blocks, we only compute the full trace the first time
606        // we hit this. We do not invalidate the trace, but instead update the
607        // instruction depths incrementally.
608        // NOTE: Only the instruction depths up to MI are accurate. All other
609        // trace information is not updated.
610        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
611        Traces->verifyAnalysis();
612        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
613                                    InstrIdxForVirtReg, P,
614                                    !IncrementalUpdate) &&
615            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
616          if (MBB->size() > inc_threshold) {
617            // Use incremental depth updates for basic blocks above treshold
618            IncrementalUpdate = true;
619            LastUpdate = BlockIter;
620          }
621
622          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
623                                   RegUnits, IncrementalUpdate);
624
625          // Eagerly stop after the first pattern fires.
626          Changed = true;
627          break;
628        }
629        // Cleanup instructions of the alternative code sequence. There is no
630        // use for them.
631        MachineFunction *MF = MBB->getParent();
632        for (auto *InstrPtr : InsInstrs)
633          MF->DeleteMachineInstr(InstrPtr);
634      }
635      InstrIdxForVirtReg.clear();
636    }
637  }
638
639  if (Changed && IncrementalUpdate)
640    Traces->invalidate(MBB);
641  return Changed;
642}
643
644bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
645  STI = &MF.getSubtarget();
646  TII = STI->getInstrInfo();
647  TRI = STI->getRegisterInfo();
648  SchedModel = STI->getSchedModel();
649  TSchedModel.init(STI);
650  MRI = &MF.getRegInfo();
651  MLI = &getAnalysis<MachineLoopInfo>();
652  Traces = &getAnalysis<MachineTraceMetrics>();
653  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
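  // Block frequency information is only needed for profile-guided size
  // optimization, so the lazy BFI is materialized only when a profile summary
  // is available.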
  MBFI = (PSI && PSI->hasProfileSummary()) ?
         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
         nullptr;
  MinInstr = nullptr;
  OptSize = MF.getFunction().hasOptSize();

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}