//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all SGPR load instructions issued before your current instruction
//   to have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover, some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler behave
// unpredictably. For example when register pressure becomes high, it can
// either manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// Some other bad behaviours show up as well, like loading a constant into a
// VGPR at the beginning of the shader when it won't be needed until the end
// of the shader.
//
// The scheduling problem for SI can be split into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from
// different wavefronts if they are of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks also helps keep register usage low.
//
// First the instructions are put into blocks.
//   We want the blocks to help control register usage and hide high
//   latencies later. To help control register usage, we typically want all
//   local computations, for example when you create a result that can be
//   consumed right away, to be contained in a block. Block inputs and
//   outputs would typically be important results that are needed in several
//   locations of the shader. Since we do want blocks to help hide high
//   latencies, we want the instructions inside the block to have a minimal
//   set of dependencies on high latencies. That makes it easy to pick
//   blocks to hide specific high latencies.
//   The block creation algorithm is divided into several steps, and several
//   variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
//   At this step we only take into account register usage and hiding
//   low latency instructions.
//
// Third the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
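//
// A rough sketch of how those phases map onto the classes defined below
// (names are the ones used in this file):
//   SIScheduleBlockCreator   - colors the SUs and groups the SUs of the
//                              same color into SIScheduleBlocks (step one)
//   SIScheduleBlock          - schedules the instructions inside one block
//                              (step two)
//   SIScheduleBlockScheduler - chooses the order of the blocks (step three)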
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency for the cache (or global memory) data to reach the
// register, and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if the wavefront count is
// high, we have to try to have as many instructions hiding high latencies
// as possible.
// The OpenCL doc gives for example a latency of 400 cycles for a global
// memory access, which is hidden by 10 instructions if the wavefront count
// is 10.
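// (Back-of-envelope check of that figure: with 10 wavefronts interleaved
// on the same SIMD and one instruction issued every 4 cycles per wavefront,
// roughly 10 * 4 = 40 cycles elapse between two instructions of a given
// wavefront, so 400 / 40 = 10 of its instructions cover the 400 cycles.
// This is only an estimate under the assumptions above.)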

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64-byte
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Units".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory accesses can
// take 300-600 cycles. We do not specially take that into account when
// scheduling, as we expect the driver to be able to preload the constants
// soon.

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:         return "NOCAND";
  case RegUsage:       return "REGUSAGE";
  case Latency:        return "LATENCY";
  case Successor:      return "SUCCESSOR";
  case Depth:          return "DEPTH";
  case NodeOrder:      return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm {
namespace SISched {
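// The two helpers below return true when the comparison between TryCand and
// Cand is decisive (the winner's Reason is updated), and false on a tie.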
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace SISched
} // end namespace llvm

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as close to the top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //     - (eventually some more low latency instructions)
  //     - instructions that depend on the first low latency instructions.
  // If the block contains a lot of constant loads, SGPR usage can get quite
  // high; going above the arbitrary limit of 60 encourages using the already
  // loaded constants (in order to release some SGPRs) before loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

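// Pick the best candidate among TopReadySUs to schedule next, comparing
// each ready SU's predicted register usage and latency flags with
// tryCandidateTopDown.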
SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
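// (Here "valid" only means the dependencies are respected; the quality of
// the order does not matter. The result is used to pack the block's MIs
// together and to compute liveness before the real scheduling pass.)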
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns true if the register was defined between First and Last.
static bool isDefBetween(unsigned Reg,
                           SlotIndex First, SlotIndex Last,
                           const MachineRegisterInfo *MRI,
                           const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes things up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (Register::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several cases to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only regs whose content will be read
  // after in another block, and whose content was written in the current
  // block, that is we want it to get 1, 3, 5, 7.
  // Since we packed the MIs of a block all together before scheduling, the
  // LiveIntervals were correct, and the RPTracker was able to correctly
  // handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for the RPTracker to avoid mistakes for
  // case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of isDefBetween removes the case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (Register::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum of the sizes of the alive registers.
  // Internally llvm will represent some registers as big 128-bit registers
  // for example, but they actually correspond to 4 actual 32-bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block (if InOrOutBlock
/// is true) or outside the block (if false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait
  // either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the
  // block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if Pred is not already a predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if Succ is not already a successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(std::make_pair(Succ, Kind));

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  } else {
    return B->second;
  }
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

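// Give each high latency SU its own reserved color: high latency
// instructions are never grouped together in one block by this variant.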
void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

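// Group high latency SUs under a shared reserved color (2 to 4 per group
// depending on how many there are), pulling the order-dependency subgraph
// between them into the group when it is small and carries no data
// dependency.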
void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        else if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        else {
          // Check the type of dependency
          for (unsigned k : SubGraph) {
            // If in the path to join the two instructions,
            // there is another high latency instruction,
            // or instructions colored for another block,
            // abort the merge.
            if (DAG->IsHighLatencySU[k] ||
                (CurrentColoring[k] != ProposedColor &&
                 CurrentColoring[k] != 0)) {
              CompatibleGroup = false;
              break;
            }
            // If one of the SUs in the subgraph depends on the result of
            // SU j, there'll be a data dependency.
            if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
              CompatibleGroup = false;
              break;
            }
          }
          if (!CompatibleGroup)
            break;
          // Same check for the SU
          if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
          // Add all the required instructions to the block.
          // These cannot live in another block (because they
          // depend (order dependency) on one of the
          // instructions in the block, and are required for the
          // high latency instruction we add).
          AdditionalElements.insert(AdditionalElements.end(),
                                    SubGraph.begin(), SubGraph.end());
        }
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as successors.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down and bottom up
  // reserved node dependency colorings gets its own color.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, that the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // Check the EXP can be added to the group safely,
      // i.e. without needing any other instruction.
      // The EXP is allowed to depend on other EXPs
      // (they will be in the same group).
      for (unsigned j : ExpGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other

        // SubGraph contains all the instructions required
        // between EXP SUnits[j] and EXP SU.
        for (unsigned k : SubGraph) {
          if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
            // Instructions other than EXP would be required in the group.
            // Abort the grouping.
            return;
        }
      }

      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links that a previous scheduling variant may have overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

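  // Colors in [1, DAGSize] are "reserved" (high latency groups); colors
  // above DAGSize are ordinary ones. The merge passes above only touch
  // non-reserved colors, hence their `<= DAGSize` early-outs.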
  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of the same color into the same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free roots and leaves of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
             for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
               SIScheduleBlock *Block = CurrentBlocks[i];
               Block->printDebug(true);
             });
}

// Taken from Codegen/MachineScheduler.cpp

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

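  // TopDownBlock2Index temporarily holds the number of unprocessed
  // successors of each block; blocks with no successor seed the worklist,
  // and indices are then assigned in decreasing order (reverse topological).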
1289    SIScheduleBlock *Block = CurrentBlocks[i];
1290    unsigned Degree = Block->getSuccs().size();
1291    TopDownBlock2Index[i] = Degree;
1292    if (Degree == 0) {
1293      WorkList.push_back(i);
1294    }
1295  }
1296
1297  int Id = DAGSize;
1298  while (!WorkList.empty()) {
1299    int i = WorkList.back();
1300    SIScheduleBlock *Block = CurrentBlocks[i];
1301    WorkList.pop_back();
1302    TopDownBlock2Index[i] = --Id;
1303    TopDownIndex2Block[Id] = i;
1304    for (SIScheduleBlock* Pred : Block->getPreds()) {
1305      if (!--TopDownBlock2Index[Pred->getID()])
1306        WorkList.push_back(Pred->getID());
1307    }
1308  }
1309
1310#ifndef NDEBUG
1311  // Check correctness of the ordering.
1312  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1313    SIScheduleBlock *Block = CurrentBlocks[i];
1314    for (SIScheduleBlock* Pred : Block->getPreds()) {
1315      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
1316      "Wrong Top Down topological sorting");
1317    }
1318  }
1319#endif
1320
1321  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
1322                                         TopDownIndex2Block.rend());
1323}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first compute a valid schedule, such that each Block corresponds
  // to a contiguous range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring the previous
  // position, is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there were a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->printDebug(true);
  });
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

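  // Block->Depth is the longest cost-weighted path from any root block to
  // this block; Block->Height is the symmetric bottom-up quantity.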
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}
1437
1438// SIScheduleBlockScheduler //
1439
1440SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
1441                                                   SISchedulerBlockSchedulerVariant Variant,
1442                                                   SIScheduleBlocks  BlocksStruct) :
1443  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
1444  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
1445  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
1446
1447  // Fill the usage of every output
1448  // Warning: while by construction we always have a link between two blocks
1449  // when one needs a result from the other, the number of users of an output
1450  // is not the sum of child blocks having as input the same virtual register.
1451  // Here is an example. A produces x and y. B eats x and produces x'.
1452  // C eats x' and y. The register coalescer may have attributed the same
1453  // virtual register to x and x'.
1454  // To count accurately, we do a topological sort. In case the register is
1455  // found for several parents, we increment the usage of the one with the
1456  // highest topological index.
1457  LiveOutRegsNumUsages.resize(Blocks.size());
1458  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1459    SIScheduleBlock *Block = Blocks[i];
1460    for (unsigned Reg : Block->getInRegs()) {
1461      bool Found = false;
1462      int topoInd = -1;
1463      for (SIScheduleBlock* Pred: Block->getPreds()) {
1464        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
1465        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1466
1467        if (RegPos != PredOutRegs.end()) {
1468          Found = true;
1469          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
1470            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
1471          }
1472        }
1473      }
1474
1475      if (!Found)
1476        continue;
1477
1478      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
1479      ++LiveOutRegsNumUsages[PredID][Reg];
1480    }
1481  }
1482
1483  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
1484  BlockNumPredsLeft.resize(Blocks.size());
1485  BlockNumSuccsLeft.resize(Blocks.size());
1486
1487  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1488    SIScheduleBlock *Block = Blocks[i];
1489    BlockNumPredsLeft[i] = Block->getPreds().size();
1490    BlockNumSuccsLeft[i] = Block->getSuccs().size();
1491  }
1492
1493#ifndef NDEBUG
1494  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1495    SIScheduleBlock *Block = Blocks[i];
1496    assert(Block->getID() == i);
1497  }
1498#endif
1499
1500  std::set<unsigned> InRegs = DAG->getInRegs();
1501  addLiveRegs(InRegs);
1502
1503  // Increase LiveOutRegsNumUsages for blocks
1504  // producing registers consumed in another
1505  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Traverse in reverse topological order.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for registers that were already
  // defined before this scheduling region.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}

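// Tie-breaking cascade when picking for latency: prefer the candidate whose
// high-latency parents were scheduled longest ago (their results are most
// likely ready), then blocks that themselves contain high-latency
// instructions (issue them early so their latency can be hidden), then
// height and the number of high-latency successors.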
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled,
                       TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}

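// Pick the most promising ready block. Depending on the scheduler variant
// and on the current VGPR pressure, either the latency-oriented or the
// register-usage-oriented tie-breakers are consulted first.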
SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
             for (SIScheduleBlock *Block
                  : ReadyBlocks) dbgs()
             << Block->getID() << ' ';
             dbgs() << "\nCurrent Live:\n";
             for (unsigned Reg
                  : LiveRegs) dbgs()
             << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
             dbgs() << '\n';
             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned)std::max<int>(0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}

// Tracking of currently live registers to determine VGPR usage.
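// A register enters LiveRegs when a scheduled block produces it and leaves
// once its last consumer block has been scheduled; LiveRegsConsumers keeps
// the per-register count of remaining consumer blocks.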

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    if (!Register::isVirtualRegister(Reg))
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert(Pos != LiveRegs.end() && // Reg must be live.
           LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
           LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
  }
}

void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (std::map<unsigned, unsigned>::iterator RegI =
       LiveOutRegsNumUsages[Block->getID()].begin(),
       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
    std::pair<unsigned, unsigned> RegP = *RegI;
    // We produce this register, thus it must not be previously alive.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

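// Estimate, for each register pressure set, how pressure would change if a
// block with the given input and output registers were scheduled next:
// inputs for which this block is the last remaining consumer stop being
// live, and outputs become live.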
std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (unsigned Reg : InRegs) {
    // For now only track virtual registers.
    if (!Register::isVirtualRegister(Reg))
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (unsigned Reg : OutRegs) {
    // For now only track virtual registers.
    if (!Register::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    SIScheduleBlock *Block = ScheduledBlocks[b];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  VGPRSetID = SITRI->getVGPRPressureSet();
  SGPRSetID = SITRI->getSGPRPressureSet();
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from scheduleDAG.cpp.
// Does a topological sort over the SUs.
// Computes both the TopDown and BottomUp orderings.
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low-latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would take SGPR usage
// into account (based on VGPR usage and the corresponding wavefront
// count), and that would try to merge groups of loads if it makes
// sense, etc.
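// Illustrative example (hypothetical instruction sequence): given the order
//   load0, independent_alu, load1, use(load0), use(load1)
// the pass hoists load1 next to load0:
//   load0, load1, independent_alu, use(load0), use(load1)
// grouping the loads and moving load1 further from its user.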
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep &PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions that the
    // low-latency instructions depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep &SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
          continue;
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the VGPR and SGPR usage corresponding to some virtual registers.
template<typename _Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (_Iterator RegI = First; RegI != End; ++RegI) {
    unsigned Reg = *RegI;
    // For now only track virtual registers.
    if (!Register::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == VGPRSetID)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == SGPRSetID)
        SgprUsage += PSetI.getWeight();
    }
  }
}

void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  LLVM_DEBUG(dump());

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do
  // not run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    const MachineOperand *BaseLatOp;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // If VGPR usage is extremely high, try other well-performing variants
  // that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (const auto &v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // that perform worse, but could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (const auto &v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                      << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  LLVM_DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}