GCNRegBankReassign.cpp revision 360784
1//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// \brief Try to reassign registers on GFX10+ to reduce register bank
11/// conflicts.
12///
13/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
14/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
15/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
16/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
17///
18/// The shader can read one dword from each of these banks once per cycle.
19/// If an instruction has to read more register operands from the same bank
20/// an additional cycle is needed. HW attempts to pre-load registers through
21/// input operand gathering, but a stall cycle may occur if that fails. For
22/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
23/// potentially incuring 2 stall cycles.
24///
25/// The pass tries to reassign registers to reduce bank conflicts.
26///
27/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
28/// that 4 has to be subtracted from an SGPR bank number to get the real value.
29/// This also corresponds to bit numbers in bank masks used in the pass.
30///
31//===----------------------------------------------------------------------===//
32
33#include "AMDGPU.h"
34#include "AMDGPUSubtarget.h"
35#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
36#include "SIInstrInfo.h"
37#include "SIMachineFunctionInfo.h"
38#include "llvm/ADT/SmallSet.h"
39#include "llvm/ADT/Statistic.h"
40#include "llvm/CodeGen/LiveInterval.h"
41#include "llvm/CodeGen/LiveIntervals.h"
42#include "llvm/CodeGen/LiveRegMatrix.h"
43#include "llvm/CodeGen/MachineFunctionPass.h"
44#include "llvm/CodeGen/MachineLoopInfo.h"
45#include "llvm/CodeGen/VirtRegMap.h"
46#include "llvm/InitializePasses.h"
47#include "llvm/Support/MathExtras.h"
48
49using namespace llvm;
50
51static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
52  cl::desc("Verify stall cycles in the regbanks reassign pass"),
53  cl::value_desc("0|1|2"),
54  cl::init(0), cl::Hidden);
55
56#define DEBUG_TYPE "amdgpu-regbanks-reassign"
57
58#define NUM_VGPR_BANKS 4
59#define NUM_SGPR_BANKS 8
60#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
61#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
62#define VGPR_BANK_MASK 0xf
63#define SGPR_BANK_MASK 0xff0
64#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
65
66STATISTIC(NumStallsDetected,
67          "Number of operand read stalls detected");
68STATISTIC(NumStallsRecovered,
69          "Number of operand read stalls recovered");
70
71namespace {
72
73class GCNRegBankReassign : public MachineFunctionPass {
74
75  class OperandMask {
76  public:
77    OperandMask(unsigned r, unsigned s, unsigned m)
78      : Reg(r), SubReg(s), Mask(m) {}
79    unsigned Reg;
80    unsigned SubReg;
81    unsigned Mask;
82  };
83
84  class Candidate {
85  public:
86    Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
87              unsigned weight)
88      : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
89
90    bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
91
92#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
93    void dump(const GCNRegBankReassign *P) const {
94      MI->dump();
95      dbgs() << P->printReg(Reg) << " to banks ";
96      dumpFreeBanks(FreeBanks);
97      dbgs() << " weight " << Weight << '\n';
98    }
99#endif
100
101    MachineInstr *MI;
102    unsigned Reg;
103    unsigned FreeBanks;
104    unsigned Weight;
105  };
106
107  class CandidateList : public std::list<Candidate> {
108  public:
109    // Speedup subsequent sort.
110    void push(const Candidate&& C) {
111      if (C.Weight) push_back(C);
112      else push_front(C);
113    }
114  };
115
116public:
117  static char ID;
118
119public:
120  GCNRegBankReassign() : MachineFunctionPass(ID) {
121    initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
122  }
123
124  bool runOnMachineFunction(MachineFunction &MF) override;
125
126  StringRef getPassName() const override { return "GCN RegBank Reassign"; }
127
128  void getAnalysisUsage(AnalysisUsage &AU) const override {
129    AU.addRequired<MachineLoopInfo>();
130    AU.addRequired<LiveIntervals>();
131    AU.addRequired<VirtRegMap>();
132    AU.addRequired<LiveRegMatrix>();
133    AU.setPreservesAll();
134    MachineFunctionPass::getAnalysisUsage(AU);
135  }
136
137private:
138  const GCNSubtarget *ST;
139
140  const MachineRegisterInfo *MRI;
141
142  const SIRegisterInfo *TRI;
143
144  MachineLoopInfo *MLI;
145
146  VirtRegMap *VRM;
147
148  LiveRegMatrix *LRM;
149
150  LiveIntervals *LIS;
151
152  unsigned MaxNumVGPRs;
153
154  unsigned MaxNumSGPRs;
155
156  BitVector RegsUsed;
157
158  SmallVector<OperandMask, 8> OperandMasks;
159
160  CandidateList Candidates;
161
162  const MCPhysReg *CSRegs;
163
164  // Returns bank for a phys reg.
165  unsigned getPhysRegBank(unsigned Reg) const;
166
167  // Return a bit set for each register bank used. 4 banks for VGPRs and
168  // 8 banks for SGPRs.
169  // Registers already processed and recorded in RegsUsed are excluded.
170  // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
171  unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
172
173  // Return number of stalls in the instructions.
174  // UsedBanks has bits set for the banks used by all operands.
175  // If Reg and Bank provided substitute the Reg with the Bank.
176  unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks,
177                       unsigned Reg = AMDGPU::NoRegister, int Bank = -1);
178
179  // Return true if register is regular VGPR or SGPR or their tuples.
180  // Returns false for special registers like m0, vcc etc.
181  bool isReassignable(unsigned Reg) const;
182
183  // Check if registers' defs are old and may be pre-loaded.
184  // Returns 0 if both registers are old enough, 1 or 2 if one or both
185  // registers will not likely be pre-loaded.
186  unsigned getOperandGatherWeight(const MachineInstr& MI,
187                                  unsigned Reg1,
188                                  unsigned Reg2,
189                                  unsigned StallCycles) const;
190
191
192  // Find all bank bits in UsedBanks where Mask can be relocated to.
193  unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
194
195  // Find all bank bits in UsedBanks where Mask can be relocated to.
196  // Bank is relative to the register and not its subregister component.
197  // Returns 0 is a register is not reassignable.
198  unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
199                        unsigned UsedBanks) const;
200
201  // Add cadidate instruction to the work list.
202  void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
203                         unsigned StallCycles);
204
205  // Collect cadidate instructions across function. Returns a number stall
206  // cycles detected. Only counts stalls if Collect is false.
207  unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
208
209  // Remove all candidates that read specified register.
210  void removeCandidates(unsigned Reg);
211
212  // Compute stalls within the uses of SrcReg replaced by a register from
213  // Bank. If Bank is -1 does not perform substitution. If Collect is set
214  // candidates are collected and added to work list.
215  unsigned computeStallCycles(unsigned SrcReg,
216                              unsigned Reg = AMDGPU::NoRegister,
217                              int Bank = -1, bool Collect = false);
218
219  // Search for a register in Bank unused within LI.
220  // Returns phys reg or NoRegister.
221  unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
222
223  // Try to reassign candidate. Returns number or stall cycles saved.
224  unsigned tryReassign(Candidate &C);
225
226  bool verifyCycles(MachineFunction &MF,
227                    unsigned OriginalCycles, unsigned CyclesSaved);
228
229
230#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
231public:
232  Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
233    return Printable([Reg, SubReg, this](raw_ostream &OS) {
234      if (Register::isPhysicalRegister(Reg)) {
235        OS << llvm::printReg(Reg, TRI);
236        return;
237      }
238      if (!VRM->isAssignedReg(Reg))
239        OS << "<unassigned> " << llvm::printReg(Reg, TRI);
240      else
241        OS << llvm::printReg(Reg, TRI) << '('
242           << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
243      if (SubReg)
244        OS << ':' << TRI->getSubRegIndexName(SubReg);
245    });
246  }
247
248  static Printable printBank(unsigned Bank) {
249    return Printable([Bank](raw_ostream &OS) {
250      OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
251    });
252  }
253
254  static void dumpFreeBanks(unsigned FreeBanks) {
255    for (unsigned L = 0; L < NUM_BANKS; ++L)
256      if (FreeBanks & (1 << L))
257        dbgs() << printBank(L) << ' ';
258  }
259#endif
260};
261
262} // End anonymous namespace.
263
264INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
265                      false, false)
266INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
267INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
268INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
269INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
270INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
271                    false, false)
272
273
274char GCNRegBankReassign::ID = 0;
275
276char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
277
278unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
279  assert(Register::isPhysicalRegister(Reg));
280
281  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
282  unsigned Size = TRI->getRegSizeInBits(*RC);
283  if (Size > 32)
284    Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
285
286  if (TRI->hasVGPRs(RC)) {
287    Reg -= AMDGPU::VGPR0;
288    return Reg % NUM_VGPR_BANKS;
289  }
290
291  Reg = TRI->getEncodingValue(Reg) / 2;
292  return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
293}
294
295unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
296                                            int Bank) {
297  if (Register::isVirtualRegister(Reg)) {
298    if (!VRM->isAssignedReg(Reg))
299      return 0;
300
301    Reg = VRM->getPhys(Reg);
302    if (!Reg)
303      return 0;
304    if (SubReg)
305      Reg = TRI->getSubReg(Reg, SubReg);
306  }
307
308  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
309  unsigned Size = TRI->getRegSizeInBits(*RC) / 32;
310  if (Size > 1)
311    Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
312
313  if (TRI->hasVGPRs(RC)) {
314    // VGPRs have 4 banks assigned in a round-robin fashion.
315    Reg -= AMDGPU::VGPR0;
316    unsigned Mask = (1 << Size) - 1;
317    unsigned Used = 0;
318    // Bitmask lacks an extract method
319    for (unsigned I = 0; I < Size; ++I)
320      if (RegsUsed.test(Reg + I))
321        Used |= 1 << I;
322    RegsUsed.set(Reg, Reg + Size);
323    Mask &= ~Used;
324    Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank);
325    return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
326  }
327
328  // SGPRs have 8 banks holding 2 consequitive registers each.
329  Reg = TRI->getEncodingValue(Reg) / 2;
330  unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
331  if (Reg + StartBit >= RegsUsed.size())
332    return 0;
333
334  if (Size > 1)
335    Size /= 2;
336  unsigned Mask = (1 << Size) - 1;
337  unsigned Used = 0;
338  for (unsigned I = 0; I < Size; ++I)
339    if (RegsUsed.test(StartBit + Reg + I))
340      Used |= 1 << I;
341  RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
342  Mask &= ~Used;
343  Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS
344                        : unsigned(Bank - SGPR_BANK_OFFSET);
345  Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
346  // Reserve 4 bank ids for VGPRs.
347  return Mask << SGPR_BANK_OFFSET;
348}
349
350unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
351                                         unsigned& UsedBanks,
352                                         unsigned Reg,
353                                         int Bank) {
354  unsigned StallCycles = 0;
355  UsedBanks = 0;
356
357  if (MI.isDebugValue())
358    return 0;
359
360  RegsUsed.reset();
361  OperandMasks.clear();
362  for (const auto& Op : MI.explicit_uses()) {
363    // Undef can be assigned to any register, so two vregs can be assigned
364    // the same phys reg within the same instruction.
365    if (!Op.isReg() || Op.isUndef())
366      continue;
367
368    Register R = Op.getReg();
369    if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
370      continue;
371
372    unsigned ShiftedBank = Bank;
373
374    if (Bank != -1 && R == Reg && Op.getSubReg()) {
375      unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
376      if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) {
377        // If a register spans all banks we cannot shift it to avoid conflict.
378        if (countPopulation(LM) >= NUM_VGPR_BANKS)
379          continue;
380        ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS;
381      } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) {
382        // If a register spans all banks we cannot shift it to avoid conflict.
383        if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS)
384          continue;
385        ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET +
386                                          (countTrailingZeros(LM) >> 1)) %
387                                             NUM_SGPR_BANKS;
388      }
389    }
390
391    unsigned Mask = getRegBankMask(R, Op.getSubReg(),
392                                   (Reg == R) ? ShiftedBank : -1);
393    StallCycles += countPopulation(UsedBanks & Mask);
394    UsedBanks |= Mask;
395    OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
396  }
397
398  return StallCycles;
399}
400
401unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
402                                                    unsigned Reg1,
403                                                    unsigned Reg2,
404                                                    unsigned StallCycles) const
405{
406  unsigned Defs = 0;
407  MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
408  MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
409  for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
410    if (MI.isDebugInstr())
411      continue;
412    --Def;
413    if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
414      continue;
415    if (Def->modifiesRegister(Reg1, TRI))
416      Defs |= 1;
417    if (Def->modifiesRegister(Reg2, TRI))
418      Defs |= 2;
419  }
420  return countPopulation(Defs);
421}
422
423bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
424  if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
425    return false;
426
427  const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
428
429  Register PhysReg = VRM->getPhys(Reg);
430
431  if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
432    return false;
433
434  for (auto U : MRI->use_nodbg_operands(Reg)) {
435    if (U.isImplicit())
436      return false;
437    const MachineInstr *UseInst = U.getParent();
438    if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
439      return false;
440  }
441
442  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
443  if (TRI->hasVGPRs(RC))
444    return true;
445
446  unsigned Size = TRI->getRegSizeInBits(*RC);
447  if (Size > 32)
448    PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
449
450  return AMDGPU::SGPR_32RegClass.contains(PhysReg);
451}
452
453unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
454                                          unsigned UsedBanks) const {
455  unsigned Size = countPopulation(Mask);
456  unsigned FreeBanks = 0;
457  unsigned Bank = findFirstSet(Mask);
458
459  UsedBanks &= ~Mask;
460
461  // Find free VGPR banks
462  if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
463    for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
464      if (Bank == I)
465        continue;
466      unsigned NewMask = ((1 << Size) - 1) << I;
467      NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
468      if (!(UsedBanks & NewMask))
469        FreeBanks |= 1 << I;
470    }
471    return FreeBanks;
472  }
473
474  // Find free SGPR banks
475  // SGPR tuples must be aligned, so step is size in banks it
476  // crosses.
477  Bank -= SGPR_BANK_OFFSET;
478  for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
479    if (Bank == I)
480      continue;
481    unsigned NewMask = ((1 << Size) - 1) << I;
482    NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
483    if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
484      FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
485  }
486
487  return FreeBanks;
488}
489
490unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
491                                          unsigned SubReg,
492                                          unsigned Mask,
493                                          unsigned UsedBanks) const {
494  if (!isReassignable(Reg))
495    return 0;
496
497  unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
498
499  unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger();
500  if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) {
501    unsigned Shift = countTrailingZeros(LM);
502    if (Shift >= NUM_VGPR_BANKS)
503      return 0;
504    unsigned VB = FreeBanks & VGPR_BANK_MASK;
505    FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
506                VGPR_BANK_MASK;
507  } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) {
508    unsigned Shift = countTrailingZeros(LM) >> 1;
509    if (Shift >= NUM_SGPR_BANKS)
510      return 0;
511    unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
512    FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
513                SGPR_BANK_SHIFTED_MASK;
514    FreeBanks <<= SGPR_BANK_OFFSET;
515  }
516
517  LLVM_DEBUG(if (FreeBanks) {
518          dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
519                 << " to banks: "; dumpFreeBanks(FreeBanks);
520          dbgs() << '\n'; });
521
522  return FreeBanks;
523}
524
525void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
526                                           unsigned UsedBanks,
527                                           unsigned StallCycles) {
528  LLVM_DEBUG(MI.dump());
529
530  if (!StallCycles)
531    return;
532
533  LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
534
535  for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
536    for (unsigned J = I + 1; J != E; ++J) {
537      if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
538        continue;
539
540      unsigned Reg1 = OperandMasks[I].Reg;
541      unsigned Reg2 = OperandMasks[J].Reg;
542      unsigned SubReg1 = OperandMasks[I].SubReg;
543      unsigned SubReg2 = OperandMasks[J].SubReg;
544      unsigned Mask1 = OperandMasks[I].Mask;
545      unsigned Mask2 = OperandMasks[J].Mask;
546      unsigned Size1 = countPopulation(Mask1);
547      unsigned Size2 = countPopulation(Mask2);
548
549      LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
550                      " and " << printReg(Reg2, SubReg2) << '\n');
551
552      unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
553      Weight += MLI->getLoopDepth(MI.getParent()) * 10;
554
555      LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
556
557      unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
558      unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
559      if (FreeBanks1)
560        Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
561                                    + ((Size2 > Size1) ? 1 : 0)));
562      if (FreeBanks2)
563        Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
564                                    + ((Size1 > Size2) ? 1 : 0)));
565    }
566  }
567}
568
569unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
570                                                unsigned Reg, int Bank,
571                                                bool Collect) {
572  unsigned TotalStallCycles = 0;
573  unsigned UsedBanks = 0;
574  SmallSet<const MachineInstr *, 16> Visited;
575
576  for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
577    if (MI.isBundle())
578      continue;
579    if (!Visited.insert(&MI).second)
580      continue;
581    unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank);
582    TotalStallCycles += StallCycles;
583    if (Collect)
584      collectCandidates(MI, UsedBanks, StallCycles);
585  }
586
587  return TotalStallCycles;
588}
589
590unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
591                                         unsigned Bank) const {
592  const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
593  unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
594                                                : MaxNumSGPRs;
595  unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
596                                                        : AMDGPU::SGPR0);
597
598  for (unsigned Reg : RC->getRegisters()) {
599    // Check occupancy limit.
600    if (TRI->isSubRegisterEq(Reg, MaxReg))
601      break;
602
603    if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
604      continue;
605
606    for (unsigned I = 0; CSRegs[I]; ++I)
607      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
608          !LRM->isPhysRegUsed(CSRegs[I]))
609        return AMDGPU::NoRegister;
610
611    LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
612
613    if (!LRM->checkInterference(LI, Reg))
614      return Reg;
615  }
616
617  return AMDGPU::NoRegister;
618}
619
620unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
621  if (!LIS->hasInterval(C.Reg))
622    return 0;
623
624  LiveInterval &LI = LIS->getInterval(C.Reg);
625  LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
626             LI.dump());
627
628  // For each candidate bank walk all instructions in the range of live
629  // interval and check if replacing the register with one belonging to
630  // the candidate bank reduces conflicts.
631
632  unsigned OrigStalls = computeStallCycles(C.Reg);
633  LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
634  if (!OrigStalls)
635    return 0;
636
637  struct BankStall {
638    BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
639    bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; }
640    unsigned Bank;
641    unsigned Stalls;
642  };
643  SmallVector<BankStall, 8> BankStalls;
644
645  for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
646    if (C.FreeBanks & (1 << Bank)) {
647      LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
648      unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
649      if (Stalls < OrigStalls) {
650        LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
651                     << Stalls << '\n');
652        BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
653      }
654    }
655  }
656  std::sort(BankStalls.begin(), BankStalls.end());
657
658  Register OrigReg = VRM->getPhys(C.Reg);
659  LRM->unassign(LI);
660  while (!BankStalls.empty()) {
661    BankStall BS = BankStalls.pop_back_val();
662    unsigned Reg = scavengeReg(LI, BS.Bank);
663    if (Reg == AMDGPU::NoRegister) {
664      LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
665                   << '\n');
666      continue;
667    }
668    LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
669                 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
670                 << " in bank " << printBank(BS.Bank) << '\n');
671
672    LRM->assign(LI, Reg);
673
674    LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
675
676    return OrigStalls - BS.Stalls;
677  }
678  LRM->assign(LI, OrigReg);
679
680  return 0;
681}
682
683unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
684                                               bool Collect) {
685  unsigned TotalStallCycles = 0;
686
687  for (MachineBasicBlock &MBB : MF) {
688
689    LLVM_DEBUG(if (Collect) {
690            if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
691            else dbgs() << MBB.getName(); dbgs() << ":\n";
692          });
693
694    for (MachineInstr &MI : MBB.instrs()) {
695      if (MI.isBundle())
696          continue; // we analyze the instructions inside the bundle individually
697
698      unsigned UsedBanks = 0;
699      unsigned StallCycles = analyzeInst(MI, UsedBanks);
700
701      if (Collect)
702        collectCandidates(MI, UsedBanks, StallCycles);
703
704      TotalStallCycles += StallCycles;
705    }
706
707    LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
708  }
709
710  return TotalStallCycles;
711}
712
713void GCNRegBankReassign::removeCandidates(unsigned Reg) {
714  Candidates.remove_if([Reg, this](const Candidate& C) {
715    return C.MI->readsRegister(Reg, TRI);
716  });
717}
718
719bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
720                                      unsigned OriginalCycles,
721                                      unsigned CyclesSaved) {
722  unsigned StallCycles = collectCandidates(MF, false);
723  LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
724               << " stall cycles left\n");
725  return StallCycles + CyclesSaved == OriginalCycles;
726}
727
728bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
729  ST = &MF.getSubtarget<GCNSubtarget>();
730  if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
731    return false;
732
733  MRI = &MF.getRegInfo();
734  TRI = ST->getRegisterInfo();
735  MLI = &getAnalysis<MachineLoopInfo>();
736  VRM = &getAnalysis<VirtRegMap>();
737  LRM = &getAnalysis<LiveRegMatrix>();
738  LIS = &getAnalysis<LiveIntervals>();
739
740  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
741  unsigned Occupancy = MFI->getOccupancy();
742  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
743  MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
744  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
745  MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
746
747  CSRegs = MRI->getCalleeSavedRegs();
748
749  RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
750                  TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
751
752  LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
753               << '\n');
754
755  unsigned StallCycles = collectCandidates(MF);
756  NumStallsDetected += StallCycles;
757
758  LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
759                  "function " << MF.getName() << '\n');
760
761  Candidates.sort();
762
763  LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
764        for (auto C : Candidates) C.dump(this);
765        dbgs() << "\n\n");
766
767  unsigned CyclesSaved = 0;
768  while (!Candidates.empty()) {
769    Candidate C = Candidates.back();
770    unsigned LocalCyclesSaved = tryReassign(C);
771    CyclesSaved += LocalCyclesSaved;
772
773    if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
774      report_fatal_error("RegBank reassign stall cycles verification failed.");
775
776    Candidates.pop_back();
777    if (LocalCyclesSaved) {
778      removeCandidates(C.Reg);
779      computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
780      Candidates.sort();
781
782      LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
783            for (auto C : Candidates)
784              C.dump(this);
785            dbgs() << "\n\n");
786    }
787  }
788  NumStallsRecovered += CyclesSaved;
789
790  LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
791               << " cycles saved in function " << MF.getName() << '\n');
792
793  Candidates.clear();
794
795  if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
796    report_fatal_error("RegBank reassign stall cycles verification failed.");
797
798  RegsUsed.clear();
799
800  return CyclesSaved > 0;
801}
802