GCNNSAReassign.cpp revision 360784
//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. The SIShrinkInstructions pass will later replace
/// NSA forms with sequential versions where possible.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-nsa-reassign"

STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");

namespace {

class GCNNSAReassign : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassign() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addRequired<VirtRegMap>();
    AU.addRequired<LiveRegMatrix>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  typedef enum {
    NOT_NSA,        // Not an NSA instruction
    FIXED,          // NSA which we cannot modify
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers
  } NSA_Status;

  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  unsigned MaxNumVGPRs;

  const MCPhysReg *CSRegs;

  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)


char GCNNSAReassign::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;

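// Attempt to assign all of Intervals to consecutive VGPRs starting at
// StartReg. Current assignments are released first so they do not count as
// interference; on failure the intervals are left unassigned and the caller
// must restore the original allocation.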
bool
GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                                   unsigned StartReg) const {
  unsigned NumRegs = Intervals.size();

  for (unsigned N = 0; N < NumRegs; ++N)
    if (VRM->hasPhys(Intervals[N]->reg))
      LRM->unassign(*Intervals[N]);

  for (unsigned N = 0; N < NumRegs; ++N)
    if (LRM->checkInterference(*Intervals[N], StartReg + N))
      return false;

  for (unsigned N = 0; N < NumRegs; ++N)
    LRM->assign(*Intervals[N], StartReg + N);

  return true;
}

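// Check that each VGPR in [StartReg, StartReg + NumRegs) is allocatable and
// does not alias an unused callee-saved register, since clobbering an unused
// CSR would force a new save/restore pair in the prologue/epilogue.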
bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
  for (unsigned N = 0; N < NumRegs; ++N) {
    unsigned Reg = StartReg + N;
    if (!MRI->isAllocatable(Reg))
      return false;

    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return false;
  }

  return true;
}

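// Scan the VGPR file for the first contiguous range of NumRegs registers
// that the intervals can be reassigned to.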
bool
GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
  unsigned NumRegs = Intervals.size();

  if (NumRegs > MaxNumVGPRs)
    return false;
  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;

  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
    if (!canAssign(Reg, NumRegs))
      continue;

    if (tryAssignRegisters(Intervals, Reg))
      return true;
  }

  return false;
}

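// Classify MI: not an NSA encoding, an NSA instruction we must not touch, or
// an NSA instruction whose address registers are contiguous/non-contiguous.
// With Fast set, only the assigned physical registers are inspected.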
GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return NSA_Status::NOT_NSA;

  int VAddr0Idx =
    AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // Bail if an address operand is not a VGPR_32. It should be possible
      // to extend the optimization to work with subregs of wider register
      // tuples, but the logic to find free registers would be much more
      // complicated, with much lower chances of success. It seems reasonable
      // to assume that in most cases a tuple is used because a vector
      // variable contains different parts of an address which is either
      // already consecutive or cannot be reassigned if it is not. If needed,
      // it is better to rely on the register coalescer to process such
      // address tuples.
      if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

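      // A def or use that is a copy from/to the assigned physical register
      // suggests the value was coalesced with it deliberately; reassigning
      // it would only reintroduce the copy.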
      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}

bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (ST->getGeneration() < GCNSubtarget::GFX10)
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

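  // Cap the number of usable VGPRs by the function's target occupancy so
  // that reassignment does not increase register pressure beyond it.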
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

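  // Collect all NSA instructions. The bool records whether the address
  // registers are already contiguous.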
  using Candidate = std::pair<const MachineInstr*, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

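  // Try to rewrite each non-contiguous candidate. Contiguous candidates are
  // kept in the list so we can detect when a reassignment breaks one of them.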
  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // The address registers already happen to be contiguous, likely thanks
      // to an earlier reassignment.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<unsigned, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::find(Intervals, LI) != Intervals.end()) {
        // The same register is used more than once; the addresses cannot be
        // made sequential.
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
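      // Track the union of the address live ranges; candidates inside this
      // window may be affected by the reassignment.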
      MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
        continue;
    } else {
      // Check that we did not make it worse for other instructions.
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end(); Success && I != E &&
              LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

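    // On failure, roll back by restoring every interval to its original
    // physical register.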
    if (!Success) {
      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        if (VRM->hasPhys(Intervals[I]->reg))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
                 << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
                 << " : "
                 << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
                 << "]\n");
    Changed = true;
  }

  return Changed;
}