GCNRegBankReassign.cpp revision 360784
1//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9/// \file 10/// \brief Try to reassign registers on GFX10+ to reduce register bank 11/// conflicts. 12/// 13/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in 14/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to 15/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1, 16/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. 17/// 18/// The shader can read one dword from each of these banks once per cycle. 19/// If an instruction has to read more register operands from the same bank 20/// an additional cycle is needed. HW attempts to pre-load registers through 21/// input operand gathering, but a stall cycle may occur if that fails. For 22/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, 23/// potentially incuring 2 stall cycles. 24/// 25/// The pass tries to reassign registers to reduce bank conflicts. 26/// 27/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so 28/// that 4 has to be subtracted from an SGPR bank number to get the real value. 29/// This also corresponds to bit numbers in bank masks used in the pass. 30/// 31//===----------------------------------------------------------------------===// 32 33#include "AMDGPU.h" 34#include "AMDGPUSubtarget.h" 35#include "MCTargetDesc/AMDGPUMCTargetDesc.h" 36#include "SIInstrInfo.h" 37#include "SIMachineFunctionInfo.h" 38#include "llvm/ADT/SmallSet.h" 39#include "llvm/ADT/Statistic.h" 40#include "llvm/CodeGen/LiveInterval.h" 41#include "llvm/CodeGen/LiveIntervals.h" 42#include "llvm/CodeGen/LiveRegMatrix.h" 43#include "llvm/CodeGen/MachineFunctionPass.h" 44#include "llvm/CodeGen/MachineLoopInfo.h" 45#include "llvm/CodeGen/VirtRegMap.h" 46#include "llvm/InitializePasses.h" 47#include "llvm/Support/MathExtras.h" 48 49using namespace llvm; 50 51static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign", 52 cl::desc("Verify stall cycles in the regbanks reassign pass"), 53 cl::value_desc("0|1|2"), 54 cl::init(0), cl::Hidden); 55 56#define DEBUG_TYPE "amdgpu-regbanks-reassign" 57 58#define NUM_VGPR_BANKS 4 59#define NUM_SGPR_BANKS 8 60#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) 61#define SGPR_BANK_OFFSET NUM_VGPR_BANKS 62#define VGPR_BANK_MASK 0xf 63#define SGPR_BANK_MASK 0xff0 64#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) 65 66STATISTIC(NumStallsDetected, 67 "Number of operand read stalls detected"); 68STATISTIC(NumStallsRecovered, 69 "Number of operand read stalls recovered"); 70 71namespace { 72 73class GCNRegBankReassign : public MachineFunctionPass { 74 75 class OperandMask { 76 public: 77 OperandMask(unsigned r, unsigned s, unsigned m) 78 : Reg(r), SubReg(s), Mask(m) {} 79 unsigned Reg; 80 unsigned SubReg; 81 unsigned Mask; 82 }; 83 84 class Candidate { 85 public: 86 Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks, 87 unsigned weight) 88 : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {} 89 90 bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } 91 92#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 93 void dump(const GCNRegBankReassign *P) const { 94 MI->dump(); 95 dbgs() << P->printReg(Reg) << " to banks "; 96 dumpFreeBanks(FreeBanks); 97 dbgs() << " weight " << Weight << '\n'; 98 } 99#endif 100 101 MachineInstr *MI; 102 unsigned Reg; 103 unsigned FreeBanks; 104 unsigned Weight; 105 }; 106 107 class CandidateList : public std::list<Candidate> { 108 public: 109 // Speedup subsequent sort. 110 void push(const Candidate&& C) { 111 if (C.Weight) push_back(C); 112 else push_front(C); 113 } 114 }; 115 116public: 117 static char ID; 118 119public: 120 GCNRegBankReassign() : MachineFunctionPass(ID) { 121 initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); 122 } 123 124 bool runOnMachineFunction(MachineFunction &MF) override; 125 126 StringRef getPassName() const override { return "GCN RegBank Reassign"; } 127 128 void getAnalysisUsage(AnalysisUsage &AU) const override { 129 AU.addRequired<MachineLoopInfo>(); 130 AU.addRequired<LiveIntervals>(); 131 AU.addRequired<VirtRegMap>(); 132 AU.addRequired<LiveRegMatrix>(); 133 AU.setPreservesAll(); 134 MachineFunctionPass::getAnalysisUsage(AU); 135 } 136 137private: 138 const GCNSubtarget *ST; 139 140 const MachineRegisterInfo *MRI; 141 142 const SIRegisterInfo *TRI; 143 144 MachineLoopInfo *MLI; 145 146 VirtRegMap *VRM; 147 148 LiveRegMatrix *LRM; 149 150 LiveIntervals *LIS; 151 152 unsigned MaxNumVGPRs; 153 154 unsigned MaxNumSGPRs; 155 156 BitVector RegsUsed; 157 158 SmallVector<OperandMask, 8> OperandMasks; 159 160 CandidateList Candidates; 161 162 const MCPhysReg *CSRegs; 163 164 // Returns bank for a phys reg. 165 unsigned getPhysRegBank(unsigned Reg) const; 166 167 // Return a bit set for each register bank used. 4 banks for VGPRs and 168 // 8 banks for SGPRs. 169 // Registers already processed and recorded in RegsUsed are excluded. 170 // If Bank is not -1 assume Reg:SubReg to belong to that Bank. 171 unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank); 172 173 // Return number of stalls in the instructions. 174 // UsedBanks has bits set for the banks used by all operands. 175 // If Reg and Bank provided substitute the Reg with the Bank. 176 unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks, 177 unsigned Reg = AMDGPU::NoRegister, int Bank = -1); 178 179 // Return true if register is regular VGPR or SGPR or their tuples. 180 // Returns false for special registers like m0, vcc etc. 181 bool isReassignable(unsigned Reg) const; 182 183 // Check if registers' defs are old and may be pre-loaded. 184 // Returns 0 if both registers are old enough, 1 or 2 if one or both 185 // registers will not likely be pre-loaded. 186 unsigned getOperandGatherWeight(const MachineInstr& MI, 187 unsigned Reg1, 188 unsigned Reg2, 189 unsigned StallCycles) const; 190 191 192 // Find all bank bits in UsedBanks where Mask can be relocated to. 193 unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; 194 195 // Find all bank bits in UsedBanks where Mask can be relocated to. 196 // Bank is relative to the register and not its subregister component. 197 // Returns 0 is a register is not reassignable. 198 unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask, 199 unsigned UsedBanks) const; 200 201 // Add cadidate instruction to the work list. 202 void collectCandidates(MachineInstr& MI, unsigned UsedBanks, 203 unsigned StallCycles); 204 205 // Collect cadidate instructions across function. Returns a number stall 206 // cycles detected. Only counts stalls if Collect is false. 207 unsigned collectCandidates(MachineFunction &MF, bool Collect = true); 208 209 // Remove all candidates that read specified register. 210 void removeCandidates(unsigned Reg); 211 212 // Compute stalls within the uses of SrcReg replaced by a register from 213 // Bank. If Bank is -1 does not perform substitution. If Collect is set 214 // candidates are collected and added to work list. 215 unsigned computeStallCycles(unsigned SrcReg, 216 unsigned Reg = AMDGPU::NoRegister, 217 int Bank = -1, bool Collect = false); 218 219 // Search for a register in Bank unused within LI. 220 // Returns phys reg or NoRegister. 221 unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const; 222 223 // Try to reassign candidate. Returns number or stall cycles saved. 224 unsigned tryReassign(Candidate &C); 225 226 bool verifyCycles(MachineFunction &MF, 227 unsigned OriginalCycles, unsigned CyclesSaved); 228 229 230#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 231public: 232 Printable printReg(unsigned Reg, unsigned SubReg = 0) const { 233 return Printable([Reg, SubReg, this](raw_ostream &OS) { 234 if (Register::isPhysicalRegister(Reg)) { 235 OS << llvm::printReg(Reg, TRI); 236 return; 237 } 238 if (!VRM->isAssignedReg(Reg)) 239 OS << "<unassigned> " << llvm::printReg(Reg, TRI); 240 else 241 OS << llvm::printReg(Reg, TRI) << '(' 242 << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; 243 if (SubReg) 244 OS << ':' << TRI->getSubRegIndexName(SubReg); 245 }); 246 } 247 248 static Printable printBank(unsigned Bank) { 249 return Printable([Bank](raw_ostream &OS) { 250 OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank); 251 }); 252 } 253 254 static void dumpFreeBanks(unsigned FreeBanks) { 255 for (unsigned L = 0; L < NUM_BANKS; ++L) 256 if (FreeBanks & (1 << L)) 257 dbgs() << printBank(L) << ' '; 258 } 259#endif 260}; 261 262} // End anonymous namespace. 263 264INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", 265 false, false) 266INITIALIZE_PASS_DEPENDENCY(LiveIntervals) 267INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) 268INITIALIZE_PASS_DEPENDENCY(VirtRegMap) 269INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) 270INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", 271 false, false) 272 273 274char GCNRegBankReassign::ID = 0; 275 276char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; 277 278unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { 279 assert(Register::isPhysicalRegister(Reg)); 280 281 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); 282 unsigned Size = TRI->getRegSizeInBits(*RC); 283 if (Size > 32) 284 Reg = TRI->getSubReg(Reg, AMDGPU::sub0); 285 286 if (TRI->hasVGPRs(RC)) { 287 Reg -= AMDGPU::VGPR0; 288 return Reg % NUM_VGPR_BANKS; 289 } 290 291 Reg = TRI->getEncodingValue(Reg) / 2; 292 return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; 293} 294 295unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, 296 int Bank) { 297 if (Register::isVirtualRegister(Reg)) { 298 if (!VRM->isAssignedReg(Reg)) 299 return 0; 300 301 Reg = VRM->getPhys(Reg); 302 if (!Reg) 303 return 0; 304 if (SubReg) 305 Reg = TRI->getSubReg(Reg, SubReg); 306 } 307 308 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); 309 unsigned Size = TRI->getRegSizeInBits(*RC) / 32; 310 if (Size > 1) 311 Reg = TRI->getSubReg(Reg, AMDGPU::sub0); 312 313 if (TRI->hasVGPRs(RC)) { 314 // VGPRs have 4 banks assigned in a round-robin fashion. 315 Reg -= AMDGPU::VGPR0; 316 unsigned Mask = (1 << Size) - 1; 317 unsigned Used = 0; 318 // Bitmask lacks an extract method 319 for (unsigned I = 0; I < Size; ++I) 320 if (RegsUsed.test(Reg + I)) 321 Used |= 1 << I; 322 RegsUsed.set(Reg, Reg + Size); 323 Mask &= ~Used; 324 Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank); 325 return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; 326 } 327 328 // SGPRs have 8 banks holding 2 consequitive registers each. 329 Reg = TRI->getEncodingValue(Reg) / 2; 330 unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); 331 if (Reg + StartBit >= RegsUsed.size()) 332 return 0; 333 334 if (Size > 1) 335 Size /= 2; 336 unsigned Mask = (1 << Size) - 1; 337 unsigned Used = 0; 338 for (unsigned I = 0; I < Size; ++I) 339 if (RegsUsed.test(StartBit + Reg + I)) 340 Used |= 1 << I; 341 RegsUsed.set(StartBit + Reg, StartBit + Reg + Size); 342 Mask &= ~Used; 343 Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS 344 : unsigned(Bank - SGPR_BANK_OFFSET); 345 Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; 346 // Reserve 4 bank ids for VGPRs. 347 return Mask << SGPR_BANK_OFFSET; 348} 349 350unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, 351 unsigned& UsedBanks, 352 unsigned Reg, 353 int Bank) { 354 unsigned StallCycles = 0; 355 UsedBanks = 0; 356 357 if (MI.isDebugValue()) 358 return 0; 359 360 RegsUsed.reset(); 361 OperandMasks.clear(); 362 for (const auto& Op : MI.explicit_uses()) { 363 // Undef can be assigned to any register, so two vregs can be assigned 364 // the same phys reg within the same instruction. 365 if (!Op.isReg() || Op.isUndef()) 366 continue; 367 368 Register R = Op.getReg(); 369 if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R))) 370 continue; 371 372 unsigned ShiftedBank = Bank; 373 374 if (Bank != -1 && R == Reg && Op.getSubReg()) { 375 unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger(); 376 if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) { 377 // If a register spans all banks we cannot shift it to avoid conflict. 378 if (countPopulation(LM) >= NUM_VGPR_BANKS) 379 continue; 380 ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS; 381 } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) { 382 // If a register spans all banks we cannot shift it to avoid conflict. 383 if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS) 384 continue; 385 ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET + 386 (countTrailingZeros(LM) >> 1)) % 387 NUM_SGPR_BANKS; 388 } 389 } 390 391 unsigned Mask = getRegBankMask(R, Op.getSubReg(), 392 (Reg == R) ? ShiftedBank : -1); 393 StallCycles += countPopulation(UsedBanks & Mask); 394 UsedBanks |= Mask; 395 OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); 396 } 397 398 return StallCycles; 399} 400 401unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, 402 unsigned Reg1, 403 unsigned Reg2, 404 unsigned StallCycles) const 405{ 406 unsigned Defs = 0; 407 MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); 408 MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); 409 for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { 410 if (MI.isDebugInstr()) 411 continue; 412 --Def; 413 if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) 414 continue; 415 if (Def->modifiesRegister(Reg1, TRI)) 416 Defs |= 1; 417 if (Def->modifiesRegister(Reg2, TRI)) 418 Defs |= 2; 419 } 420 return countPopulation(Defs); 421} 422 423bool GCNRegBankReassign::isReassignable(unsigned Reg) const { 424 if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) 425 return false; 426 427 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); 428 429 Register PhysReg = VRM->getPhys(Reg); 430 431 if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) 432 return false; 433 434 for (auto U : MRI->use_nodbg_operands(Reg)) { 435 if (U.isImplicit()) 436 return false; 437 const MachineInstr *UseInst = U.getParent(); 438 if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) 439 return false; 440 } 441 442 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); 443 if (TRI->hasVGPRs(RC)) 444 return true; 445 446 unsigned Size = TRI->getRegSizeInBits(*RC); 447 if (Size > 32) 448 PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); 449 450 return AMDGPU::SGPR_32RegClass.contains(PhysReg); 451} 452 453unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, 454 unsigned UsedBanks) const { 455 unsigned Size = countPopulation(Mask); 456 unsigned FreeBanks = 0; 457 unsigned Bank = findFirstSet(Mask); 458 459 UsedBanks &= ~Mask; 460 461 // Find free VGPR banks 462 if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { 463 for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { 464 if (Bank == I) 465 continue; 466 unsigned NewMask = ((1 << Size) - 1) << I; 467 NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; 468 if (!(UsedBanks & NewMask)) 469 FreeBanks |= 1 << I; 470 } 471 return FreeBanks; 472 } 473 474 // Find free SGPR banks 475 // SGPR tuples must be aligned, so step is size in banks it 476 // crosses. 477 Bank -= SGPR_BANK_OFFSET; 478 for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { 479 if (Bank == I) 480 continue; 481 unsigned NewMask = ((1 << Size) - 1) << I; 482 NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; 483 if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) 484 FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; 485 } 486 487 return FreeBanks; 488} 489 490unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg, 491 unsigned SubReg, 492 unsigned Mask, 493 unsigned UsedBanks) const { 494 if (!isReassignable(Reg)) 495 return 0; 496 497 unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); 498 499 unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger(); 500 if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) { 501 unsigned Shift = countTrailingZeros(LM); 502 if (Shift >= NUM_VGPR_BANKS) 503 return 0; 504 unsigned VB = FreeBanks & VGPR_BANK_MASK; 505 FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & 506 VGPR_BANK_MASK; 507 } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) { 508 unsigned Shift = countTrailingZeros(LM) >> 1; 509 if (Shift >= NUM_SGPR_BANKS) 510 return 0; 511 unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; 512 FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & 513 SGPR_BANK_SHIFTED_MASK; 514 FreeBanks <<= SGPR_BANK_OFFSET; 515 } 516 517 LLVM_DEBUG(if (FreeBanks) { 518 dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) 519 << " to banks: "; dumpFreeBanks(FreeBanks); 520 dbgs() << '\n'; }); 521 522 return FreeBanks; 523} 524 525void GCNRegBankReassign::collectCandidates(MachineInstr& MI, 526 unsigned UsedBanks, 527 unsigned StallCycles) { 528 LLVM_DEBUG(MI.dump()); 529 530 if (!StallCycles) 531 return; 532 533 LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); 534 535 for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { 536 for (unsigned J = I + 1; J != E; ++J) { 537 if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) 538 continue; 539 540 unsigned Reg1 = OperandMasks[I].Reg; 541 unsigned Reg2 = OperandMasks[J].Reg; 542 unsigned SubReg1 = OperandMasks[I].SubReg; 543 unsigned SubReg2 = OperandMasks[J].SubReg; 544 unsigned Mask1 = OperandMasks[I].Mask; 545 unsigned Mask2 = OperandMasks[J].Mask; 546 unsigned Size1 = countPopulation(Mask1); 547 unsigned Size2 = countPopulation(Mask2); 548 549 LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << 550 " and " << printReg(Reg2, SubReg2) << '\n'); 551 552 unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); 553 Weight += MLI->getLoopDepth(MI.getParent()) * 10; 554 555 LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); 556 557 unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); 558 unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); 559 if (FreeBanks1) 560 Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight 561 + ((Size2 > Size1) ? 1 : 0))); 562 if (FreeBanks2) 563 Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight 564 + ((Size1 > Size2) ? 1 : 0))); 565 } 566 } 567} 568 569unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, 570 unsigned Reg, int Bank, 571 bool Collect) { 572 unsigned TotalStallCycles = 0; 573 unsigned UsedBanks = 0; 574 SmallSet<const MachineInstr *, 16> Visited; 575 576 for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { 577 if (MI.isBundle()) 578 continue; 579 if (!Visited.insert(&MI).second) 580 continue; 581 unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank); 582 TotalStallCycles += StallCycles; 583 if (Collect) 584 collectCandidates(MI, UsedBanks, StallCycles); 585 } 586 587 return TotalStallCycles; 588} 589 590unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, 591 unsigned Bank) const { 592 const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); 593 unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs 594 : MaxNumSGPRs; 595 unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 596 : AMDGPU::SGPR0); 597 598 for (unsigned Reg : RC->getRegisters()) { 599 // Check occupancy limit. 600 if (TRI->isSubRegisterEq(Reg, MaxReg)) 601 break; 602 603 if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) 604 continue; 605 606 for (unsigned I = 0; CSRegs[I]; ++I) 607 if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && 608 !LRM->isPhysRegUsed(CSRegs[I])) 609 return AMDGPU::NoRegister; 610 611 LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); 612 613 if (!LRM->checkInterference(LI, Reg)) 614 return Reg; 615 } 616 617 return AMDGPU::NoRegister; 618} 619 620unsigned GCNRegBankReassign::tryReassign(Candidate &C) { 621 if (!LIS->hasInterval(C.Reg)) 622 return 0; 623 624 LiveInterval &LI = LIS->getInterval(C.Reg); 625 LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); 626 LI.dump()); 627 628 // For each candidate bank walk all instructions in the range of live 629 // interval and check if replacing the register with one belonging to 630 // the candidate bank reduces conflicts. 631 632 unsigned OrigStalls = computeStallCycles(C.Reg); 633 LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); 634 if (!OrigStalls) 635 return 0; 636 637 struct BankStall { 638 BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; 639 bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; } 640 unsigned Bank; 641 unsigned Stalls; 642 }; 643 SmallVector<BankStall, 8> BankStalls; 644 645 for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { 646 if (C.FreeBanks & (1 << Bank)) { 647 LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); 648 unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); 649 if (Stalls < OrigStalls) { 650 LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " 651 << Stalls << '\n'); 652 BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); 653 } 654 } 655 } 656 std::sort(BankStalls.begin(), BankStalls.end()); 657 658 Register OrigReg = VRM->getPhys(C.Reg); 659 LRM->unassign(LI); 660 while (!BankStalls.empty()) { 661 BankStall BS = BankStalls.pop_back_val(); 662 unsigned Reg = scavengeReg(LI, BS.Bank); 663 if (Reg == AMDGPU::NoRegister) { 664 LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) 665 << '\n'); 666 continue; 667 } 668 LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) 669 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)") 670 << " in bank " << printBank(BS.Bank) << '\n'); 671 672 LRM->assign(LI, Reg); 673 674 LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); 675 676 return OrigStalls - BS.Stalls; 677 } 678 LRM->assign(LI, OrigReg); 679 680 return 0; 681} 682 683unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, 684 bool Collect) { 685 unsigned TotalStallCycles = 0; 686 687 for (MachineBasicBlock &MBB : MF) { 688 689 LLVM_DEBUG(if (Collect) { 690 if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); 691 else dbgs() << MBB.getName(); dbgs() << ":\n"; 692 }); 693 694 for (MachineInstr &MI : MBB.instrs()) { 695 if (MI.isBundle()) 696 continue; // we analyze the instructions inside the bundle individually 697 698 unsigned UsedBanks = 0; 699 unsigned StallCycles = analyzeInst(MI, UsedBanks); 700 701 if (Collect) 702 collectCandidates(MI, UsedBanks, StallCycles); 703 704 TotalStallCycles += StallCycles; 705 } 706 707 LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); 708 } 709 710 return TotalStallCycles; 711} 712 713void GCNRegBankReassign::removeCandidates(unsigned Reg) { 714 Candidates.remove_if([Reg, this](const Candidate& C) { 715 return C.MI->readsRegister(Reg, TRI); 716 }); 717} 718 719bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, 720 unsigned OriginalCycles, 721 unsigned CyclesSaved) { 722 unsigned StallCycles = collectCandidates(MF, false); 723 LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles 724 << " stall cycles left\n"); 725 return StallCycles + CyclesSaved == OriginalCycles; 726} 727 728bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { 729 ST = &MF.getSubtarget<GCNSubtarget>(); 730 if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) 731 return false; 732 733 MRI = &MF.getRegInfo(); 734 TRI = ST->getRegisterInfo(); 735 MLI = &getAnalysis<MachineLoopInfo>(); 736 VRM = &getAnalysis<VirtRegMap>(); 737 LRM = &getAnalysis<LiveRegMatrix>(); 738 LIS = &getAnalysis<LiveIntervals>(); 739 740 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 741 unsigned Occupancy = MFI->getOccupancy(); 742 MaxNumVGPRs = ST->getMaxNumVGPRs(MF); 743 MaxNumSGPRs = ST->getMaxNumSGPRs(MF); 744 MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); 745 MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); 746 747 CSRegs = MRI->getCalleeSavedRegs(); 748 749 RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() + 750 TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1); 751 752 LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName() 753 << '\n'); 754 755 unsigned StallCycles = collectCandidates(MF); 756 NumStallsDetected += StallCycles; 757 758 LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " 759 "function " << MF.getName() << '\n'); 760 761 Candidates.sort(); 762 763 LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; 764 for (auto C : Candidates) C.dump(this); 765 dbgs() << "\n\n"); 766 767 unsigned CyclesSaved = 0; 768 while (!Candidates.empty()) { 769 Candidate C = Candidates.back(); 770 unsigned LocalCyclesSaved = tryReassign(C); 771 CyclesSaved += LocalCyclesSaved; 772 773 if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) 774 report_fatal_error("RegBank reassign stall cycles verification failed."); 775 776 Candidates.pop_back(); 777 if (LocalCyclesSaved) { 778 removeCandidates(C.Reg); 779 computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); 780 Candidates.sort(); 781 782 LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; 783 for (auto C : Candidates) 784 C.dump(this); 785 dbgs() << "\n\n"); 786 } 787 } 788 NumStallsRecovered += CyclesSaved; 789 790 LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved 791 << " cycles saved in function " << MF.getName() << '\n'); 792 793 Candidates.clear(); 794 795 if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) 796 report_fatal_error("RegBank reassign stall cycles verification failed."); 797 798 RegsUsed.clear(); 799 800 return CyclesSaved > 0; 801} 802