Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

201342Snyan//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
201342Snyan//
201342Snyan// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
201342Snyan// See https://llvm.org/LICENSE.txt for license information.
201342Snyan// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
201342Snyan//
201342Snyan//===----------------------------------------------------------------------===//
201342Snyan//
201342Snyan// This pass eliminates local data store, LDS, uses from non-kernel functions.
201342Snyan// LDS is contiguous memory allocated per kernel execution.
201342Snyan//
201342Snyan// Background.
201342Snyan//
201342Snyan// The programming model is global variables, or equivalently function local
201342Snyan// static variables, accessible from kernels or other functions. For uses from
201342Snyan// kernels this is straightforward - assign an integer to the kernel for the
201342Snyan// memory required by all the variables combined, allocate them within that.
201342Snyan// For uses from functions there are performance tradeoffs to choose between.
201342Snyan//
201342Snyan// This model means the GPU runtime can specify the amount of memory allocated.
201342Snyan// If this is more than the kernel assumed, the excess can be made available
201342Snyan// using a language specific feature, which IR represents as a variable with
201342Snyan// no initializer. This feature is referred to here as "Dynamic LDS" and is
201342Snyan// lowered slightly differently to the normal case.
201342Snyan//
201342Snyan// Consequences of this GPU feature:
201342Snyan// - memory is limited and exceeding it halts compilation
201342Snyan// - a global accessed by one kernel exists independent of other kernels
201342Snyan// - a global exists independent of simultaneous execution of the same kernel
201342Snyan// - the address of the global may be different from different kernels as they
201342Snyan//   do not alias, which permits only allocating variables they use
201342Snyan// - if the address is allowed to differ, functions need help to find it
201342Snyan//
201342Snyan// Uses from kernels are implemented here by grouping them in a per-kernel
201342Snyan// struct instance. This duplicates the variables, accurately modelling their
201342Snyan// aliasing properties relative to a single global representation. It also
201342Snyan// permits control over alignment via padding.
201342Snyan//
201342Snyan// Uses from functions are more complicated and the primary purpose of this
// IR pass. Several different lowerings are chosen between to meet requirements
201342Snyan// to avoid allocating any LDS where it is not necessary, as that impacts
201342Snyan// occupancy and may fail the compilation, while not imposing overhead on a
201342Snyan// feature whose primary advantage over global memory is performance. The basic
201342Snyan// design goal is to avoid one kernel imposing overhead on another.
201342Snyan//
201342Snyan// Implementation.
201342Snyan//
201342Snyan// LDS variables with constant annotation or non-undef initializer are passed
201342Snyan// through unchanged for simplification or error diagnostics in later passes.
201342Snyan// Non-undef initializers are not yet implemented for LDS.
201342Snyan//
201342Snyan// LDS variables that are always allocated at the same address can be found
201342Snyan// by lookup at that address. Otherwise runtime information/cost is required.
201342Snyan//
201342Snyan// The simplest strategy possible is to group all LDS variables in a single
201342Snyan// struct and allocate that struct in every kernel such that the original
201342Snyan// variables are always at the same address. LDS is however a limited resource
201342Snyan// so this strategy is unusable in practice. It is not implemented here.
201342Snyan//
201342Snyan// Strategy | Precise allocation | Zero runtime cost | General purpose |
201342Snyan//  --------+--------------------+-------------------+-----------------+
201342Snyan//   Module |                 No |               Yes |             Yes |
201342Snyan//    Table |                Yes |                No |             Yes |
201342Snyan//   Kernel |                Yes |               Yes |              No |
201342Snyan//   Hybrid |                Yes |           Partial |             Yes |
201342Snyan//
201342Snyan// "Module" spends LDS memory to save cycles. "Table" spends cycles and global
201342Snyan// memory to save LDS. "Kernel" is as fast as kernel allocation but only works
201342Snyan// for variables that are known reachable from a single kernel. "Hybrid" picks
201342Snyan// between all three. When forced to choose between LDS and cycles we minimise
201342Snyan// LDS use.
201342Snyan
201342Snyan// The "module" lowering implemented here finds LDS variables which are used by
201342Snyan// non-kernel functions and creates a new struct with a field for each of those
201342Snyan// LDS variables. Variables that are only used from kernels are excluded.
201342Snyan//
201342Snyan// The "table" lowering implemented here has three components.
201342Snyan// First kernels are assigned a unique integer identifier which is available in
226506Sdes// functions it calls through the intrinsic amdgcn_lds_kernel_id. The integer
226506Sdes// is passed through a specific SGPR, thus works with indirect calls.
201342Snyan// Second, each kernel allocates LDS variables independent of other kernels and
201342Snyan// writes the addresses it chose for each variable into an array in consistent
// order. If the kernel does not allocate a given variable, it writes poison to
201342Snyan// the corresponding array location. These arrays are written to a constant
201342Snyan// table in the order matching the kernel unique integer identifier.
201342Snyan// Third, uses from non-kernel functions are replaced with a table lookup using
201342Snyan// the intrinsic function to find the address of the variable.
201342Snyan//
201342Snyan// "Kernel" lowering is only applicable for variables that are unambiguously
201342Snyan// reachable from exactly one kernel. For those cases, accesses to the variable
201342Snyan// can be lowered to ConstantExpr address of a struct instance specific to that
201342Snyan// one kernel. This is zero cost in space and in compute. It will raise a fatal
201342Snyan// error on any variable that might be reachable from multiple kernels and is
201342Snyan// thus most easily used as part of the hybrid lowering strategy.
201342Snyan//
201342Snyan// Hybrid lowering is a mixture of the above. It uses the zero cost kernel
201342Snyan// lowering where it can. It lowers the variable accessed by the greatest
201342Snyan// number of kernels using the module strategy as that is free for the first
// variable. Any further variables that can be lowered with the module strategy
201342Snyan// without incurring LDS memory overhead are. The remaining ones are lowered
201342Snyan// via table.
201342Snyan//
201342Snyan// Consequences
201342Snyan// - No heuristics or user controlled magic numbers, hybrid is the right choice
201342Snyan// - Kernels that don't use functions (or have had them all inlined) are not
201342Snyan//   affected by any lowering for kernels that do.
201342Snyan// - Kernels that don't make indirect function calls are not affected by those
201342Snyan//   that do.
201342Snyan// - Variables which are used by lots of kernels, e.g. those injected by a
201342Snyan//   language runtime in most kernels, are expected to have no overhead
201342Snyan// - Implementations that instantiate templates per-kernel where those templates
201342Snyan//   use LDS are expected to hit the "Kernel" lowering strategy
201342Snyan// - The runtime properties impose a cost in compiler implementation complexity
201342Snyan//
201342Snyan// Dynamic LDS implementation
201342Snyan// Dynamic LDS is lowered similarly to the "table" strategy above and uses the
201342Snyan// same intrinsic to identify which kernel is at the root of the dynamic call
201342Snyan// graph. This relies on the specified behaviour that all dynamic LDS variables
201342Snyan// alias one another, i.e. are at the same address, with respect to a given
201342Snyan// kernel. Therefore this pass creates new dynamic LDS variables for each kernel
201342Snyan// that allocates any dynamic LDS and builds a table of addresses out of those.
201342Snyan// The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS.
201342Snyan// The corresponding optimisation for "kernel" lowering where the table lookup
201342Snyan// is elided is not implemented.
201342Snyan//
201342Snyan//
201342Snyan// Implementation notes / limitations
201342Snyan// A single LDS global variable represents an instance per kernel that can reach
239063Snyan// said variables. This pass essentially specialises said variables per kernel.
239063Snyan// Handling ConstantExpr during the pass complicated this significantly so now
201342Snyan// all ConstantExpr uses of LDS variables are expanded to instructions. This
201342Snyan// may need amending when implementing non-undef initialisers.
232784Snyan//
239063Snyan// Lowering is split between this IR pass and the back end. This pass chooses
239063Snyan// where given variables should be allocated and marks them with metadata,
201342Snyan// MD_absolute_symbol. The backend places the variables in coincidentally the
201342Snyan// same location and raises a fatal error if something has gone awry. This works
219960Snyan// in practice because the only pass between this one and the backend that
201342Snyan// changes LDS is PromoteAlloca and the changes it makes do not conflict.
201342Snyan//
201342Snyan// Addresses are written to constant global arrays based on the same metadata.
201342Snyan//
201342Snyan// The backend lowers LDS variables in the order of traversal of the function.
201342Snyan// This is at odds with the deterministic layout required. The workaround is to
201342Snyan// allocate the fixed-address variables immediately upon starting the function
201342Snyan// where they can be placed as intended. This requires a means of mapping from
201342Snyan// the function to the variables that it allocates. For the module scope lds,
201342Snyan// this is via metadata indicating whether the variable is not required. If a
201342Snyan// pass deletes that metadata, a fatal error on disagreement with the absolute
220685Snyan// symbol metadata will occur. For kernel scope and dynamic, this is by _name_
201342Snyan// correspondence between the function and the variable. It requires the
201342Snyan// kernel to have a name (which is only a limitation for tests in practice) and
201342Snyan// for nothing to rename the corresponding symbols. This is a hazard if the pass
201342Snyan// is run multiple times during debugging. Alternative schemes considered all
201342Snyan// involve bespoke metadata.
201342Snyan//
201342Snyan// If the name correspondence can be replaced, multiple distinct kernels that
201342Snyan// have the same memory layout can map to the same kernel id (as the address
201342Snyan// itself is handled by the absolute symbol metadata) and that will allow more
201342Snyan// uses of the "kernel" style faster lowering and reduce the size of the lookup
201342Snyan// tables.
201342Snyan//
201342Snyan// There is a test that checks this does not fire for a graphics shader. This
201342Snyan// lowering is expected to work for graphics if the isKernel test is changed.
201342Snyan//
201342Snyan// The current markUsedByKernel is sufficient for PromoteAlloca but is elided
201342Snyan// before codegen. Replacing this with an equivalent intrinsic which lasts until
201342Snyan// shortly after the machine function lowering of LDS would help break the name
201342Snyan// mapping. The other part needed is probably to amend PromoteAlloca to embed
201342Snyan// the LDS variables it creates in the same struct created here. That avoids the
201342Snyan// current hazard where a PromoteAlloca LDS variable might be allocated before
201342Snyan// the kernel scope (and thus error on the address check). Given a new invariant
201342Snyan// that no LDS variables exist outside of the structs managed here, and an
235988Sgleb// intrinsic that lasts until after the LDS frame lowering, it should be
201342Snyan// possible to drop the name mapping and fold equivalent memory layouts.
201342Snyan//
201342Snyan//===----------------------------------------------------------------------===//
201342Snyan
201342Snyan#include "AMDGPU.h"
201342Snyan#include "AMDGPUTargetMachine.h"
201342Snyan#include "Utils/AMDGPUBaseInfo.h"
201342Snyan#include "Utils/AMDGPUMemoryUtils.h"
201342Snyan#include "llvm/ADT/BitVector.h"
201342Snyan#include "llvm/ADT/DenseMap.h"
201342Snyan#include "llvm/ADT/DenseSet.h"
201342Snyan#include "llvm/ADT/STLExtras.h"
201342Snyan#include "llvm/ADT/SetOperations.h"
201342Snyan#include "llvm/Analysis/CallGraph.h"
201342Snyan#include "llvm/CodeGen/TargetPassConfig.h"
201342Snyan#include "llvm/IR/Constants.h"
201342Snyan#include "llvm/IR/DerivedTypes.h"
201342Snyan#include "llvm/IR/IRBuilder.h"
201342Snyan#include "llvm/IR/InlineAsm.h"
201342Snyan#include "llvm/IR/Instructions.h"
201342Snyan#include "llvm/IR/IntrinsicsAMDGPU.h"
201342Snyan#include "llvm/IR/MDBuilder.h"
201342Snyan#include "llvm/IR/ReplaceConstant.h"
201342Snyan#include "llvm/InitializePasses.h"
201342Snyan#include "llvm/Pass.h"
201342Snyan#include "llvm/Support/CommandLine.h"
201342Snyan#include "llvm/Support/Debug.h"
201342Snyan#include "llvm/Support/Format.h"
201342Snyan#include "llvm/Support/OptimizedStructLayout.h"
201342Snyan#include "llvm/Support/raw_ostream.h"
201342Snyan#include "llvm/Transforms/Utils/BasicBlockUtils.h"
201342Snyan#include "llvm/Transforms/Utils/ModuleUtils.h"
201342Snyan
201342Snyan#include <vector>
201342Snyan
201342Snyan#include <cstdio>
201342Snyan
201342Snyan#define DEBUG_TYPE "amdgpu-lower-module-lds"
201342Snyan
201342Snyanusing namespace llvm;
201342Snyan
201342Snyannamespace {
201342Snyan
201342Snyancl::opt<bool> SuperAlignLDSGlobals(
201342Snyan    "amdgpu-super-align-lds-globals",
201342Snyan    cl::desc("Increase alignment of LDS if it is not on align boundary"),
201342Snyan    cl::init(true), cl::Hidden);
201342Snyan
201342Snyanenum class LoweringKind { module, table, kernel, hybrid };
201342Snyancl::opt<LoweringKind> LoweringKindLoc(
201342Snyan    "amdgpu-lower-module-lds-strategy",
201342Snyan    cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
201342Snyan    cl::init(LoweringKind::hybrid),
201342Snyan    cl::values(
201342Snyan        clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
201342Snyan        clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
201342Snyan        clEnumValN(
201342Snyan            LoweringKind::kernel, "kernel",
201342Snyan            "Lower variables reachable from one kernel, otherwise abort"),
201342Snyan        clEnumValN(LoweringKind::hybrid, "hybrid",
201342Snyan                   "Lower via mixture of above strategies")));
201342Snyan
201342Snyanbool isKernelLDS(const Function *F) {
201342Snyan  // Some weirdness here. AMDGPU::isKernelCC does not call into
201342Snyan  // AMDGPU::isKernel with the calling conv, it instead calls into
201342Snyan  // isModuleEntryFunction which returns true for more calling conventions
201342Snyan  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
201342Snyan  // There's also a test that checks that the LDS lowering does not hit on
201342Snyan  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
201342Snyan  // Putting LDS in the name of the function to draw attention to this.
201342Snyan  return AMDGPU::isKernel(F->getCallingConv());
201342Snyan}
201342Snyan
201342Snyantemplate <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
201342Snyan  llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
201342Snyan    return L->getName() < R->getName();
201342Snyan  });
201342Snyan  return {std::move(V)};
201342Snyan}
201342Snyan
201342Snyanclass AMDGPULowerModuleLDS {
201342Snyan  const AMDGPUTargetMachine &TM;
201342Snyan
201342Snyan  static void
201342Snyan  removeLocalVarsFromUsedLists(Module &M,
201342Snyan                               const DenseSet<GlobalVariable *> &LocalVars) {
201342Snyan    // The verifier rejects used lists containing an inttoptr of a constant
201342Snyan    // so remove the variables from these lists before replaceAllUsesWith
201342Snyan    SmallPtrSet<Constant *, 8> LocalVarsSet;
201342Snyan    for (GlobalVariable *LocalVar : LocalVars)
201342Snyan      LocalVarsSet.insert(cast<Constant>(LocalVar->stripPointerCasts()));
201342Snyan
201342Snyan    removeFromUsedLists(
201342Snyan        M, [&LocalVarsSet](Constant *C) { return LocalVarsSet.count(C); });
201342Snyan
201342Snyan    for (GlobalVariable *LocalVar : LocalVars)
201342Snyan      LocalVar->removeDeadConstantUsers();
201342Snyan  }
201342Snyan
201342Snyan  static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
201342Snyan    // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
201342Snyan    // that might call a function which accesses a field within it. This is
201342Snyan    // presently approximated to 'all kernels' if there are any such functions
201342Snyan    // in the module. This implicit use is redefined as an explicit use here so
201342Snyan    // that later passes, specifically PromoteAlloca, account for the required
201342Snyan    // memory without any knowledge of this transform.
201342Snyan
201342Snyan    // An operand bundle on llvm.donothing works because the call instruction
201342Snyan    // survives until after the last pass that needs to account for LDS. It is
201342Snyan    // better than inline asm as the latter survives until the end of codegen. A
201342Snyan    // totally robust solution would be a function with the same semantics as
201342Snyan    // llvm.donothing that takes a pointer to the instance and is lowered to a
201342Snyan    // no-op after LDS is allocated, but that is not presently necessary.
201342Snyan
201342Snyan    // This intrinsic is eliminated shortly before instruction selection. It
201342Snyan    // does not suffice to indicate to ISel that a given global which is not
201342Snyan    // immediately used by the kernel must still be allocated by it. An
201342Snyan    // equivalent target specific intrinsic which lasts until immediately after
201342Snyan    // codegen would suffice for that, but one would still need to ensure that
201342Snyan    // the variables are allocated in the anticpated order.
201342Snyan    BasicBlock *Entry = &Func->getEntryBlock();
201342Snyan    IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
201342Snyan
201342Snyan    Function *Decl =
201342Snyan        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
201342Snyan
201342Snyan    Value *UseInstance[1] = {
201342Snyan        Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};
201342Snyan
201342Snyan    Builder.CreateCall(
201342Snyan        Decl, {}, {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
201342Snyan  }
201342Snyan
201342Snyan  static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
201342Snyan    // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
201342Snyan    // global may have uses from multiple different functions as a result.
201342Snyan    // This pass specialises LDS variables with respect to the kernel that
201342Snyan    // allocates them.
201342Snyan
201342Snyan    // This is semantically equivalent to (the unimplemented as slow):
201342Snyan    // for (auto &F : M.functions())
201342Snyan    //   for (auto &BB : F)
201342Snyan    //     for (auto &I : BB)
201342Snyan    //       for (Use &Op : I.operands())
201342Snyan    //         if (constantExprUsesLDS(Op))
201342Snyan    //           replaceConstantExprInFunction(I, Op);
201342Snyan
201342Snyan    SmallVector<Constant *> LDSGlobals;
201342Snyan    for (auto &GV : M.globals())
201342Snyan      if (AMDGPU::isLDSVariableToLower(GV))
201342Snyan        LDSGlobals.push_back(&GV);
201342Snyan
201342Snyan    return convertUsersOfConstantsToInstructions(LDSGlobals);
201342Snyan  }
201342Snyan
201342Snyanpublic:
201342Snyan  AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
201342Snyan
254015Smarcel  using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
201342Snyan
254015Smarcel  using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
254015Smarcel
201342Snyan  static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
201342Snyan                                     FunctionVariableMap &kernels,
201342Snyan                                     FunctionVariableMap &functions) {
201342Snyan
201342Snyan    // Get uses from the current function, excluding uses by called functions
201342Snyan    // Two output variables to avoid walking the globals list twice
201342Snyan    for (auto &GV : M.globals()) {
201342Snyan      if (!AMDGPU::isLDSVariableToLower(GV)) {
201342Snyan        continue;
201342Snyan      }
201342Snyan
201342Snyan      if (GV.isAbsoluteSymbolRef()) {
201342Snyan        report_fatal_error(
201342Snyan            "LDS variables with absolute addresses are unimplemented.");
201342Snyan      }
218737Snyan
235988Sgleb      for (User *V : GV.users()) {
232784Snyan        if (auto *I = dyn_cast<Instruction>(V)) {
201342Snyan          Function *F = I->getFunction();
201342Snyan          if (isKernelLDS(F)) {
201342Snyan            kernels[F].insert(&GV);
201342Snyan          } else {
201342Snyan            functions[F].insert(&GV);
201342Snyan          }
201342Snyan        }
201342Snyan      }
201342Snyan    }
201342Snyan  }
201342Snyan
201342Snyan  struct LDSUsesInfoTy {
201342Snyan    FunctionVariableMap direct_access;
201342Snyan    FunctionVariableMap indirect_access;
201342Snyan  };
201342Snyan
201342Snyan  static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
201342Snyan
201342Snyan    FunctionVariableMap direct_map_kernel;
201342Snyan    FunctionVariableMap direct_map_function;
201342Snyan    getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);
201342Snyan
201342Snyan    // Collect variables that are used by functions whose address has escaped
201342Snyan    DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
201342Snyan    for (Function &F : M.functions()) {
226506Sdes      if (!isKernelLDS(&F))
232784Snyan        if (F.hasAddressTaken(nullptr,
232784Snyan                              /* IgnoreCallbackUses */ false,
232784Snyan                              /* IgnoreAssumeLikeCalls */ false,
232784Snyan                              /* IgnoreLLVMUsed */ true,
201342Snyan                              /* IgnoreArcAttachedCall */ false)) {
201342Snyan          set_union(VariablesReachableThroughFunctionPointer,
201342Snyan                    direct_map_function[&F]);
201342Snyan        }
201342Snyan    }
201342Snyan
201342Snyan    auto functionMakesUnknownCall = [&](const Function *F) -> bool {
201342Snyan      assert(!F->isDeclaration());
201342Snyan      for (const CallGraphNode::CallRecord &R : *CG[F]) {
201342Snyan        if (!R.second->getFunction()) {
201342Snyan          return true;
201342Snyan        }
201342Snyan      }
201342Snyan      return false;
201342Snyan    };
201342Snyan
232784Snyan    // Work out which variables are reachable through function calls
219225Snyan    FunctionVariableMap transitive_map_function = direct_map_function;
232784Snyan
201342Snyan    // If the function makes any unknown call, assume the worst case that it can
219225Snyan    // access all variables accessed by functions whose address escaped
201342Snyan    for (Function &F : M.functions()) {
201342Snyan      if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
201342Snyan        if (!isKernelLDS(&F)) {
201342Snyan          set_union(transitive_map_function[&F],
201342Snyan                    VariablesReachableThroughFunctionPointer);
201342Snyan        }
201342Snyan      }
201342Snyan    }
201342Snyan
201342Snyan    // Direct implementation of collecting all variables reachable from each
201342Snyan    // function
201342Snyan    for (Function &Func : M.functions()) {
201342Snyan      if (Func.isDeclaration() || isKernelLDS(&Func))
201342Snyan        continue;
219225Snyan
201342Snyan      DenseSet<Function *> seen; // catches cycles
201342Snyan      SmallVector<Function *, 4> wip{&Func};
201342Snyan
201342Snyan      while (!wip.empty()) {
201342Snyan        Function *F = wip.pop_back_val();
201342Snyan
201342Snyan        // Can accelerate this by referring to transitive map for functions that
201342Snyan        // have already been computed, with more care than this
201342Snyan        set_union(transitive_map_function[&Func], direct_map_function[F]);
201342Snyan
201342Snyan        for (const CallGraphNode::CallRecord &R : *CG[F]) {
201342Snyan          Function *ith = R.second->getFunction();
201342Snyan          if (ith) {
201342Snyan            if (!seen.contains(ith)) {
201342Snyan              seen.insert(ith);
201342Snyan              wip.push_back(ith);
201342Snyan            }
201342Snyan          }
201342Snyan        }
201342Snyan      }
201342Snyan    }
201342Snyan
201342Snyan    // direct_map_kernel lists which variables are used by the kernel
201342Snyan    // find the variables which are used through a function call
201342Snyan    FunctionVariableMap indirect_map_kernel;
201342Snyan
201342Snyan    for (Function &Func : M.functions()) {
235988Sgleb      if (Func.isDeclaration() || !isKernelLDS(&Func))
219960Snyan        continue;
218737Snyan
201342Snyan      for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
201342Snyan        Function *ith = R.second->getFunction();
201342Snyan        if (ith) {
201342Snyan          set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
201342Snyan        } else {
201342Snyan          set_union(indirect_map_kernel[&Func],
201342Snyan                    VariablesReachableThroughFunctionPointer);
201342Snyan        }
219960Snyan      }
219960Snyan    }
201342Snyan
201342Snyan    return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
201342Snyan  }
201342Snyan
201342Snyan  struct LDSVariableReplacement {
201342Snyan    GlobalVariable *SGV = nullptr;
201342Snyan    DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
201342Snyan  };
219960Snyan
201342Snyan  // remap from lds global to a constantexpr gep to where it has been moved to
201342Snyan  // for each kernel
201342Snyan  // an array with an element for each kernel containing where the corresponding
201342Snyan  // variable was remapped to
201342Snyan
201342Snyan  static Constant *getAddressesOfVariablesInKernel(
201342Snyan      LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables,
201342Snyan      const DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
201342Snyan    // Create a ConstantArray containing the address of each Variable within the
201342Snyan    // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel
201342Snyan    // does not allocate it
201342Snyan    // TODO: Drop the ptrtoint conversion
201342Snyan
201342Snyan    Type *I32 = Type::getInt32Ty(Ctx);
201342Snyan
201342Snyan    ArrayType *KernelOffsetsType = ArrayType::get(I32, Variables.size());
201342Snyan
201342Snyan    SmallVector<Constant *> Elements;
201342Snyan    for (size_t i = 0; i < Variables.size(); i++) {
201342Snyan      GlobalVariable *GV = Variables[i];
201342Snyan      auto ConstantGepIt = LDSVarsToConstantGEP.find(GV);
214257Snyan      if (ConstantGepIt != LDSVarsToConstantGEP.end()) {
201342Snyan        auto elt = ConstantExpr::getPtrToInt(ConstantGepIt->second, I32);
201342Snyan        Elements.push_back(elt);
201342Snyan      } else {
201342Snyan        Elements.push_back(PoisonValue::get(I32));
201342Snyan      }
201342Snyan    }
201342Snyan    return ConstantArray::get(KernelOffsetsType, Elements);
201342Snyan  }
218842Snyan
219960Snyan  static GlobalVariable *buildLookupTable(
219960Snyan      Module &M, ArrayRef<GlobalVariable *> Variables,
219960Snyan      ArrayRef<Function *> kernels,
201342Snyan      DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) {
219960Snyan    if (Variables.empty()) {
201342Snyan      return nullptr;
201342Snyan    }
201342Snyan    LLVMContext &Ctx = M.getContext();
201342Snyan
201342Snyan    const size_t NumberVariables = Variables.size();
201342Snyan    const size_t NumberKernels = kernels.size();
201342Snyan
201342Snyan    ArrayType *KernelOffsetsType =
201342Snyan        ArrayType::get(Type::getInt32Ty(Ctx), NumberVariables);
201342Snyan
201342Snyan    ArrayType *AllKernelsOffsetsType =
201342Snyan        ArrayType::get(KernelOffsetsType, NumberKernels);
201342Snyan
201342Snyan    Constant *Missing = PoisonValue::get(KernelOffsetsType);
201342Snyan    std::vector<Constant *> overallConstantExprElts(NumberKernels);
201342Snyan    for (size_t i = 0; i < NumberKernels; i++) {
201342Snyan      auto Replacement = KernelToReplacement.find(kernels[i]);
201342Snyan      overallConstantExprElts[i] =
201342Snyan          (Replacement == KernelToReplacement.end())
201342Snyan              ? Missing
201342Snyan              : getAddressesOfVariablesInKernel(
201342Snyan                    Ctx, Variables, Replacement->second.LDSVarsToConstantGEP);
201342Snyan    }
201342Snyan
201342Snyan    Constant *init =
201342Snyan        ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
201342Snyan
201342Snyan    return new GlobalVariable(
201342Snyan        M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
201342Snyan        "llvm.amdgcn.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
201342Snyan        AMDGPUAS::CONSTANT_ADDRESS);
201342Snyan  }
201342Snyan
201342Snyan  void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder,
201342Snyan                                 GlobalVariable *LookupTable,
201342Snyan                                 GlobalVariable *GV, Use &U,
201342Snyan                                 Value *OptionalIndex) {
201342Snyan    // Table is a constant array of the same length as OrderedKernels
201342Snyan    LLVMContext &Ctx = M.getContext();
201342Snyan    Type *I32 = Type::getInt32Ty(Ctx);
201342Snyan    auto *I = cast<Instruction>(U.getUser());
201342Snyan
201342Snyan    Value *tableKernelIndex = getTableLookupKernelIndex(M, I->getFunction());
201342Snyan
201342Snyan    if (auto *Phi = dyn_cast<PHINode>(I)) {
201342Snyan      BasicBlock *BB = Phi->getIncomingBlock(U);
201342Snyan      Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
201342Snyan    } else {
201342Snyan      Builder.SetInsertPoint(I);
201342Snyan    }
201342Snyan
242863Snyan    SmallVector<Value *, 3> GEPIdx = {
242863Snyan        ConstantInt::get(I32, 0),
242863Snyan        tableKernelIndex,
242863Snyan    };
201342Snyan    if (OptionalIndex)
201342Snyan      GEPIdx.push_back(OptionalIndex);
201342Snyan
201342Snyan    Value *Address = Builder.CreateInBoundsGEP(
201342Snyan        LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
201342Snyan
201342Snyan    Value *loaded = Builder.CreateLoad(I32, Address);
201342Snyan
201342Snyan    Value *replacement =
201342Snyan        Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
201342Snyan
201342Snyan    U.set(replacement);
201342Snyan  }
201342Snyan
201342Snyan  void replaceUsesInInstructionsWithTableLookup(
201342Snyan      Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
201342Snyan      GlobalVariable *LookupTable) {
201342Snyan
201342Snyan    LLVMContext &Ctx = M.getContext();
201342Snyan    IRBuilder<> Builder(Ctx);
201342Snyan    Type *I32 = Type::getInt32Ty(Ctx);
201342Snyan
201342Snyan    for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
201342Snyan      auto *GV = ModuleScopeVariables[Index];
201342Snyan
254015Smarcel      for (Use &U : make_early_inc_range(GV->uses())) {
201342Snyan        auto *I = dyn_cast<Instruction>(U.getUser());
201342Snyan        if (!I)
201342Snyan          continue;
201342Snyan
201342Snyan        replaceUseWithTableLookup(M, Builder, LookupTable, GV, U,
201342Snyan                                  ConstantInt::get(I32, Index));
201342Snyan      }
201342Snyan    }
201342Snyan  }
201342Snyan
201342Snyan  static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables(
201342Snyan      Module &M, LDSUsesInfoTy &LDSUsesInfo,
201342Snyan      DenseSet<GlobalVariable *> const &VariableSet) {
201342Snyan
201342Snyan    DenseSet<Function *> KernelSet;
232784Snyan
232784Snyan    if (VariableSet.empty())
232784Snyan      return KernelSet;
232784Snyan
232784Snyan    for (Function &Func : M.functions()) {
232784Snyan      if (Func.isDeclaration() || !isKernelLDS(&Func))
201342Snyan        continue;
201342Snyan      for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) {
201342Snyan        if (VariableSet.contains(GV)) {
201342Snyan          KernelSet.insert(&Func);
201342Snyan          break;
201342Snyan        }
201342Snyan      }
201342Snyan    }
201342Snyan
201342Snyan    return KernelSet;
201342Snyan  }
201342Snyan
239063Snyan  static GlobalVariable *
239063Snyan  chooseBestVariableForModuleStrategy(const DataLayout &DL,
201342Snyan                                      VariableFunctionMap &LDSVars) {
201342Snyan    // Find the global variable with the most indirect uses from kernels
201342Snyan
201342Snyan    struct CandidateTy {
201342Snyan      GlobalVariable *GV = nullptr;
201342Snyan      size_t UserCount = 0;
201342Snyan      size_t Size = 0;
254015Smarcel
201342Snyan      CandidateTy() = default;
254015Smarcel
201342Snyan      CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize)
201342Snyan          : GV(GV), UserCount(UserCount), Size(AllocSize) {}
254015Smarcel
201342Snyan      bool operator<(const CandidateTy &Other) const {
201342Snyan        // Fewer users makes module scope variable less attractive
201342Snyan        if (UserCount < Other.UserCount) {
201342Snyan          return true;
201342Snyan        }
201342Snyan        if (UserCount > Other.UserCount) {
201342Snyan          return false;
201342Snyan        }
201342Snyan
201342Snyan        // Bigger makes module scope variable less attractive
201342Snyan        if (Size < Other.Size) {
201342Snyan          return false;
201342Snyan        }
201342Snyan
201342Snyan        if (Size > Other.Size) {
201342Snyan          return true;
201342Snyan        }
201342Snyan
201342Snyan        // Arbitrary but consistent
201342Snyan        return GV->getName() < Other.GV->getName();
201342Snyan      }
201342Snyan    };
201342Snyan
201342Snyan    CandidateTy MostUsed;
201342Snyan
201342Snyan    for (auto &K : LDSVars) {
201342Snyan      GlobalVariable *GV = K.first;
201342Snyan      if (K.second.size() <= 1) {
201342Snyan        // A variable reachable by only one kernel is best lowered with kernel
201342Snyan        // strategy
201342Snyan        continue;
201342Snyan      }
201342Snyan      CandidateTy Candidate(
201342Snyan          GV, K.second.size(),
201342Snyan          DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
201342Snyan      if (MostUsed < Candidate)
201342Snyan        MostUsed = Candidate;
201342Snyan    }
201342Snyan
201342Snyan    return MostUsed.GV;
201342Snyan  }
201342Snyan
201342Snyan  static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
201342Snyan                                       uint32_t Address) {
201342Snyan    // Write the specified address into metadata where it can be retrieved by
219960Snyan    // the assembler. Format is a half open range, [Address Address+1)
201342Snyan    LLVMContext &Ctx = M->getContext();
201342Snyan    auto *IntTy =
201342Snyan        M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
201342Snyan    auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
201342Snyan    auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
201342Snyan    GV->setMetadata(LLVMContext::MD_absolute_symbol,
201342Snyan                    MDNode::get(Ctx, {MinC, MaxC}));
201342Snyan  }
201342Snyan
201342Snyan  DenseMap<Function *, Value *> tableKernelIndexCache;
201342Snyan  Value *getTableLookupKernelIndex(Module &M, Function *F) {
201342Snyan    // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
201342Snyan    // lowers to a read from a live in register. Emit it once in the entry
201342Snyan    // block to spare deduplicating it later.
201342Snyan    auto [It, Inserted] = tableKernelIndexCache.try_emplace(F);
201342Snyan    if (Inserted) {
201342Snyan      Function *Decl =
201342Snyan          Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
201342Snyan
201342Snyan      auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
201342Snyan      IRBuilder<> Builder(&*InsertAt);
201342Snyan
201342Snyan      It->second = Builder.CreateCall(Decl, {});
201342Snyan    }
201342Snyan
201342Snyan    return It->second;
201342Snyan  }
201342Snyan
201342Snyan  static std::vector<Function *> assignLDSKernelIDToEachKernel(
201342Snyan      Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS,
201342Snyan      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) {
201342Snyan    // Associate kernels in the set with an arbirary but reproducible order and
201342Snyan    // annotate them with that order in metadata. This metadata is recognised by
201342Snyan    // the backend and lowered to a SGPR which can be read from using
201342Snyan    // amdgcn_lds_kernel_id.
201342Snyan
201342Snyan    std::vector<Function *> OrderedKernels;
201342Snyan    if (!KernelsThatAllocateTableLDS.empty() ||
201342Snyan        !KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
201342Snyan
201342Snyan      for (Function &Func : M->functions()) {
201342Snyan        if (Func.isDeclaration())
201342Snyan          continue;
201342Snyan        if (!isKernelLDS(&Func))
201342Snyan          continue;
201342Snyan
201342Snyan        if (KernelsThatAllocateTableLDS.contains(&Func) ||
201342Snyan            KernelsThatIndirectlyAllocateDynamicLDS.contains(&Func)) {
201342Snyan          assert(Func.hasName()); // else fatal error earlier
201342Snyan          OrderedKernels.push_back(&Func);
201342Snyan        }
201342Snyan      }
201342Snyan
201342Snyan      // Put them in an arbitrary but reproducible order
201342Snyan      OrderedKernels = sortByName(std::move(OrderedKernels));
201342Snyan
201342Snyan      // Annotate the kernels with their order in this vector
201342Snyan      LLVMContext &Ctx = M->getContext();
201342Snyan      IRBuilder<> Builder(Ctx);
201342Snyan
201342Snyan      if (OrderedKernels.size() > UINT32_MAX) {
201342Snyan        // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
201342Snyan        report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
201342Snyan      }
201342Snyan
201342Snyan      for (size_t i = 0; i < OrderedKernels.size(); i++) {
201342Snyan        Metadata *AttrMDArgs[1] = {
201342Snyan            ConstantAsMetadata::get(Builder.getInt32(i)),
201342Snyan        };
201342Snyan        OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
201342Snyan                                       MDNode::get(Ctx, AttrMDArgs));
201342Snyan      }
201342Snyan    }
201342Snyan    return OrderedKernels;
201342Snyan  }
201342Snyan
201342Snyan  static void partitionVariablesIntoIndirectStrategies(
201342Snyan      Module &M, LDSUsesInfoTy const &LDSUsesInfo,
201342Snyan      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
201342Snyan      DenseSet<GlobalVariable *> &ModuleScopeVariables,
201342Snyan      DenseSet<GlobalVariable *> &TableLookupVariables,
201342Snyan      DenseSet<GlobalVariable *> &KernelAccessVariables,
201342Snyan      DenseSet<GlobalVariable *> &DynamicVariables) {
201342Snyan
201342Snyan    GlobalVariable *HybridModuleRoot =
201342Snyan        LoweringKindLoc != LoweringKind::hybrid
201342Snyan            ? nullptr
201342Snyan            : chooseBestVariableForModuleStrategy(
201342Snyan                  M.getDataLayout(), LDSToKernelsThatNeedToAccessItIndirectly);
201342Snyan
201342Snyan    DenseSet<Function *> const EmptySet;
201342Snyan    DenseSet<Function *> const &HybridModuleRootKernels =
201342Snyan        HybridModuleRoot
201342Snyan            ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
201342Snyan            : EmptySet;
201342Snyan
201342Snyan    for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
201342Snyan      // Each iteration of this loop assigns exactly one global variable to
201342Snyan      // exactly one of the implementation strategies.
201342Snyan
201342Snyan      GlobalVariable *GV = K.first;
201342Snyan      assert(AMDGPU::isLDSVariableToLower(*GV));
201342Snyan      assert(K.second.size() != 0);
201342Snyan
201342Snyan      if (AMDGPU::isDynamicLDS(*GV)) {
201342Snyan        DynamicVariables.insert(GV);
201342Snyan        continue;
201342Snyan      }
201342Snyan
201342Snyan      switch (LoweringKindLoc) {
220685Snyan      case LoweringKind::module:
220685Snyan        ModuleScopeVariables.insert(GV);
220685Snyan        break;
220685Snyan
220685Snyan      case LoweringKind::table:
220685Snyan        TableLookupVariables.insert(GV);
220685Snyan        break;
220685Snyan
220685Snyan      case LoweringKind::kernel:
220685Snyan        if (K.second.size() == 1) {
220685Snyan          KernelAccessVariables.insert(GV);
220685Snyan        } else {
201342Snyan          report_fatal_error(
201342Snyan              "cannot lower LDS '" + GV->getName() +
201342Snyan              "' to kernel access as it is reachable from multiple kernels");
201342Snyan        }
201342Snyan        break;
201342Snyan
201342Snyan      case LoweringKind::hybrid: {
201342Snyan        if (GV == HybridModuleRoot) {
201342Snyan          assert(K.second.size() != 1);
201342Snyan          ModuleScopeVariables.insert(GV);
201342Snyan        } else if (K.second.size() == 1) {
201342Snyan          KernelAccessVariables.insert(GV);
201342Snyan        } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
          ModuleScopeVariables.insert(GV);
        } else {
          TableLookupVariables.insert(GV);
        }
        break;
      }
      }
    }

    // All LDS variables accessed indirectly have now been partitioned into
    // the distinct lowering strategies.
    assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
               KernelAccessVariables.size() + DynamicVariables.size() ==
           LDSToKernelsThatNeedToAccessItIndirectly.size());
  }

  // Create the "llvm.amdgcn.module.lds" struct holding \p ModuleScopeVariables
  // and redirect uses of those variables to it. Returns the struct variable,
  // or nullptr when there are no module scope variables.
  //
  // All uses from non-kernel functions are replaced. Uses from kernels are
  // replaced only for kernels in \p KernelsThatAllocateModuleLDS. That is a
  // space optimisation - kernels that use a subset of the module scope struct
  // and do not need to allocate it for indirect calls will only allocate the
  // subset they use (they do so as part of the per-kernel lowering).
  static GlobalVariable *lowerModuleScopeStructVariables(
      Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables,
      DenseSet<Function *> const &KernelsThatAllocateModuleLDS) {
    if (ModuleScopeVariables.empty())
      return nullptr;

    LLVMContext &Ctx = M.getContext();

    LDSVariableReplacement ModuleStruct = createLDSVariableReplacement(
        M, "llvm.amdgcn.module.lds", ModuleScopeVariables);
    GlobalVariable *StructGV = ModuleStruct.SGV;

    // Register the struct in the compiler-used list.
    Constant *UsedEntry = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(StructGV), PointerType::getUnqual(Ctx));
    appendToCompilerUsed(M, {static_cast<GlobalValue *>(UsedEntry)});

    // module.lds will be allocated at zero in any kernel that allocates it
    recordLDSAbsoluteAddress(&M, StructGV, 0);

    // historic
    removeLocalVarsFromUsedLists(M, ModuleScopeVariables);

    // Replace all uses of module scope variable from non-kernel functions
    replaceLDSVariablesWithStruct(
        M, ModuleScopeVariables, ModuleStruct, [&](Use &U) {
          auto *UserInst = dyn_cast<Instruction>(U.getUser());
          return UserInst && !isKernelLDS(UserInst->getFunction());
        });

    // Replace uses of module scope variable from kernel functions that
    // allocate the module scope variable, otherwise leave them unchanged.
    // Record on each such kernel that the module scope global is used by it.
    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;
      if (!KernelsThatAllocateModuleLDS.contains(&Func))
        continue;

      replaceLDSVariablesWithStruct(
          M, ModuleScopeVariables, ModuleStruct, [&](Use &U) {
            auto *UserInst = dyn_cast<Instruction>(U.getUser());
            return UserInst && UserInst->getFunction() == &Func;
          });

      markUsedByKernel(&Func, StructGV);
    }

    return StructGV;
  }

  // Build one LDS struct per kernel for the variables that kernel needs which
  // are not served by the module scope struct, and rewrite the uses inside the
  // kernel to refer to the new struct.
  //
  // Returns a map from kernel to the replacement created for it. Kernels that
  // end up with no variables to allocate have no entry in the map.
  //
  // Note: MaybeModuleScopeStruct is not referenced by this function body.
  static DenseMap<Function *, LDSVariableReplacement>
  lowerKernelScopeStructVariables(
      Module &M, LDSUsesInfoTy &LDSUsesInfo,
      DenseSet<GlobalVariable *> const &ModuleScopeVariables,
      DenseSet<Function *> const &KernelsThatAllocateModuleLDS,
      GlobalVariable *MaybeModuleScopeStruct) {

    // Create a struct for each kernel for the non-module-scope variables.

    DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
    for (Function &Func : M.functions()) {
      // Only defined kernels get a per-kernel struct.
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;

      DenseSet<GlobalVariable *> KernelUsedVariables;
      // Allocating variables that are used directly in this struct to get
      // alignment aware allocation and predictable frame size.
      for (auto &v : LDSUsesInfo.direct_access[&Func]) {
        if (!AMDGPU::isDynamicLDS(*v)) {
          KernelUsedVariables.insert(v);
        }
      }

      // Allocating variables that are accessed indirectly so that a lookup of
      // this struct instance can find them from nested functions.
      for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
        if (!AMDGPU::isDynamicLDS(*v)) {
          KernelUsedVariables.insert(v);
        }
      }

      // Variables allocated in module lds must all resolve to that struct,
      // not to the per-kernel instance.
      if (KernelsThatAllocateModuleLDS.contains(&Func)) {
        for (GlobalVariable *v : ModuleScopeVariables) {
          KernelUsedVariables.erase(v);
        }
      }

      if (KernelUsedVariables.empty()) {
        // Either used no LDS, or the LDS it used was all in the module struct
        // or dynamically sized
        continue;
      }

      // The association between kernel function and LDS struct is done by
      // symbol name, which only works if the function in question has a
      // name. This is not expected to be a problem in practice as kernels
      // are called by name making anonymous ones (which are named by the
      // backend) difficult to use. This does mean that llvm test cases need
      // to name the kernels.
      if (!Func.hasName()) {
        report_fatal_error("Anonymous kernels cannot use LDS variables");
      }

      // The struct is named after the kernel it belongs to.
      std::string VarName =
          (Twine("llvm.amdgcn.kernel.") + Func.getName() + ".lds").str();

      auto Replacement =
          createLDSVariableReplacement(M, VarName, KernelUsedVariables);

      // If any indirect uses, create a direct use to ensure allocation
      // TODO: Simpler to unconditionally mark used but that regresses
      // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll
      auto Accesses = LDSUsesInfo.indirect_access.find(&Func);
      if ((Accesses != LDSUsesInfo.indirect_access.end()) &&
          !Accesses->second.empty())
        markUsedByKernel(&Func, Replacement.SGV);

      // remove preserves existing codegen
      removeLocalVarsFromUsedLists(M, KernelUsedVariables);
      KernelToReplacement[&Func] = Replacement;

      // Rewrite uses within kernel to the new struct. Only uses located in
      // instructions of this kernel are selected by the predicate.
      replaceLDSVariablesWithStruct(
          M, KernelUsedVariables, Replacement, [&Func](Use &U) {
            Instruction *I = dyn_cast<Instruction>(U.getUser());
            return I && I->getFunction() == &Func;
          });
    }
    return KernelToReplacement;
  }

  // Create the representative dynamic LDS variable for kernel \p func, named
  // "llvm.amdgcn.<kernel>.dynlds".
  //
  // Dynamic LDS is allocated after the static LDS allocation, possibly after
  // alignment padding. All dynamic LDS variables are allocated at the same
  // address in each kernel in order to provide the documented aliasing
  // semantics. The variable created here is a zero length i8 array whose
  // alignment is the maximum over every dynamic LDS variable recorded as
  // directly or indirectly accessed by the kernel. Setting the alignment here
  // allows this IR pass to accurately predict the exact constant at which it
  // will be allocated.
  static GlobalVariable *
  buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo,
                                        Function *func) {
    assert(isKernelLDS(func));

    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // Running maximum alignment of the dynamic variables seen so far.
    Align LargestAlign(1);
    auto FoldInDynamicAlignments = [&LargestAlign, &DL](const auto &Vars) {
      for (GlobalVariable *GV : Vars) {
        if (AMDGPU::isDynamicLDS(*GV))
          LargestAlign = std::max(LargestAlign, AMDGPU::getAlign(DL, GV));
      }
    };

    FoldInDynamicAlignments(LDSUsesInfo.indirect_access[func]);
    FoldInDynamicAlignments(LDSUsesInfo.direct_access[func]);

    assert(func->hasName()); // Checked by caller
    auto *ByteArrayTy = ArrayType::get(Type::getInt8Ty(Ctx), 0);
    GlobalVariable *DynLDS = new GlobalVariable(
        M, ByteArrayTy, false, GlobalValue::ExternalLinkage, nullptr,
        Twine("llvm.amdgcn." + func->getName() + ".dynlds"), nullptr,
        GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
    DynLDS->setAlignment(LargestAlign);

    assert(AMDGPU::isDynamicLDS(*DynLDS));
    return DynLDS;
  }

  /// Strip "amdgpu-no-lds-kernel-id" from any functions where we may have
  /// introduced its use. If AMDGPUAttributor ran prior to the pass, we inferred
  /// the lack of llvm.amdgcn.lds.kernel.id calls.
  ///
  /// The attribute is removed from \p KernelRoot and from every function
  /// reached by following call graph edges from it. The first time an edge
  /// with no known callee is met, the attribute is also removed from every
  /// non-kernel function listed under the call graph's external calling node.
  ///
  /// \param CG         Call graph of the module being lowered.
  /// \param KernelRoot Kernel from which the traversal starts.
  void removeNoLdsKernelIdFromReachable(CallGraph &CG, Function *KernelRoot) {
    KernelRoot->removeFnAttr("amdgpu-no-lds-kernel-id");

    SmallVector<Function *> Tmp({CG[KernelRoot]->getFunction()});
    if (!Tmp.back())
      return;

    SmallPtrSet<Function *, 8> Visited;
    bool SeenUnknownCall = false;

    do {
      Function *F = Tmp.pop_back_val();

      for (auto &CallRecord : *CG[F]) {
        if (!CallRecord.second)
          continue;

        Function *Callee = CallRecord.second->getFunction();
        if (!Callee) {
          if (!SeenUnknownCall) {
            SeenUnknownCall = true;

            // If we see any indirect calls, assume nothing about potential
            // targets.
            // TODO: This could be refined to possible LDS global users.
            for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
              Function *PotentialCallee =
                  ExternalCallRecord.second->getFunction();
              if (!isKernelLDS(PotentialCallee))
                PotentialCallee->removeFnAttr("amdgpu-no-lds-kernel-id");
            }
          }

          // There is no callee to update or descend into. This must be
          // skipped for every unknown call, not only the first one seen;
          // falling through would dereference the null Callee below.
          continue;
        }

        Callee->removeFnAttr("amdgpu-no-lds-kernel-id");
        if (Visited.insert(Callee).second)
          Tmp.push_back(Callee);
      }
    } while (!Tmp.empty());
  }

  // Lower dynamic LDS variables that are reached through function calls.
  //
  // For each kernel in \p KernelsThatIndirectlyAllocateDynamicLDS a
  // representative dynamic LDS variable is created and marked as used by that
  // kernel. Its address, as an i32, is stored in the constant table
  // "llvm.amdgcn.dynlds.offset.table", which has one entry per element of
  // \p OrderedKernels (poison for kernels that are not in the set). Uses of
  // \p DynamicVariables from instructions in non-kernel functions are then
  // replaced by a lookup into that table.
  //
  // Returns the map from kernel to the variable created for it; the map is
  // empty, and nothing is modified, when the kernel set is empty.
  DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
      Module &M, LDSUsesInfoTy &LDSUsesInfo,
      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
      DenseSet<GlobalVariable *> const &DynamicVariables,
      std::vector<Function *> const &OrderedKernels) {
    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS;
    if (KernelsThatIndirectlyAllocateDynamicLDS.empty())
      return KernelToCreatedDynamicLDS;

    LLVMContext &Ctx = M.getContext();
    IRBuilder<> Builder(Ctx);
    Type *Int32Ty = Type::getInt32Ty(Ctx);

    // Table is built in the same order as OrderedKernels
    std::vector<Constant *> TableEntries;
    for (Function *Kernel : OrderedKernels) {
      if (!KernelsThatIndirectlyAllocateDynamicLDS.contains(Kernel)) {
        // Placeholder that keeps table indices equal to kernel ids.
        TableEntries.push_back(PoisonValue::get(Int32Ty));
        continue;
      }

      assert(isKernelLDS(Kernel));
      if (!Kernel->hasName()) {
        report_fatal_error("Anonymous kernels cannot use LDS variables");
      }

      GlobalVariable *DynLDS =
          buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, Kernel);
      KernelToCreatedDynamicLDS[Kernel] = DynLDS;
      markUsedByKernel(Kernel, DynLDS);

      // Store the address of the representative variable as an i32.
      auto *ByteArrayTy = ArrayType::get(Type::getInt8Ty(Ctx), 0);
      auto *BaseAddr = ConstantExpr::getGetElementPtr(
          ByteArrayTy, DynLDS, ConstantInt::get(Int32Ty, 0), true);
      TableEntries.push_back(ConstantExpr::getPtrToInt(BaseAddr, Int32Ty));
    }
    assert(OrderedKernels.size() == TableEntries.size());

    ArrayType *TableTy = ArrayType::get(Int32Ty, TableEntries.size());
    Constant *TableInit = ConstantArray::get(TableTy, TableEntries);
    GlobalVariable *OffsetTable = new GlobalVariable(
        M, TableTy, true, GlobalValue::InternalLinkage, TableInit,
        "llvm.amdgcn.dynlds.offset.table", nullptr,
        GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS);

    // Only instruction uses located outside kernels go through the table.
    for (GlobalVariable *GV : DynamicVariables) {
      for (Use &U : make_early_inc_range(GV->uses())) {
        auto *UserInst = dyn_cast<Instruction>(U.getUser());
        if (UserInst && !isKernelLDS(UserInst->getFunction()))
          replaceUseWithTableLookup(M, Builder, OffsetTable, GV, U, nullptr);
      }
    }

    return KernelToCreatedDynamicLDS;
  }

  // Run the LDS lowering over module M.
  //
  // Steps, in order:
  //  1. Optionally raise the alignment of LDS globals and rewrite constant
  //     expression uses of LDS into instructions.
  //  2. Compute, per kernel, the LDS variables accessed directly and through
  //     callees, and invert the indirect accesses into a per-variable map.
  //  3. Partition the indirectly accessed variables into the module, table,
  //     kernel and dynamic strategies and work out which kernels must allocate
  //     for each strategy.
  //  4. Create the module scope struct and the per-kernel structs and rewrite
  //     uses to them.
  //  5. Assign kernel ids, build the lookup table and rewrite table accesses.
  //  6. Lower dynamic LDS variables.
  //  7. Record the absolute address of each allocation and the
  //     "amdgpu-lds-size" attribute on each kernel.
  //  8. Erase LDS variables that no longer have uses.
  //
  // Returns whether the module changed; this is currently always true because
  // Changed is unconditionally set below.
  bool runOnModule(Module &M) {
    CallGraph CG = CallGraph(M);
    bool Changed = superAlignLDSGlobals(M);

    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);

    Changed = true; // todo: narrow this down

    // For each kernel, what variables does it access directly or through
    // callees
    LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);

    // For each variable accessed through callees, which kernels access it
    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
    for (auto &K : LDSUsesInfo.indirect_access) {
      Function *F = K.first;
      assert(isKernelLDS(F));
      for (GlobalVariable *GV : K.second) {
        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
      }
    }

    // Partition variables accessed indirectly into the different strategies
    DenseSet<GlobalVariable *> ModuleScopeVariables;
    DenseSet<GlobalVariable *> TableLookupVariables;
    DenseSet<GlobalVariable *> KernelAccessVariables;
    DenseSet<GlobalVariable *> DynamicVariables;
    partitionVariablesIntoIndirectStrategies(
        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
        ModuleScopeVariables, TableLookupVariables, KernelAccessVariables,
        DynamicVariables);

    // If the kernel accesses a variable that is going to be stored in the
    // module instance through a call then that kernel needs to allocate the
    // module instance
    const DenseSet<Function *> KernelsThatAllocateModuleLDS =
        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                        ModuleScopeVariables);
    const DenseSet<Function *> KernelsThatAllocateTableLDS =
        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                        TableLookupVariables);

    const DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS =
        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                        DynamicVariables);

    // Null when there are no module scope variables.
    GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables(
        M, ModuleScopeVariables, KernelsThatAllocateModuleLDS);

    DenseMap<Function *, LDSVariableReplacement> KernelToReplacement =
        lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables,
                                        KernelsThatAllocateModuleLDS,
                                        MaybeModuleScopeStruct);

    // Lower zero cost accesses to the kernel instances just created
    for (auto &GV : KernelAccessVariables) {
      auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV];
      assert(funcs.size() == 1); // Only one kernel can access it
      LDSVariableReplacement Replacement =
          KernelToReplacement[*(funcs.begin())];

      DenseSet<GlobalVariable *> Vec;
      Vec.insert(GV);

      // Every remaining instruction use is redirected to the struct of the
      // single kernel that can reach the variable.
      replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) {
        return isa<Instruction>(U.getUser());
      });
    }

    // The ith element of this vector is kernel id i
    std::vector<Function *> OrderedKernels =
        assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS,
                                      KernelsThatIndirectlyAllocateDynamicLDS);

    if (!KernelsThatAllocateTableLDS.empty()) {
      LLVMContext &Ctx = M.getContext();
      IRBuilder<> Builder(Ctx);

      // The order must be consistent between lookup table and accesses to
      // lookup table
      auto TableLookupVariablesOrdered =
          sortByName(std::vector<GlobalVariable *>(TableLookupVariables.begin(),
                                                   TableLookupVariables.end()));

      GlobalVariable *LookupTable = buildLookupTable(
          M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
      replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
                                               LookupTable);

      // Strip amdgpu-no-lds-kernel-id from all functions reachable from the
      // kernel. We may have inferred this wasn't used prior to the pass.
      //
      // TODO: We could filter out subgraphs that do not access LDS globals.
      for (Function *F : KernelsThatAllocateTableLDS)
        removeNoLdsKernelIdFromReachable(CG, F);
    }

    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
        lowerDynamicLDSVariables(M, LDSUsesInfo,
                                 KernelsThatIndirectlyAllocateDynamicLDS,
                                 DynamicVariables, OrderedKernels);

    // All kernel frames have been allocated. Calculate and record the
    // addresses.
    {
      const DataLayout &DL = M.getDataLayout();

      for (Function &Func : M.functions()) {
        if (Func.isDeclaration() || !isKernelLDS(&Func))
          continue;

        // All three of these are optional. The first variable is allocated at
        // zero. They are allocated by AMDGPUMachineFunction as one block.
        // Layout:
        //{
        //  module.lds
        //  alignment padding
        //  kernel instance
        //  alignment padding
        //  dynamic lds variables
        //}

        const bool AllocateModuleScopeStruct =
            MaybeModuleScopeStruct &&
            KernelsThatAllocateModuleLDS.contains(&Func);

        auto Replacement = KernelToReplacement.find(&Func);
        const bool AllocateKernelScopeStruct =
            Replacement != KernelToReplacement.end();

        const bool AllocateDynamicVariable =
            KernelToCreatedDynamicLDS.contains(&Func);

        // Running byte offset within this kernel's LDS block.
        uint32_t Offset = 0;

        if (AllocateModuleScopeStruct) {
          // Allocated at zero, recorded once on construction, not once per
          // kernel
          Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
        }

        if (AllocateKernelScopeStruct) {
          GlobalVariable *KernelStruct = Replacement->second.SGV;
          Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
          recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
          Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
        }

        // If there is dynamic allocation, the alignment needed is included in
        // the static frame size. There may be no reference to the dynamic
        // variable in the kernel itself, so without including it here, that
        // alignment padding could be missed.
        if (AllocateDynamicVariable) {
          GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
          Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
          recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
        }

        if (Offset != 0) {
          (void)TM; // TODO: Account for target maximum LDS
          // The attribute value is "<size>" or, with dynamic LDS,
          // "<size>,<size>".
          std::string Buffer;
          raw_string_ostream SS{Buffer};
          SS << format("%u", Offset);

          // Instead of explicitly marking kernels that access dynamic variables
          // using special case metadata, annotate with min-lds == max-lds, i.e.
          // that there is no more space available for allocating more static
          // LDS variables. That is the right condition to prevent allocating
          // more variables which would collide with the addresses assigned to
          // dynamic variables.
          if (AllocateDynamicVariable)
            SS << format(",%u", Offset);

          Func.addFnAttr("amdgpu-lds-size", Buffer);
        }
      }
    }

    // Clean up: drop LDS variables that have no uses left once dead constant
    // users are removed. Early-inc iteration because globals are erased while
    // walking the list.
    for (auto &GV : make_early_inc_range(M.globals()))
      if (AMDGPU::isLDSVariableToLower(GV)) {
        // probably want to remove from used lists
        GV.removeDeadConstantUsers();
        if (GV.use_empty())
          GV.eraseFromParent();
      }

    return Changed;
  }

private:
  // Increase the alignment of LDS globals if necessary to maximise the chance
  // that we can use aligned LDS instructions to access them.
  //
  // Does nothing unless SuperAlignLDSGlobals is set. Only globals in the local
  // address space that have an initializer are considered. Returns true if the
  // alignment of at least one global was raised.
  static bool superAlignLDSGlobals(Module &M) {
    if (!SuperAlignLDSGlobals)
      return false;

    const DataLayout &DL = M.getDataLayout();
    bool Changed = false;

    for (auto &GV : M.globals()) {
      // Only changing alignment of LDS variables. Variables without an
      // initializer (cuda/hip extern __shared__) keep their alignment.
      const bool IsLDS =
          GV.getType()->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
      if (!IsLDS || !GV.hasInitializer())
        continue;

      const Align Existing = AMDGPU::getAlign(DL, &GV);
      TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());

      // Smallest alignment that suits the widest access the size suggests.
      Align Floor(1);
      if (GVSize > 8) {
        // We might want to use a b96 or b128 load/store
        Floor = Align(16);
      } else if (GVSize > 4) {
        // We might want to use a b64 load/store
        Floor = Align(8);
      } else if (GVSize > 2) {
        // We might want to use a b32 load/store
        Floor = Align(4);
      } else if (GVSize > 1) {
        // We might want to use a b16 load/store
        Floor = Align(2);
      }

      // Never lower an alignment that is already stricter than the floor.
      const Align Wanted = std::max(Existing, Floor);
      if (Wanted != Existing) {
        Changed = true;
        GV.setAlignment(Wanted);
      }
    }

    return Changed;
  }

  // Create a struct-typed LDS global named VarName whose fields are the
  // variables in LDSVarsToTransform, and return it together with a map from
  // each of those variables to a ConstantExpr GEP addressing its field.
  //
  // Field order comes from performOptimizedStructLayout. Where a field would
  // otherwise be misaligned, an i8 array padding field is inserted in front of
  // it. The padding is modelled as temporary globals that have no uses, so no
  // aliasing metadata is useful for them; they are erased before returning.
  //
  // LDSVarsToTransform must not be empty.
  static LDSVariableReplacement createLDSVariableReplacement(
      Module &M, std::string VarName,
      DenseSet<GlobalVariable *> const &LDSVarsToTransform) {
    assert(!LDSVarsToTransform.empty());

    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // The order of fields in the struct depends on the order of variables
    // handed to the layout routine. Set iteration order varies when changing
    // how the variables are identified, which leads to spurious test breakage,
    // so feed them in sorted by name.
    SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
    LayoutFields.reserve(LDSVarsToTransform.size());
    {
      auto ByName = sortByName(std::vector<GlobalVariable *>(
          LDSVarsToTransform.begin(), LDSVarsToTransform.end()));
      for (GlobalVariable *GV : ByName)
        LayoutFields.push_back(OptimizedStructLayoutField(
            GV, DL.getTypeAllocSize(GV->getValueType()),
            AMDGPU::getAlign(DL, GV)));
    }

    performOptimizedStructLayout(LayoutFields);

    // Walk the chosen order, tracking the running offset and materialising
    // explicit padding wherever the next field would be misaligned.
    std::vector<GlobalVariable *> LocalVars;
    BitVector IsPaddingField;
    LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
    IsPaddingField.reserve(LDSVarsToTransform.size());

    uint64_t RunningOffset = 0;
    for (const OptimizedStructLayoutField &Field : LayoutFields) {
      const uint64_t AlignBytes = Field.Alignment.value();
      const uint64_t Misalignment = RunningOffset % AlignBytes;

      if (Misalignment != 0) {
        // With o the offset and a the alignment,
        //   (o + (a - (o % a))) % a == 0
        // so this many bytes of padding brings the offset back into alignment.
        const uint64_t PadBytes = AlignBytes - Misalignment;

        Type *PadTy = ArrayType::get(Type::getInt8Ty(Ctx), PadBytes);
        GlobalVariable *Pad = new GlobalVariable(
            M, PadTy, false, GlobalValue::InternalLinkage,
            PoisonValue::get(PadTy), "", nullptr, GlobalValue::NotThreadLocal,
            AMDGPUAS::LOCAL_ADDRESS, false);
        LocalVars.push_back(Pad);
        IsPaddingField.push_back(true);
        RunningOffset += PadBytes;
      }

      // The layout field's Id is the GlobalVariable it was created from.
      LocalVars.push_back(
          static_cast<GlobalVariable *>(const_cast<void *>(Field.Id)));
      IsPaddingField.push_back(false);
      RunningOffset += Field.Size;
    }

    // The struct has one member per entry in LocalVars, padding included.
    std::vector<Type *> LocalVarTypes;
    LocalVarTypes.reserve(LocalVars.size());
    for (const GlobalVariable *V : LocalVars)
      LocalVarTypes.push_back(V->getValueType());

    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    // The struct instance takes the alignment of its first member.
    Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, PoisonValue::get(LDSTy),
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(StructAlign);

    // Map every real variable to a GEP of its field, and drop the temporary
    // padding globals now that their types are baked into the struct.
    DenseMap<GlobalVariable *, Constant *> Map;
    Type *I32 = Type::getInt32Ty(Ctx);
    Constant *Zero = ConstantInt::get(I32, 0);
    for (size_t FieldNo = 0, NumFields = LocalVars.size(); FieldNo != NumFields;
         ++FieldNo) {
      GlobalVariable *GV = LocalVars[FieldNo];
      Constant *GEPIdx[] = {Zero, ConstantInt::get(I32, FieldNo)};
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);

      if (!IsPaddingField[FieldNo]) {
        Map[GV] = GEP;
        continue;
      }

      assert(GV->use_empty());
      GV->eraseFromParent();
    }

    assert(Map.size() == LDSVarsToTransform.size());
    return {SGV, std::move(Map)};
  }

  // Rewrite uses of each variable in LDSVarsToTransformArg to the constant GEP
  // recorded for it in Replacement, restricted to the uses accepted by
  // Predicate. Afterwards the users of each GEP are refined: alignments are
  // raised to what the field's offset within the struct guarantees and, when
  // more than one variable is involved, alias.scope / noalias metadata is
  // attached that says each field does not alias any of the other fields.
  template <typename PredicateTy>
  static void replaceLDSVariablesWithStruct(
      Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
      const LDSVariableReplacement &Replacement, PredicateTy Predicate) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // A hack... the aliasing info needs to be inserted in a predictable order
    // for lit tests, so process the variables sorted by name. Ideally they
    // would already be in a stable order, preferably the order in which they
    // get allocated, which might mean an ordered set container.
    auto SortedVars = sortByName(std::vector<GlobalVariable *>(
        LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end()));
    const size_t NumVars = SortedVars.size();

    // One anonymous scope per variable, all within a single fresh domain.
    // NoAliasList starts out as "every scope except the first" and is patched
    // in the loop below so that for variable I it holds every scope except
    // scope I. With a single variable both lists stay empty and no metadata
    // is produced.
    SmallVector<MDNode *> AliasScopes;
    SmallVector<Metadata *> NoAliasList;
    if (NumVars > 1) {
      MDBuilder MDB(Ctx);
      AliasScopes.reserve(NumVars);
      MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
      while (AliasScopes.size() < NumVars)
        AliasScopes.push_back(MDB.createAnonymousAliasScope(Domain));
      NoAliasList.append(AliasScopes.begin() + 1, AliasScopes.end());
    }

    // Replace uses of the ith variable with a constantexpr to the
    // corresponding field of the instance that will be allocated by
    // AMDGPUMachineFunction.
    for (size_t VarNo = 0; VarNo != NumVars; ++VarNo) {
      GlobalVariable *GV = SortedVars[VarNo];
      Constant *FieldGEP = Replacement.LDSVarsToConstantGEP.at(GV);

      GV->replaceUsesWithIf(FieldGEP, Predicate);

      // The field's byte offset into the struct bounds the alignment that can
      // be promised for accesses through the GEP.
      APInt FieldOffset(DL.getIndexTypeSizeInBits(FieldGEP->getType()), 0);
      FieldGEP->stripAndAccumulateInBoundsConstantOffsets(DL, FieldOffset);
      const Align StructAlign = Replacement.SGV->getAlign().valueOrOne();
      const Align FieldAlign =
          commonAlignment(StructAlign, FieldOffset.getZExtValue());

      // Slot VarNo - 1 currently holds scope VarNo; swap the previous
      // variable's scope back in so the list excludes exactly this variable.
      if (VarNo != 0)
        NoAliasList[VarNo - 1] = AliasScopes[VarNo - 1];

      MDNode *NoAlias = nullptr;
      if (!NoAliasList.empty())
        NoAlias = MDNode::get(Ctx, NoAliasList);

      MDNode *AliasScope = nullptr;
      if (!AliasScopes.empty())
        AliasScope = MDNode::get(Ctx, {AliasScopes[VarNo]});

      refineUsesAlignmentAndAA(FieldGEP, FieldAlign, DL, AliasScope, NoAlias);
    }
  }

  // Walk the instruction users of Ptr, which is known to be aligned to A, and
  //  - raise the alignment of loads, stores, atomicrmw and cmpxchg that access
  //    memory through Ptr to at least A;
  //  - when AliasScope is non-null, merge AliasScope / NoAlias into the
  //    alias.scope / noalias metadata of users that may read or write memory;
  //  - recurse through GEPs (adjusting A by the constant offset, or falling
  //    back to an alignment of 1 when the offset is not constant), bitcasts
  //    and addrspacecasts, at most MaxDepth levels deep.
  // Users that are not instructions are ignored.
  static void refineUsesAlignmentAndAA(Value *Ptr, Align A,
                                       const DataLayout &DL, MDNode *AliasScope,
                                       MDNode *NoAlias, unsigned MaxDepth = 5) {
    // Stop when the depth budget is spent, or when there is neither a useful
    // alignment nor any aliasing metadata left to propagate.
    const bool NothingToRefine = A == 1 && !AliasScope;
    if (MaxDepth == 0 || NothingToRefine)
      return;

    for (User *U : Ptr->users()) {
      auto *Inst = dyn_cast<Instruction>(U);
      if (!Inst)
        continue;

      // NOTE(review): this runs for every memory-touching instruction user,
      // before any check that Ptr is the accessed pointer operand (e.g. it
      // also applies to a store whose *value* operand is Ptr) - confirm this
      // is intended.
      if (AliasScope && Inst->mayReadOrWriteMemory()) {
        MDNode *Scope = Inst->getMetadata(LLVMContext::MD_alias_scope);
        Scope = Scope ? MDNode::getMostGenericAliasScope(Scope, AliasScope)
                      : AliasScope;
        Inst->setMetadata(LLVMContext::MD_alias_scope, Scope);

        MDNode *Excluded = Inst->getMetadata(LLVMContext::MD_noalias);
        Excluded = Excluded ? MDNode::intersect(Excluded, NoAlias) : NoAlias;
        Inst->setMetadata(LLVMContext::MD_noalias, Excluded);
      }

      switch (Inst->getOpcode()) {
      case Instruction::Load: {
        auto *LI = cast<LoadInst>(Inst);
        LI->setAlignment(std::max(A, LI->getAlign()));
        break;
      }
      case Instruction::Store: {
        // Ptr may be the stored value rather than the address.
        auto *SI = cast<StoreInst>(Inst);
        if (SI->getPointerOperand() == Ptr)
          SI->setAlignment(std::max(A, SI->getAlign()));
        break;
      }
      case Instruction::AtomicRMW: {
        // None of atomicrmw operations can work on pointers, but let's
        // check it anyway in case it will or we will process ConstantExpr.
        auto *RMW = cast<AtomicRMWInst>(Inst);
        if (RMW->getPointerOperand() == Ptr)
          RMW->setAlignment(std::max(A, RMW->getAlign()));
        break;
      }
      case Instruction::AtomicCmpXchg: {
        auto *CmpXchg = cast<AtomicCmpXchgInst>(Inst);
        if (CmpXchg->getPointerOperand() == Ptr)
          CmpXchg->setAlignment(std::max(A, CmpXchg->getAlign()));
        break;
      }
      case Instruction::GetElementPtr: {
        auto *GEP = cast<GetElementPtrInst>(Inst);
        if (GEP->getPointerOperand() != Ptr)
          break;

        // A constant offset lets us derive the alignment of the result;
        // otherwise only the default alignment of 1 can be assumed.
        APInt Off(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
        Align GA;
        if (GEP->accumulateConstantOffset(DL, Off))
          GA = commonAlignment(A, Off.getLimitedValue());
        refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
                                 MaxDepth - 1);
        break;
      }
      case Instruction::BitCast:
      case Instruction::AddrSpaceCast:
        // Casts keep the address, so the alignment carries over unchanged.
        refineUsesAlignmentAndAA(Inst, A, DL, AliasScope, NoAlias,
                                 MaxDepth - 1);
        break;
      default:
        break;
      }
    }
  }
};

class AMDGPULowerModuleLDSLegacy : public ModulePass {
public:
  const AMDGPUTargetMachine *TM;
  static char ID;

  AMDGPULowerModuleLDSLegacy(const AMDGPUTargetMachine *TM_ = nullptr)
      : ModulePass(ID), TM(TM_) {
    initializeAMDGPULowerModuleLDSLegacyPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    if (!TM)
      AU.addRequired<TargetPassConfig>();
  }

  bool runOnModule(Module &M) override {
    if (!TM) {
      auto &TPC = getAnalysis<TargetPassConfig>();
      TM = &TPC.getTM<AMDGPUTargetMachine>();
    }

    return AMDGPULowerModuleLDS(*TM).runOnModule(M);
  }
};

} // namespace
// Definition of the legacy pass's static ID member, which its constructor
// passes to ModulePass.
char AMDGPULowerModuleLDSLegacy::ID = 0;

// Reference to that same ID, exposed as llvm::AMDGPULowerModuleLDSLegacyPassID.
char &llvm::AMDGPULowerModuleLDSLegacyPassID = AMDGPULowerModuleLDSLegacy::ID;

// Register the legacy pass under DEBUG_TYPE and declare its dependency on
// TargetPassConfig.
// NOTE(review): the two trailing `false` arguments are presumably the
// CFG-only and is-analysis flags - confirm against the macro definition.
INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
                      "Lower uses of LDS variables from non-kernel functions",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
                    "Lower uses of LDS variables from non-kernel functions",
                    false, false)

// Factory for the legacy pass manager version of the pass. TM is forwarded to
// the pass constructor; the returned pass is heap-allocated with `new`.
ModulePass *
llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) {
  ModulePass *LegacyPass = new AMDGPULowerModuleLDSLegacy(TM);
  return LegacyPass;
}

// New pass manager entry point: runs the lowering on M. All analyses are
// reported as preserved when the module was left untouched, none otherwise.
PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &) {
  const bool Modified = AMDGPULowerModuleLDS(TM).runOnModule(M);
  if (!Modified)
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}