1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
15#include "MCTargetDesc/NVPTXBaseInfo.h"
16#include "NVPTX.h"
17#include "NVPTXSubtarget.h"
18#include "NVPTXTargetMachine.h"
19#include "NVPTXTargetObjectFile.h"
20#include "NVPTXUtilities.h"
21#include "llvm/ADT/APInt.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/ADT/StringRef.h"
25#include "llvm/CodeGen/Analysis.h"
26#include "llvm/CodeGen/ISDOpcodes.h"
27#include "llvm/CodeGen/MachineFunction.h"
28#include "llvm/CodeGen/MachineMemOperand.h"
29#include "llvm/CodeGen/MachineValueType.h"
30#include "llvm/CodeGen/SelectionDAG.h"
31#include "llvm/CodeGen/SelectionDAGNodes.h"
32#include "llvm/CodeGen/TargetCallingConv.h"
33#include "llvm/CodeGen/TargetLowering.h"
34#include "llvm/CodeGen/ValueTypes.h"
35#include "llvm/IR/Argument.h"
36#include "llvm/IR/Attributes.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
39#include "llvm/IR/DerivedTypes.h"
40#include "llvm/IR/DiagnosticInfo.h"
41#include "llvm/IR/FPEnv.h"
42#include "llvm/IR/Function.h"
43#include "llvm/IR/GlobalValue.h"
44#include "llvm/IR/Instruction.h"
45#include "llvm/IR/Instructions.h"
46#include "llvm/IR/IntrinsicsNVPTX.h"
47#include "llvm/IR/Module.h"
48#include "llvm/IR/Type.h"
49#include "llvm/IR/Value.h"
50#include "llvm/Support/Casting.h"
51#include "llvm/Support/CodeGen.h"
52#include "llvm/Support/CommandLine.h"
53#include "llvm/Support/ErrorHandling.h"
54#include "llvm/Support/raw_ostream.h"
55#include "llvm/Target/TargetMachine.h"
56#include "llvm/Target/TargetOptions.h"
57#include <algorithm>
58#include <cassert>
59#include <cmath>
60#include <cstdint>
61#include <iterator>
62#include <sstream>
63#include <string>
64#include <utility>
65#include <vector>
66
67#define DEBUG_TYPE "nvptx-lower"
68
69using namespace llvm;
70
71static std::atomic<unsigned> GlobalUniqueCallSite;
72
73static cl::opt<bool> sched4reg(
74    "nvptx-sched4reg",
75    cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
76
77static cl::opt<unsigned> FMAContractLevelOpt(
78    "nvptx-fma-level", cl::Hidden,
79    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
80             " 1: do it  2: do it aggressively"),
81    cl::init(2));
82
83static cl::opt<int> UsePrecDivF32(
84    "nvptx-prec-divf32", cl::Hidden,
85    cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
86             " IEEE Compliant F32 div.rnd if available."),
87    cl::init(2));
88
89static cl::opt<bool> UsePrecSqrtF32(
90    "nvptx-prec-sqrtf32", cl::Hidden,
91    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
92    cl::init(true));
93
94static cl::opt<bool> ForceMinByValParamAlign(
95    "nvptx-force-min-byval-param-align", cl::Hidden,
96    cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
97             " params of device functions."),
98    cl::init(false));
99
100int NVPTXTargetLowering::getDivF32Level() const {
101  if (UsePrecDivF32.getNumOccurrences() > 0) {
102    // If nvptx-prec-div32=N is used on the command-line, always honor it
103    return UsePrecDivF32;
104  } else {
105    // Otherwise, use div.approx if fast math is enabled
106    if (getTargetMachine().Options.UnsafeFPMath)
107      return 0;
108    else
109      return 2;
110  }
111}
112
113bool NVPTXTargetLowering::usePrecSqrtF32() const {
114  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
115    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
116    return UsePrecSqrtF32;
117  } else {
118    // Otherwise, use sqrt.approx if fast math is enabled
119    return !getTargetMachine().Options.UnsafeFPMath;
120  }
121}
122
123bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
124  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
125         DenormalMode::PreserveSign;
126}
127
128static bool IsPTXVectorType(MVT VT) {
129  switch (VT.SimpleTy) {
130  default:
131    return false;
132  case MVT::v2i1:
133  case MVT::v4i1:
134  case MVT::v2i8:
135  case MVT::v4i8:
136  case MVT::v2i16:
137  case MVT::v4i16:
138  case MVT::v8i16: // <4 x i16x2>
139  case MVT::v2i32:
140  case MVT::v4i32:
141  case MVT::v2i64:
142  case MVT::v2f16:
143  case MVT::v4f16:
144  case MVT::v8f16: // <4 x f16x2>
145  case MVT::v2bf16:
146  case MVT::v4bf16:
147  case MVT::v8bf16: // <4 x bf16x2>
148  case MVT::v2f32:
149  case MVT::v4f32:
150  case MVT::v2f64:
151    return true;
152  }
153}
154
155static bool Is16bitsType(MVT VT) {
156  return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
157          VT.SimpleTy == MVT::i16);
158}
159
160/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
161/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
162/// into their primitive components.
163/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
164/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
165/// LowerCall, and LowerReturn.
166static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
167                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
168                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
169                               uint64_t StartingOffset = 0) {
170  SmallVector<EVT, 16> TempVTs;
171  SmallVector<uint64_t, 16> TempOffsets;
172
173  // Special case for i128 - decompose to (i64, i64)
174  if (Ty->isIntegerTy(128)) {
175    ValueVTs.push_back(EVT(MVT::i64));
176    ValueVTs.push_back(EVT(MVT::i64));
177
178    if (Offsets) {
179      Offsets->push_back(StartingOffset + 0);
180      Offsets->push_back(StartingOffset + 8);
181    }
182
183    return;
184  }
185
186  // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
187  if (StructType *STy = dyn_cast<StructType>(Ty)) {
188    auto const *SL = DL.getStructLayout(STy);
189    auto ElementNum = 0;
190    for(auto *EI : STy->elements()) {
191      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
192                         StartingOffset + SL->getElementOffset(ElementNum));
193      ++ElementNum;
194    }
195    return;
196  }
197
198  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
199  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
200    EVT VT = TempVTs[i];
201    uint64_t Off = TempOffsets[i];
202    // Split vectors into individual elements, except for v2f16, which
203    // we will pass as a single scalar.
204    if (VT.isVector()) {
205      unsigned NumElts = VT.getVectorNumElements();
206      EVT EltVT = VT.getVectorElementType();
207      // Vectors with an even number of f16 elements will be passed to
208      // us as an array of v2f16/v2bf16 elements. We must match this so we
209      // stay in sync with Ins/Outs.
210      if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
211        switch (EltVT.getSimpleVT().SimpleTy) {
212        case MVT::f16:
213          EltVT = MVT::v2f16;
214          break;
215        case MVT::bf16:
216          EltVT = MVT::v2bf16;
217          break;
218        case MVT::i16:
219          EltVT = MVT::v2i16;
220          break;
221        default:
222          llvm_unreachable("Unexpected type");
223        }
224        NumElts /= 2;
225      } else if (EltVT.getSimpleVT() == MVT::i8 &&
226                 (NumElts % 4 == 0 || NumElts == 3)) {
227        // v*i8 are formally lowered as v4i8
228        EltVT = MVT::v4i8;
229        NumElts = (NumElts + 3) / 4;
230      }
231      for (unsigned j = 0; j != NumElts; ++j) {
232        ValueVTs.push_back(EltVT);
233        if (Offsets)
234          Offsets->push_back(Off + j * EltVT.getStoreSize());
235      }
236    } else {
237      ValueVTs.push_back(VT);
238      if (Offsets)
239        Offsets->push_back(Off);
240    }
241  }
242}
243
244/// PromoteScalarIntegerPTX
245/// Used to make sure the arguments/returns are suitable for passing
246/// and promote them to a larger size if they're not.
247///
248/// The promoted type is placed in \p PromoteVT if the function returns true.
249static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
250  if (VT.isScalarInteger()) {
251    switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
252    default:
253      llvm_unreachable(
254          "Promotion is not suitable for scalars of size larger than 64-bits");
255    case 1:
256      *PromotedVT = MVT::i1;
257      break;
258    case 2:
259    case 4:
260    case 8:
261      *PromotedVT = MVT::i8;
262      break;
263    case 16:
264      *PromotedVT = MVT::i16;
265      break;
266    case 32:
267      *PromotedVT = MVT::i32;
268      break;
269    case 64:
270      *PromotedVT = MVT::i64;
271      break;
272    }
273    return EVT(*PromotedVT) != VT;
274  }
275  return false;
276}
277
278// Check whether we can merge loads/stores of some of the pieces of a
279// flattened function parameter or return value into a single vector
280// load/store.
281//
282// The flattened parameter is represented as a list of EVTs and
283// offsets, and the whole structure is aligned to ParamAlignment. This
284// function determines whether we can load/store pieces of the
285// parameter starting at index Idx using a single vectorized op of
286// size AccessSize. If so, it returns the number of param pieces
287// covered by the vector op. Otherwise, it returns 1.
288static unsigned CanMergeParamLoadStoresStartingAt(
289    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
290    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
291
292  // Can't vectorize if param alignment is not sufficient.
293  if (ParamAlignment < AccessSize)
294    return 1;
295  // Can't vectorize if offset is not aligned.
296  if (Offsets[Idx] & (AccessSize - 1))
297    return 1;
298
299  EVT EltVT = ValueVTs[Idx];
300  unsigned EltSize = EltVT.getStoreSize();
301
302  // Element is too large to vectorize.
303  if (EltSize >= AccessSize)
304    return 1;
305
306  unsigned NumElts = AccessSize / EltSize;
307  // Can't vectorize if AccessBytes if not a multiple of EltSize.
308  if (AccessSize != EltSize * NumElts)
309    return 1;
310
311  // We don't have enough elements to vectorize.
312  if (Idx + NumElts > ValueVTs.size())
313    return 1;
314
315  // PTX ISA can only deal with 2- and 4-element vector ops.
316  if (NumElts != 4 && NumElts != 2)
317    return 1;
318
319  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
320    // Types do not match.
321    if (ValueVTs[j] != EltVT)
322      return 1;
323
324    // Elements are not contiguous.
325    if (Offsets[j] - Offsets[j - 1] != EltSize)
326      return 1;
327  }
328  // OK. We can vectorize ValueVTs[i..i+NumElts)
329  return NumElts;
330}
331
332// Flags for tracking per-element vectorization state of loads/stores
333// of a flattened function parameter or return value.
334enum ParamVectorizationFlags {
335  PVF_INNER = 0x0, // Middle elements of a vector.
336  PVF_FIRST = 0x1, // First element of the vector.
337  PVF_LAST = 0x2,  // Last element of the vector.
338  // Scalar is effectively a 1-element vector.
339  PVF_SCALAR = PVF_FIRST | PVF_LAST
340};
341
342// Computes whether and how we can vectorize the loads/stores of a
343// flattened function parameter or return value.
344//
345// The flattened parameter is represented as the list of ValueVTs and
346// Offsets, and is aligned to ParamAlignment bytes. We return a vector
347// of the same size as ValueVTs indicating how each piece should be
348// loaded/stored (i.e. as a scalar, or as part of a vector
349// load/store).
350static SmallVector<ParamVectorizationFlags, 16>
351VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
352                     const SmallVectorImpl<uint64_t> &Offsets,
353                     Align ParamAlignment, bool IsVAArg = false) {
354  // Set vector size to match ValueVTs and mark all elements as
355  // scalars by default.
356  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
357  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
358
359  if (IsVAArg)
360    return VectorInfo;
361
362  // Check what we can vectorize using 128/64/32-bit accesses.
363  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
364    // Skip elements we've already processed.
365    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
366    for (unsigned AccessSize : {16, 8, 4, 2}) {
367      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
368          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
369      // Mark vectorized elements.
370      switch (NumElts) {
371      default:
372        llvm_unreachable("Unexpected return value");
373      case 1:
374        // Can't vectorize using this size, try next smaller size.
375        continue;
376      case 2:
377        assert(I + 1 < E && "Not enough elements.");
378        VectorInfo[I] = PVF_FIRST;
379        VectorInfo[I + 1] = PVF_LAST;
380        I += 1;
381        break;
382      case 4:
383        assert(I + 3 < E && "Not enough elements.");
384        VectorInfo[I] = PVF_FIRST;
385        VectorInfo[I + 1] = PVF_INNER;
386        VectorInfo[I + 2] = PVF_INNER;
387        VectorInfo[I + 3] = PVF_LAST;
388        I += 3;
389        break;
390      }
391      // Break out of the inner loop because we've already succeeded
392      // using largest possible AccessSize.
393      break;
394    }
395  }
396  return VectorInfo;
397}
398
399// NVPTXTargetLowering Constructor.
400NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
401                                         const NVPTXSubtarget &STI)
402    : TargetLowering(TM), nvTM(&TM), STI(STI) {
403  // always lower memset, memcpy, and memmove intrinsics to load/store
404  // instructions, rather
405  // then generating calls to memset, mempcy or memmove.
406  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
407  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF;
408  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF;
409
410  setBooleanContents(ZeroOrNegativeOneBooleanContent);
411  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
412
413  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
414  // condition branches.
415  setJumpIsExpensive(true);
416
417  // Wide divides are _very_ slow. Try to reduce the width of the divide if
418  // possible.
419  addBypassSlowDiv(64, 32);
420
421  // By default, use the Source scheduling
422  if (sched4reg)
423    setSchedulingPreference(Sched::RegPressure);
424  else
425    setSchedulingPreference(Sched::Source);
426
427  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
428                                    LegalizeAction NoF16Action) {
429    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
430  };
431
432  auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
433                                    LegalizeAction NoBF16Action) {
434    bool IsOpSupported = STI.hasBF16Math();
435    // Few instructions are available on sm_90 only
436    switch(Op) {
437      case ISD::FADD:
438      case ISD::FMUL:
439      case ISD::FSUB:
440      case ISD::SELECT:
441      case ISD::SELECT_CC:
442      case ISD::SETCC:
443      case ISD::FEXP2:
444      case ISD::FCEIL:
445      case ISD::FFLOOR:
446      case ISD::FNEARBYINT:
447      case ISD::FRINT:
448      case ISD::FTRUNC:
449        IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
450        break;
451    }
452    setOperationAction(
453        Op, VT, IsOpSupported ? Action : NoBF16Action);
454  };
455
456  auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
457                                     LegalizeAction NoI16x2Action) {
458    bool IsOpSupported = false;
459    // instructions are available on sm_90 only
460    switch (Op) {
461    case ISD::ADD:
462    case ISD::SMAX:
463    case ISD::SMIN:
464    case ISD::UMIN:
465    case ISD::UMAX:
466    case ISD::SUB:
467      IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
468      break;
469    }
470    setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
471  };
472
473  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
474  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
475  addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
476  addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
477  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
478  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
479  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
480  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
481  addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
482  addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
483  addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
484  addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
485
486  // Conversion to/from FP16/FP16x2 is always legal.
487  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
488  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
489  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
490  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
491
492  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
493  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
494
495  // Conversion to/from BFP16/BFP16x2 is always legal.
496  setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
497  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
498  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
499  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);
500
501  setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
502  setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
503  if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
504    AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
505
506  // Conversion to/from i16/i16x2 is always legal.
507  setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
508  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
509  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
510  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
511
512  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
513  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
514  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
515  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
516  // Only logical ops can be done on v4i8 directly, others must be done
517  // elementwise.
518  setOperationAction(
519      {ISD::ABS,         ISD::ADD,        ISD::ADDC,        ISD::ADDE,
520       ISD::BITREVERSE,  ISD::CTLZ,       ISD::CTPOP,       ISD::CTTZ,
521       ISD::FP_TO_SINT,  ISD::FP_TO_UINT, ISD::FSHL,        ISD::FSHR,
522       ISD::MUL,         ISD::MULHS,      ISD::MULHU,       ISD::PARITY,
523       ISD::ROTL,        ISD::ROTR,       ISD::SADDO,       ISD::SADDO_CARRY,
524       ISD::SADDSAT,     ISD::SDIV,       ISD::SDIVREM,     ISD::SELECT_CC,
525       ISD::SETCC,       ISD::SHL,        ISD::SINT_TO_FP,  ISD::SMAX,
526       ISD::SMIN,        ISD::SMULO,      ISD::SMUL_LOHI,   ISD::SRA,
527       ISD::SREM,        ISD::SRL,        ISD::SSHLSAT,     ISD::SSUBO,
528       ISD::SSUBO_CARRY, ISD::SSUBSAT,    ISD::SUB,         ISD::SUBC,
529       ISD::SUBE,        ISD::UADDO,      ISD::UADDO_CARRY, ISD::UADDSAT,
530       ISD::UDIV,        ISD::UDIVREM,    ISD::UINT_TO_FP,  ISD::UMAX,
531       ISD::UMIN,        ISD::UMULO,      ISD::UMUL_LOHI,   ISD::UREM,
532       ISD::USHLSAT,     ISD::USUBO,      ISD::USUBO_CARRY, ISD::VSELECT,
533       ISD::USUBSAT},
534      MVT::v4i8, Expand);
535
536  // Operations not directly supported by NVPTX.
537  for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
538                 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
539                 MVT::i32, MVT::i64}) {
540    setOperationAction(ISD::SELECT_CC, VT, Expand);
541    setOperationAction(ISD::BR_CC, VT, Expand);
542  }
543
544  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
545  // For others we will expand to a SHL/SRA pair.
546  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
547  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
548  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
549  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
550  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
551  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
552
553  setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);
554  setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);
555  setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);
556  setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);
557  setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
558  setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
559
560  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
561  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
562
563  // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs
564  // that don't have h/w rotation we lower them to multi-instruction assembly.
565  // See ROT*_sw in NVPTXIntrInfo.td
566  setOperationAction(ISD::ROTL, MVT::i64, Legal);
567  setOperationAction(ISD::ROTR, MVT::i64, Legal);
568  setOperationAction(ISD::ROTL, MVT::i32, Legal);
569  setOperationAction(ISD::ROTR, MVT::i32, Legal);
570
571  setOperationAction(ISD::ROTL, MVT::i16, Expand);
572  setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
573  setOperationAction(ISD::ROTR, MVT::i16, Expand);
574  setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
575  setOperationAction(ISD::ROTL, MVT::i8, Expand);
576  setOperationAction(ISD::ROTR, MVT::i8, Expand);
577  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
578  setOperationAction(ISD::BSWAP, MVT::v2i16, Expand);
579  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
580  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
581
582  // Indirect branch is not supported.
583  // This also disables Jump Table creation.
584  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
585  setOperationAction(ISD::BRIND, MVT::Other, Expand);
586
587  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
588  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
589
590  // We want to legalize constant related memmove and memcopy
591  // intrinsics.
592  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
593
594  // Turn FP extload into load/fpextend
595  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
596  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
597  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
598  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
599  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
600  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
601  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
602  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
603  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
604  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
605  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
606  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
607  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
608  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
609  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
610  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
611  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
612  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
613  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
614  // Turn FP truncstore into trunc + store.
615  // FIXME: vector types should also be expanded
616  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
617  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
618  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
619  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
620  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
621
622  // PTX does not support load / store predicate registers
623  setOperationAction(ISD::LOAD, MVT::i1, Custom);
624  setOperationAction(ISD::STORE, MVT::i1, Custom);
625
626  for (MVT VT : MVT::integer_valuetypes()) {
627    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
628    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
629    setTruncStoreAction(VT, MVT::i1, Expand);
630  }
631
632  // expand extload of vector of integers.
633  setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
634                   MVT::v2i8, Expand);
635  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
636
637  // This is legal in NVPTX
638  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
639  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
640  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
641  setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
642
643  // Lowering of DYNAMIC_STACKALLOC is unsupported.
644  // Custom lower to produce an error.
645  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
646  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
647
648  // TRAP can be lowered to PTX trap
649  setOperationAction(ISD::TRAP, MVT::Other, Legal);
650
651  // Register custom handling for vector loads/stores
652  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
653    if (IsPTXVectorType(VT)) {
654      setOperationAction(ISD::LOAD, VT, Custom);
655      setOperationAction(ISD::STORE, VT, Custom);
656      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
657    }
658  }
659
660  // Support varargs.
661  setOperationAction(ISD::VASTART, MVT::Other, Custom);
662  setOperationAction(ISD::VAARG, MVT::Other, Custom);
663  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
664  setOperationAction(ISD::VAEND, MVT::Other, Expand);
665
666  // Custom handling for i8 intrinsics
667  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
668
669  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
670    setOperationAction(ISD::ABS,  Ty, Legal);
671    setOperationAction(ISD::SMIN, Ty, Legal);
672    setOperationAction(ISD::SMAX, Ty, Legal);
673    setOperationAction(ISD::UMIN, Ty, Legal);
674    setOperationAction(ISD::UMAX, Ty, Legal);
675
676    setOperationAction(ISD::CTPOP, Ty, Legal);
677    setOperationAction(ISD::CTLZ, Ty, Legal);
678  }
679
680  setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
681  setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
682  setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
683  setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
684  setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
685  setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
686  setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
687
688  setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
689  setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
690  setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
691  setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
692  setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
693  setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
694
695  // Other arithmetic and logic ops are unsupported.
696  setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
697                      ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
698                      ISD::SINT_TO_FP, ISD::UINT_TO_FP},
699                     MVT::v2i16, Expand);
700
701  setOperationAction(ISD::ADDC, MVT::i32, Legal);
702  setOperationAction(ISD::ADDE, MVT::i32, Legal);
703  setOperationAction(ISD::SUBC, MVT::i32, Legal);
704  setOperationAction(ISD::SUBE, MVT::i32, Legal);
705  if (STI.getPTXVersion() >= 43) {
706    setOperationAction(ISD::ADDC, MVT::i64, Legal);
707    setOperationAction(ISD::ADDE, MVT::i64, Legal);
708    setOperationAction(ISD::SUBC, MVT::i64, Legal);
709    setOperationAction(ISD::SUBE, MVT::i64, Legal);
710  }
711
712  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
713  setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
714  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
715  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
716
717  // PTX does not directly support SELP of i1, so promote to i32 first
718  setOperationAction(ISD::SELECT, MVT::i1, Custom);
719
720  // PTX cannot multiply two i64s in a single instruction.
721  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
722  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
723
724  // We have some custom DAG combine patterns for these nodes
725  setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
726                       ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
727                       ISD::VSELECT});
728
729  // setcc for f16x2 and bf16x2 needs special handling to prevent
730  // legalizer's attempt to scalarize it due to v2i1 not being legal.
731  if (STI.allowFP16Math() || STI.hasBF16Math())
732    setTargetDAGCombine(ISD::SETCC);
733
734  // Promote fp16 arithmetic if fp16 hardware isn't available or the
735  // user passed --nvptx-no-fp16-math. The flag is useful because,
736  // although sm_53+ GPUs have some sort of FP16 support in
737  // hardware, only sm_53 and sm_60 have full implementation. Others
738  // only have token amount of hardware and are likely to run faster
739  // by using fp32 units instead.
740  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
741    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
742    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
743    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
744    // bf16 must be promoted to f32.
745    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
746    if (getOperationAction(Op, MVT::bf16) == Promote)
747      AddPromotedToType(Op, MVT::bf16, MVT::f32);
748  }
749
750  // f16/f16x2 neg was introduced in PTX 60, SM_53.
751  const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
752                                        STI.getPTXVersion() >= 60 &&
753                                        STI.allowFP16Math();
754  for (const auto &VT : {MVT::f16, MVT::v2f16})
755    setOperationAction(ISD::FNEG, VT,
756                       IsFP16FP16x2NegAvailable ? Legal : Expand);
757
758  setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
759  setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
760  // (would be) Library functions.
761
762  // These map to conversion instructions for scalar FP types.
763  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
764                         ISD::FROUNDEVEN, ISD::FTRUNC}) {
765    setOperationAction(Op, MVT::f16, Legal);
766    setOperationAction(Op, MVT::f32, Legal);
767    setOperationAction(Op, MVT::f64, Legal);
768    setOperationAction(Op, MVT::v2f16, Expand);
769    setOperationAction(Op, MVT::v2bf16, Expand);
770    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
771    if (getOperationAction(Op, MVT::bf16) == Promote)
772      AddPromotedToType(Op, MVT::bf16, MVT::f32);
773  }
774
775  // sm_80 only has conversions between f32 and bf16. Custom lower all other
776  // bf16 conversions.
777  if (STI.hasBF16Math() &&
778      (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
779    for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
780      setOperationAction(
781          {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
782          VT, Custom);
783    }
784  }
785
786  setOperationAction(ISD::FROUND, MVT::f16, Promote);
787  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
788  setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
789  setOperationAction(ISD::FROUND, MVT::f32, Custom);
790  setOperationAction(ISD::FROUND, MVT::f64, Custom);
791  setOperationAction(ISD::FROUND, MVT::bf16, Promote);
792  AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
793
794  // 'Expand' implements FCOPYSIGN without calling an external library.
795  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
796  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
797  setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
798  setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
799  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
800  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
801
802  // These map to corresponding instructions for f32/f64. f16 must be
803  // promoted to f32. v2f16 is expanded to f16, which is then promoted
804  // to f32.
805  for (const auto &Op :
806       {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
807    setOperationAction(Op, MVT::f16, Promote);
808    setOperationAction(Op, MVT::f32, Legal);
809    setOperationAction(Op, MVT::f64, Legal);
810    setOperationAction(Op, MVT::v2f16, Expand);
811    setOperationAction(Op, MVT::v2bf16, Expand);
812    setOperationAction(Op, MVT::bf16, Promote);
813    AddPromotedToType(Op, MVT::bf16, MVT::f32);
814  }
815  for (const auto &Op : {ISD::FABS}) {
816    setOperationAction(Op, MVT::f16, Promote);
817    setOperationAction(Op, MVT::f32, Legal);
818    setOperationAction(Op, MVT::f64, Legal);
819    setOperationAction(Op, MVT::v2f16, Expand);
820    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
821    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
822    if (getOperationAction(Op, MVT::bf16) == Promote)
823      AddPromotedToType(Op, MVT::bf16, MVT::f32);
824  }
825
826  // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
827  auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
828    bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
829    return IsAtLeastSm80 ? Legal : NotSm80Action;
830  };
831  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
832    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
833    setOperationAction(Op, MVT::f32, Legal);
834    setOperationAction(Op, MVT::f64, Legal);
835    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
836    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
837    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
838    if (getOperationAction(Op, MVT::bf16) == Promote)
839      AddPromotedToType(Op, MVT::bf16, MVT::f32);
840  }
841  for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
842    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
843    setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
844    setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
845    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
846    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
847  }
848
849  // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
850  // No FPOW or FREM in PTX.
851
852  // Now deduce the information based on the above mentioned
853  // actions
854  computeRegisterProperties(STI.getRegisterInfo());
855
856  setMinCmpXchgSizeInBits(32);
857  setMaxAtomicSizeInBitsSupported(64);
858}
859
860const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
861  switch ((NVPTXISD::NodeType)Opcode) {
862  case NVPTXISD::FIRST_NUMBER:
863    break;
864  case NVPTXISD::CALL:
865    return "NVPTXISD::CALL";
866  case NVPTXISD::RET_GLUE:
867    return "NVPTXISD::RET_GLUE";
868  case NVPTXISD::LOAD_PARAM:
869    return "NVPTXISD::LOAD_PARAM";
870  case NVPTXISD::Wrapper:
871    return "NVPTXISD::Wrapper";
872  case NVPTXISD::DeclareParam:
873    return "NVPTXISD::DeclareParam";
874  case NVPTXISD::DeclareScalarParam:
875    return "NVPTXISD::DeclareScalarParam";
876  case NVPTXISD::DeclareRet:
877    return "NVPTXISD::DeclareRet";
878  case NVPTXISD::DeclareScalarRet:
879    return "NVPTXISD::DeclareScalarRet";
880  case NVPTXISD::DeclareRetParam:
881    return "NVPTXISD::DeclareRetParam";
882  case NVPTXISD::PrintCall:
883    return "NVPTXISD::PrintCall";
884  case NVPTXISD::PrintConvergentCall:
885    return "NVPTXISD::PrintConvergentCall";
886  case NVPTXISD::PrintCallUni:
887    return "NVPTXISD::PrintCallUni";
888  case NVPTXISD::PrintConvergentCallUni:
889    return "NVPTXISD::PrintConvergentCallUni";
890  case NVPTXISD::LoadParam:
891    return "NVPTXISD::LoadParam";
892  case NVPTXISD::LoadParamV2:
893    return "NVPTXISD::LoadParamV2";
894  case NVPTXISD::LoadParamV4:
895    return "NVPTXISD::LoadParamV4";
896  case NVPTXISD::StoreParam:
897    return "NVPTXISD::StoreParam";
898  case NVPTXISD::StoreParamV2:
899    return "NVPTXISD::StoreParamV2";
900  case NVPTXISD::StoreParamV4:
901    return "NVPTXISD::StoreParamV4";
902  case NVPTXISD::StoreParamS32:
903    return "NVPTXISD::StoreParamS32";
904  case NVPTXISD::StoreParamU32:
905    return "NVPTXISD::StoreParamU32";
906  case NVPTXISD::CallArgBegin:
907    return "NVPTXISD::CallArgBegin";
908  case NVPTXISD::CallArg:
909    return "NVPTXISD::CallArg";
910  case NVPTXISD::LastCallArg:
911    return "NVPTXISD::LastCallArg";
912  case NVPTXISD::CallArgEnd:
913    return "NVPTXISD::CallArgEnd";
914  case NVPTXISD::CallVoid:
915    return "NVPTXISD::CallVoid";
916  case NVPTXISD::CallVal:
917    return "NVPTXISD::CallVal";
918  case NVPTXISD::CallSymbol:
919    return "NVPTXISD::CallSymbol";
920  case NVPTXISD::Prototype:
921    return "NVPTXISD::Prototype";
922  case NVPTXISD::MoveParam:
923    return "NVPTXISD::MoveParam";
924  case NVPTXISD::StoreRetval:
925    return "NVPTXISD::StoreRetval";
926  case NVPTXISD::StoreRetvalV2:
927    return "NVPTXISD::StoreRetvalV2";
928  case NVPTXISD::StoreRetvalV4:
929    return "NVPTXISD::StoreRetvalV4";
930  case NVPTXISD::PseudoUseParam:
931    return "NVPTXISD::PseudoUseParam";
932  case NVPTXISD::RETURN:
933    return "NVPTXISD::RETURN";
934  case NVPTXISD::CallSeqBegin:
935    return "NVPTXISD::CallSeqBegin";
936  case NVPTXISD::CallSeqEnd:
937    return "NVPTXISD::CallSeqEnd";
938  case NVPTXISD::CallPrototype:
939    return "NVPTXISD::CallPrototype";
940  case NVPTXISD::ProxyReg:
941    return "NVPTXISD::ProxyReg";
942  case NVPTXISD::LoadV2:
943    return "NVPTXISD::LoadV2";
944  case NVPTXISD::LoadV4:
945    return "NVPTXISD::LoadV4";
946  case NVPTXISD::LDGV2:
947    return "NVPTXISD::LDGV2";
948  case NVPTXISD::LDGV4:
949    return "NVPTXISD::LDGV4";
950  case NVPTXISD::LDUV2:
951    return "NVPTXISD::LDUV2";
952  case NVPTXISD::LDUV4:
953    return "NVPTXISD::LDUV4";
954  case NVPTXISD::StoreV2:
955    return "NVPTXISD::StoreV2";
956  case NVPTXISD::StoreV4:
957    return "NVPTXISD::StoreV4";
958  case NVPTXISD::FUN_SHFL_CLAMP:
959    return "NVPTXISD::FUN_SHFL_CLAMP";
960  case NVPTXISD::FUN_SHFR_CLAMP:
961    return "NVPTXISD::FUN_SHFR_CLAMP";
962  case NVPTXISD::IMAD:
963    return "NVPTXISD::IMAD";
964  case NVPTXISD::BFE:
965    return "NVPTXISD::BFE";
966  case NVPTXISD::BFI:
967    return "NVPTXISD::BFI";
968  case NVPTXISD::PRMT:
969    return "NVPTXISD::PRMT";
970  case NVPTXISD::SETP_F16X2:
971    return "NVPTXISD::SETP_F16X2";
972  case NVPTXISD::SETP_BF16X2:
973    return "NVPTXISD::SETP_BF16X2";
974  case NVPTXISD::Dummy:
975    return "NVPTXISD::Dummy";
976  case NVPTXISD::MUL_WIDE_SIGNED:
977    return "NVPTXISD::MUL_WIDE_SIGNED";
978  case NVPTXISD::MUL_WIDE_UNSIGNED:
979    return "NVPTXISD::MUL_WIDE_UNSIGNED";
980  case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
981  case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
982  case NVPTXISD::Tex1DFloatFloatLevel:
983    return "NVPTXISD::Tex1DFloatFloatLevel";
984  case NVPTXISD::Tex1DFloatFloatGrad:
985    return "NVPTXISD::Tex1DFloatFloatGrad";
986  case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
987  case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
988  case NVPTXISD::Tex1DS32FloatLevel:
989    return "NVPTXISD::Tex1DS32FloatLevel";
990  case NVPTXISD::Tex1DS32FloatGrad:
991    return "NVPTXISD::Tex1DS32FloatGrad";
992  case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
993  case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
994  case NVPTXISD::Tex1DU32FloatLevel:
995    return "NVPTXISD::Tex1DU32FloatLevel";
996  case NVPTXISD::Tex1DU32FloatGrad:
997    return "NVPTXISD::Tex1DU32FloatGrad";
998  case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
999  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
1000  case NVPTXISD::Tex1DArrayFloatFloatLevel:
1001    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
1002  case NVPTXISD::Tex1DArrayFloatFloatGrad:
1003    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
1004  case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
1005  case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
1006  case NVPTXISD::Tex1DArrayS32FloatLevel:
1007    return "NVPTXISD::Tex1DArrayS32FloatLevel";
1008  case NVPTXISD::Tex1DArrayS32FloatGrad:
1009    return "NVPTXISD::Tex1DArrayS32FloatGrad";
1010  case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
1011  case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
1012  case NVPTXISD::Tex1DArrayU32FloatLevel:
1013    return "NVPTXISD::Tex1DArrayU32FloatLevel";
1014  case NVPTXISD::Tex1DArrayU32FloatGrad:
1015    return "NVPTXISD::Tex1DArrayU32FloatGrad";
1016  case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
1017  case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
1018  case NVPTXISD::Tex2DFloatFloatLevel:
1019    return "NVPTXISD::Tex2DFloatFloatLevel";
1020  case NVPTXISD::Tex2DFloatFloatGrad:
1021    return "NVPTXISD::Tex2DFloatFloatGrad";
1022  case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
1023  case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
1024  case NVPTXISD::Tex2DS32FloatLevel:
1025    return "NVPTXISD::Tex2DS32FloatLevel";
1026  case NVPTXISD::Tex2DS32FloatGrad:
1027    return "NVPTXISD::Tex2DS32FloatGrad";
1028  case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
1029  case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
1030  case NVPTXISD::Tex2DU32FloatLevel:
1031    return "NVPTXISD::Tex2DU32FloatLevel";
1032  case NVPTXISD::Tex2DU32FloatGrad:
1033    return "NVPTXISD::Tex2DU32FloatGrad";
1034  case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
1035  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
1036  case NVPTXISD::Tex2DArrayFloatFloatLevel:
1037    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
1038  case NVPTXISD::Tex2DArrayFloatFloatGrad:
1039    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
1040  case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
1041  case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
1042  case NVPTXISD::Tex2DArrayS32FloatLevel:
1043    return "NVPTXISD::Tex2DArrayS32FloatLevel";
1044  case NVPTXISD::Tex2DArrayS32FloatGrad:
1045    return "NVPTXISD::Tex2DArrayS32FloatGrad";
1046  case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
1047  case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
1048  case NVPTXISD::Tex2DArrayU32FloatLevel:
1049    return "NVPTXISD::Tex2DArrayU32FloatLevel";
1050  case NVPTXISD::Tex2DArrayU32FloatGrad:
1051    return "NVPTXISD::Tex2DArrayU32FloatGrad";
1052  case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
1053  case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
1054  case NVPTXISD::Tex3DFloatFloatLevel:
1055    return "NVPTXISD::Tex3DFloatFloatLevel";
1056  case NVPTXISD::Tex3DFloatFloatGrad:
1057    return "NVPTXISD::Tex3DFloatFloatGrad";
1058  case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
1059  case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
1060  case NVPTXISD::Tex3DS32FloatLevel:
1061    return "NVPTXISD::Tex3DS32FloatLevel";
1062  case NVPTXISD::Tex3DS32FloatGrad:
1063    return "NVPTXISD::Tex3DS32FloatGrad";
1064  case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
1065  case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
1066  case NVPTXISD::Tex3DU32FloatLevel:
1067    return "NVPTXISD::Tex3DU32FloatLevel";
1068  case NVPTXISD::Tex3DU32FloatGrad:
1069    return "NVPTXISD::Tex3DU32FloatGrad";
1070  case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
1071  case NVPTXISD::TexCubeFloatFloatLevel:
1072    return "NVPTXISD::TexCubeFloatFloatLevel";
1073  case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
1074  case NVPTXISD::TexCubeS32FloatLevel:
1075    return "NVPTXISD::TexCubeS32FloatLevel";
1076  case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
1077  case NVPTXISD::TexCubeU32FloatLevel:
1078    return "NVPTXISD::TexCubeU32FloatLevel";
1079  case NVPTXISD::TexCubeArrayFloatFloat:
1080    return "NVPTXISD::TexCubeArrayFloatFloat";
1081  case NVPTXISD::TexCubeArrayFloatFloatLevel:
1082    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
1083  case NVPTXISD::TexCubeArrayS32Float:
1084    return "NVPTXISD::TexCubeArrayS32Float";
1085  case NVPTXISD::TexCubeArrayS32FloatLevel:
1086    return "NVPTXISD::TexCubeArrayS32FloatLevel";
1087  case NVPTXISD::TexCubeArrayU32Float:
1088    return "NVPTXISD::TexCubeArrayU32Float";
1089  case NVPTXISD::TexCubeArrayU32FloatLevel:
1090    return "NVPTXISD::TexCubeArrayU32FloatLevel";
1091  case NVPTXISD::Tld4R2DFloatFloat:
1092    return "NVPTXISD::Tld4R2DFloatFloat";
1093  case NVPTXISD::Tld4G2DFloatFloat:
1094    return "NVPTXISD::Tld4G2DFloatFloat";
1095  case NVPTXISD::Tld4B2DFloatFloat:
1096    return "NVPTXISD::Tld4B2DFloatFloat";
1097  case NVPTXISD::Tld4A2DFloatFloat:
1098    return "NVPTXISD::Tld4A2DFloatFloat";
1099  case NVPTXISD::Tld4R2DS64Float:
1100    return "NVPTXISD::Tld4R2DS64Float";
1101  case NVPTXISD::Tld4G2DS64Float:
1102    return "NVPTXISD::Tld4G2DS64Float";
1103  case NVPTXISD::Tld4B2DS64Float:
1104    return "NVPTXISD::Tld4B2DS64Float";
1105  case NVPTXISD::Tld4A2DS64Float:
1106    return "NVPTXISD::Tld4A2DS64Float";
1107  case NVPTXISD::Tld4R2DU64Float:
1108    return "NVPTXISD::Tld4R2DU64Float";
1109  case NVPTXISD::Tld4G2DU64Float:
1110    return "NVPTXISD::Tld4G2DU64Float";
1111  case NVPTXISD::Tld4B2DU64Float:
1112    return "NVPTXISD::Tld4B2DU64Float";
1113  case NVPTXISD::Tld4A2DU64Float:
1114    return "NVPTXISD::Tld4A2DU64Float";
1115
1116  case NVPTXISD::TexUnified1DFloatS32:
1117    return "NVPTXISD::TexUnified1DFloatS32";
1118  case NVPTXISD::TexUnified1DFloatFloat:
1119    return "NVPTXISD::TexUnified1DFloatFloat";
1120  case NVPTXISD::TexUnified1DFloatFloatLevel:
1121    return "NVPTXISD::TexUnified1DFloatFloatLevel";
1122  case NVPTXISD::TexUnified1DFloatFloatGrad:
1123    return "NVPTXISD::TexUnified1DFloatFloatGrad";
1124  case NVPTXISD::TexUnified1DS32S32:
1125    return "NVPTXISD::TexUnified1DS32S32";
1126  case NVPTXISD::TexUnified1DS32Float:
1127    return "NVPTXISD::TexUnified1DS32Float";
1128  case NVPTXISD::TexUnified1DS32FloatLevel:
1129    return "NVPTXISD::TexUnified1DS32FloatLevel";
1130  case NVPTXISD::TexUnified1DS32FloatGrad:
1131    return "NVPTXISD::TexUnified1DS32FloatGrad";
1132  case NVPTXISD::TexUnified1DU32S32:
1133    return "NVPTXISD::TexUnified1DU32S32";
1134  case NVPTXISD::TexUnified1DU32Float:
1135    return "NVPTXISD::TexUnified1DU32Float";
1136  case NVPTXISD::TexUnified1DU32FloatLevel:
1137    return "NVPTXISD::TexUnified1DU32FloatLevel";
1138  case NVPTXISD::TexUnified1DU32FloatGrad:
1139    return "NVPTXISD::TexUnified1DU32FloatGrad";
1140  case NVPTXISD::TexUnified1DArrayFloatS32:
1141    return "NVPTXISD::TexUnified1DArrayFloatS32";
1142  case NVPTXISD::TexUnified1DArrayFloatFloat:
1143    return "NVPTXISD::TexUnified1DArrayFloatFloat";
1144  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
1145    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
1146  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
1147    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
1148  case NVPTXISD::TexUnified1DArrayS32S32:
1149    return "NVPTXISD::TexUnified1DArrayS32S32";
1150  case NVPTXISD::TexUnified1DArrayS32Float:
1151    return "NVPTXISD::TexUnified1DArrayS32Float";
1152  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
1153    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
1154  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
1155    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
1156  case NVPTXISD::TexUnified1DArrayU32S32:
1157    return "NVPTXISD::TexUnified1DArrayU32S32";
1158  case NVPTXISD::TexUnified1DArrayU32Float:
1159    return "NVPTXISD::TexUnified1DArrayU32Float";
1160  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
1161    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
1162  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
1163    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
1164  case NVPTXISD::TexUnified2DFloatS32:
1165    return "NVPTXISD::TexUnified2DFloatS32";
1166  case NVPTXISD::TexUnified2DFloatFloat:
1167    return "NVPTXISD::TexUnified2DFloatFloat";
1168  case NVPTXISD::TexUnified2DFloatFloatLevel:
1169    return "NVPTXISD::TexUnified2DFloatFloatLevel";
1170  case NVPTXISD::TexUnified2DFloatFloatGrad:
1171    return "NVPTXISD::TexUnified2DFloatFloatGrad";
1172  case NVPTXISD::TexUnified2DS32S32:
1173    return "NVPTXISD::TexUnified2DS32S32";
1174  case NVPTXISD::TexUnified2DS32Float:
1175    return "NVPTXISD::TexUnified2DS32Float";
1176  case NVPTXISD::TexUnified2DS32FloatLevel:
1177    return "NVPTXISD::TexUnified2DS32FloatLevel";
1178  case NVPTXISD::TexUnified2DS32FloatGrad:
1179    return "NVPTXISD::TexUnified2DS32FloatGrad";
1180  case NVPTXISD::TexUnified2DU32S32:
1181    return "NVPTXISD::TexUnified2DU32S32";
1182  case NVPTXISD::TexUnified2DU32Float:
1183    return "NVPTXISD::TexUnified2DU32Float";
1184  case NVPTXISD::TexUnified2DU32FloatLevel:
1185    return "NVPTXISD::TexUnified2DU32FloatLevel";
1186  case NVPTXISD::TexUnified2DU32FloatGrad:
1187    return "NVPTXISD::TexUnified2DU32FloatGrad";
1188  case NVPTXISD::TexUnified2DArrayFloatS32:
1189    return "NVPTXISD::TexUnified2DArrayFloatS32";
1190  case NVPTXISD::TexUnified2DArrayFloatFloat:
1191    return "NVPTXISD::TexUnified2DArrayFloatFloat";
1192  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
1193    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
1194  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
1195    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
1196  case NVPTXISD::TexUnified2DArrayS32S32:
1197    return "NVPTXISD::TexUnified2DArrayS32S32";
1198  case NVPTXISD::TexUnified2DArrayS32Float:
1199    return "NVPTXISD::TexUnified2DArrayS32Float";
1200  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
1201    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
1202  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
1203    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
1204  case NVPTXISD::TexUnified2DArrayU32S32:
1205    return "NVPTXISD::TexUnified2DArrayU32S32";
1206  case NVPTXISD::TexUnified2DArrayU32Float:
1207    return "NVPTXISD::TexUnified2DArrayU32Float";
1208  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
1209    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
1210  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
1211    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
1212  case NVPTXISD::TexUnified3DFloatS32:
1213    return "NVPTXISD::TexUnified3DFloatS32";
1214  case NVPTXISD::TexUnified3DFloatFloat:
1215    return "NVPTXISD::TexUnified3DFloatFloat";
1216  case NVPTXISD::TexUnified3DFloatFloatLevel:
1217    return "NVPTXISD::TexUnified3DFloatFloatLevel";
1218  case NVPTXISD::TexUnified3DFloatFloatGrad:
1219    return "NVPTXISD::TexUnified3DFloatFloatGrad";
1220  case NVPTXISD::TexUnified3DS32S32:
1221    return "NVPTXISD::TexUnified3DS32S32";
1222  case NVPTXISD::TexUnified3DS32Float:
1223    return "NVPTXISD::TexUnified3DS32Float";
1224  case NVPTXISD::TexUnified3DS32FloatLevel:
1225    return "NVPTXISD::TexUnified3DS32FloatLevel";
1226  case NVPTXISD::TexUnified3DS32FloatGrad:
1227    return "NVPTXISD::TexUnified3DS32FloatGrad";
1228  case NVPTXISD::TexUnified3DU32S32:
1229    return "NVPTXISD::TexUnified3DU32S32";
1230  case NVPTXISD::TexUnified3DU32Float:
1231    return "NVPTXISD::TexUnified3DU32Float";
1232  case NVPTXISD::TexUnified3DU32FloatLevel:
1233    return "NVPTXISD::TexUnified3DU32FloatLevel";
1234  case NVPTXISD::TexUnified3DU32FloatGrad:
1235    return "NVPTXISD::TexUnified3DU32FloatGrad";
1236  case NVPTXISD::TexUnifiedCubeFloatFloat:
1237    return "NVPTXISD::TexUnifiedCubeFloatFloat";
1238  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
1239    return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
1240  case NVPTXISD::TexUnifiedCubeS32Float:
1241    return "NVPTXISD::TexUnifiedCubeS32Float";
1242  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
1243    return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
1244  case NVPTXISD::TexUnifiedCubeU32Float:
1245    return "NVPTXISD::TexUnifiedCubeU32Float";
1246  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
1247    return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
1248  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
1249    return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
1250  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
1251    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
1252  case NVPTXISD::TexUnifiedCubeArrayS32Float:
1253    return "NVPTXISD::TexUnifiedCubeArrayS32Float";
1254  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
1255    return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
1256  case NVPTXISD::TexUnifiedCubeArrayU32Float:
1257    return "NVPTXISD::TexUnifiedCubeArrayU32Float";
1258  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
1259    return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
1260  case NVPTXISD::TexUnifiedCubeFloatFloatGrad:
1261    return "NVPTXISD::TexUnifiedCubeFloatFloatGrad";
1262  case NVPTXISD::TexUnifiedCubeS32FloatGrad:
1263    return "NVPTXISD::TexUnifiedCubeS32FloatGrad";
1264  case NVPTXISD::TexUnifiedCubeU32FloatGrad:
1265    return "NVPTXISD::TexUnifiedCubeU32FloatGrad";
1266  case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad:
1267    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad";
1268  case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad:
1269    return "NVPTXISD::TexUnifiedCubeArrayS32FloatGrad";
1270  case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad:
1271    return "NVPTXISD::TexUnifiedCubeArrayU32FloatGrad";
1272  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
1273    return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
1274  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
1275    return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
1276  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
1277    return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
1278  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
1279    return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
1280  case NVPTXISD::Tld4UnifiedR2DS64Float:
1281    return "NVPTXISD::Tld4UnifiedR2DS64Float";
1282  case NVPTXISD::Tld4UnifiedG2DS64Float:
1283    return "NVPTXISD::Tld4UnifiedG2DS64Float";
1284  case NVPTXISD::Tld4UnifiedB2DS64Float:
1285    return "NVPTXISD::Tld4UnifiedB2DS64Float";
1286  case NVPTXISD::Tld4UnifiedA2DS64Float:
1287    return "NVPTXISD::Tld4UnifiedA2DS64Float";
1288  case NVPTXISD::Tld4UnifiedR2DU64Float:
1289    return "NVPTXISD::Tld4UnifiedR2DU64Float";
1290  case NVPTXISD::Tld4UnifiedG2DU64Float:
1291    return "NVPTXISD::Tld4UnifiedG2DU64Float";
1292  case NVPTXISD::Tld4UnifiedB2DU64Float:
1293    return "NVPTXISD::Tld4UnifiedB2DU64Float";
1294  case NVPTXISD::Tld4UnifiedA2DU64Float:
1295    return "NVPTXISD::Tld4UnifiedA2DU64Float";
1296
1297  case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
1298  case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
1299  case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
1300  case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
1301  case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
1302  case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
1303  case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
1304  case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
1305  case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
1306  case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
1307  case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";
1308
1309  case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
1310  case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
1311  case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
1312  case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
1313  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1314  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1315  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1316  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1317  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1318  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1319  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1320
1321  case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
1322  case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
1323  case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
1324  case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
1325  case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
1326  case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
1327  case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
1328  case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
1329  case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
1330  case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
1331  case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";
1332
1333  case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
1334  case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
1335  case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
1336  case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
1337  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1338  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1339  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1340  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1341  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1342  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1343  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1344
1345  case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
1346  case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
1347  case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
1348  case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
1349  case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
1350  case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
1351  case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
1352  case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
1353  case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
1354  case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
1355  case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";
1356
1357  case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
1358  case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
1359  case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
1360  case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
1361  case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
1362  case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
1363  case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
1364  case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
1365  case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
1366  case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
1367  case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
1368
1369  case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
1370  case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
1371  case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
1372  case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
1373  case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
1374  case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
1375  case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
1376  case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
1377  case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
1378  case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
1379  case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
1380
1381  case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
1382  case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
1383  case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
1384  case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
1385  case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
1386  case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
1387  case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
1388  case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
1389  case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
1390  case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
1391  case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
1392
1393  case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
1394  case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
1395  case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
1396  case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
1397  case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
1398  case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
1399  case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
1400  case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
1401  case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
1402  case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
1403  case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
1404
1405  case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
1406  case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
1407  case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
1408  case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
1409  case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
1410  case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
1411  case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
1412  case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
1413  case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
1414  case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
1415  case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
1416
1417  case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
1418  case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
1419  case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
1420  case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
1421  case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
1422  case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
1423  case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
1424  case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
1425  case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
1426  case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
1427  case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";
1428
1429  case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
1430  case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
1431  case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
1432  case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
1433  case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
1434  case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
1435  case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
1436  case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
1437  case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
1438  case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
1439  case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";
1440
1441  case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
1442  case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
1443  case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
1444  case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
1445  case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
1446  case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
1447  case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
1448  case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
1449  case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
1450  case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
1451  case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";
1452
1453  case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
1454  case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
1455  case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
1456  case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
1457  case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
1458  case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
1459  case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
1460  case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
1461  case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
1462  case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
1463  case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";
1464
1465  case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
1466  case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
1467  case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
1468  case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
1469  case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
1470  case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
1471  case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
1472  case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
1473  case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
1474  case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
1475  case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
1476  }
1477  return nullptr;
1478}
1479
1480TargetLoweringBase::LegalizeTypeAction
1481NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1482  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1483      VT.getScalarType() == MVT::i1)
1484    return TypeSplitVector;
1485  if (Isv2x16VT(VT))
1486    return TypeLegal;
1487  return TargetLoweringBase::getPreferredVectorAction(VT);
1488}
1489
1490SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1491                                             int Enabled, int &ExtraSteps,
1492                                             bool &UseOneConst,
1493                                             bool Reciprocal) const {
1494  if (!(Enabled == ReciprocalEstimate::Enabled ||
1495        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1496    return SDValue();
1497
1498  if (ExtraSteps == ReciprocalEstimate::Unspecified)
1499    ExtraSteps = 0;
1500
1501  SDLoc DL(Operand);
1502  EVT VT = Operand.getValueType();
1503  bool Ftz = useF32FTZ(DAG.getMachineFunction());
1504
1505  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1506    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1507                       DAG.getConstant(IID, DL, MVT::i32), Operand);
1508  };
1509
1510  // The sqrt and rsqrt refinement processes assume we always start out with an
1511  // approximation of the rsqrt.  Therefore, if we're going to do any refinement
1512  // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
1513  // any refinement, we must return a regular sqrt.
1514  if (Reciprocal || ExtraSteps > 0) {
1515    if (VT == MVT::f32)
1516      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1517                                   : Intrinsic::nvvm_rsqrt_approx_f);
1518    else if (VT == MVT::f64)
1519      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1520    else
1521      return SDValue();
1522  } else {
1523    if (VT == MVT::f32)
1524      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1525                                   : Intrinsic::nvvm_sqrt_approx_f);
1526    else {
1527      // There's no sqrt.approx.f64 instruction, so we emit
1528      // reciprocal(rsqrt(x)).  This is faster than
1529      // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
1530      // x * rsqrt(x).)
1531      return DAG.getNode(
1532          ISD::INTRINSIC_WO_CHAIN, DL, VT,
1533          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1534          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1535    }
1536  }
1537}
1538
1539SDValue
1540NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1541  SDLoc dl(Op);
1542  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1543  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1544  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1545  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1546}
1547
1548static bool IsTypePassedAsArray(const Type *Ty) {
1549  return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1550         Ty->isHalfTy() || Ty->isBFloatTy();
1551}
1552
1553std::string NVPTXTargetLowering::getPrototype(
1554    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1555    const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1556    std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1557    const CallBase &CB, unsigned UniqueCallSite) const {
1558  auto PtrVT = getPointerTy(DL);
1559
1560  bool isABI = (STI.getSmVersion() >= 20);
1561  assert(isABI && "Non-ABI compilation is not supported");
1562  if (!isABI)
1563    return "";
1564
1565  std::string Prototype;
1566  raw_string_ostream O(Prototype);
1567  O << "prototype_" << UniqueCallSite << " : .callprototype ";
1568
1569  if (retTy->getTypeID() == Type::VoidTyID) {
1570    O << "()";
1571  } else {
1572    O << "(";
1573    if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1574        !IsTypePassedAsArray(retTy)) {
1575      unsigned size = 0;
1576      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1577        size = ITy->getBitWidth();
1578      } else {
1579        assert(retTy->isFloatingPointTy() &&
1580               "Floating point type expected here");
1581        size = retTy->getPrimitiveSizeInBits();
1582      }
1583      // PTX ABI requires all scalar return values to be at least 32
1584      // bits in size.  fp16 normally uses .b16 as its storage type in
1585      // PTX, so its size must be adjusted here, too.
1586      size = promoteScalarArgumentSize(size);
1587
1588      O << ".param .b" << size << " _";
1589    } else if (isa<PointerType>(retTy)) {
1590      O << ".param .b" << PtrVT.getSizeInBits() << " _";
1591    } else if (IsTypePassedAsArray(retTy)) {
1592      O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1593        << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1594    } else {
1595      llvm_unreachable("Unknown return type");
1596    }
1597    O << ") ";
1598  }
1599  O << "_ (";
1600
1601  bool first = true;
1602
1603  const Function *F = CB.getFunction();
1604  unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1605  for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1606    Type *Ty = Args[i].Ty;
1607    if (!first) {
1608      O << ", ";
1609    }
1610    first = false;
1611
1612    if (!Outs[OIdx].Flags.isByVal()) {
1613      if (IsTypePassedAsArray(Ty)) {
1614        unsigned ParamAlign = 0;
1615        const CallInst *CallI = cast<CallInst>(&CB);
1616        // +1 because index 0 is reserved for return type alignment
1617        if (!getAlign(*CallI, i + 1, ParamAlign))
1618          ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
1619        O << ".param .align " << ParamAlign << " .b8 ";
1620        O << "_";
1621        O << "[" << DL.getTypeAllocSize(Ty) << "]";
1622        // update the index for Outs
1623        SmallVector<EVT, 16> vtparts;
1624        ComputeValueVTs(*this, DL, Ty, vtparts);
1625        if (unsigned len = vtparts.size())
1626          OIdx += len - 1;
1627        continue;
1628      }
1629      // i8 types in IR will be i16 types in SDAG
1630      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1631              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1632             "type mismatch between callee prototype and arguments");
1633      // scalar type
1634      unsigned sz = 0;
1635      if (isa<IntegerType>(Ty)) {
1636        sz = cast<IntegerType>(Ty)->getBitWidth();
1637        sz = promoteScalarArgumentSize(sz);
1638      } else if (isa<PointerType>(Ty)) {
1639        sz = PtrVT.getSizeInBits();
1640      } else {
1641        sz = Ty->getPrimitiveSizeInBits();
1642      }
1643      O << ".param .b" << sz << " ";
1644      O << "_";
1645      continue;
1646    }
1647
1648    Type *ETy = Args[i].IndirectType;
1649    Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1650    Align ParamByValAlign =
1651        getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
1652
1653    O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1654    O << "_";
1655    O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1656  }
1657
1658  if (VAInfo)
1659    O << (first ? "" : ",") << " .param .align " << VAInfo->second
1660      << " .b8 _[]\n";
1661  O << ")";
1662  if (shouldEmitPTXNoReturn(&CB, *nvTM))
1663    O << " .noreturn";
1664  O << ";";
1665
1666  return Prototype;
1667}
1668
1669Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1670                                                unsigned Idx,
1671                                                const DataLayout &DL) const {
1672  if (!CB) {
1673    // CallSite is zero, fallback to ABI type alignment
1674    return DL.getABITypeAlign(Ty);
1675  }
1676
1677  unsigned Alignment = 0;
1678  const Function *DirectCallee = CB->getCalledFunction();
1679
1680  if (!DirectCallee) {
1681    // We don't have a direct function symbol, but that may be because of
1682    // constant cast instructions in the call.
1683
1684    // With bitcast'd call targets, the instruction will be the call
1685    if (const auto *CI = dyn_cast<CallInst>(CB)) {
1686      // Check if we have call alignment metadata
1687      if (getAlign(*CI, Idx, Alignment))
1688        return Align(Alignment);
1689    }
1690    DirectCallee = getMaybeBitcastedCallee(CB);
1691  }
1692
1693  // Check for function alignment information if we found that the
1694  // ultimate target is a Function
1695  if (DirectCallee) {
1696    if (getAlign(*DirectCallee, Idx, Alignment))
1697      return Align(Alignment);
1698    // If alignment information is not available, fall back to the
1699    // default function param optimized type alignment
1700    return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
1701  }
1702
1703  // Call is indirect, fall back to the ABI type alignment
1704  return DL.getABITypeAlign(Ty);
1705}
1706
1707SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1708                                       SmallVectorImpl<SDValue> &InVals) const {
1709
1710  if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1711    report_fatal_error(
1712        "Support for variadic functions (unsized array parameter) introduced "
1713        "in PTX ISA version 6.0 and requires target sm_30.");
1714
1715  SelectionDAG &DAG = CLI.DAG;
1716  SDLoc dl = CLI.DL;
1717  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1718  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1719  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1720  SDValue Chain = CLI.Chain;
1721  SDValue Callee = CLI.Callee;
1722  bool &isTailCall = CLI.IsTailCall;
1723  ArgListTy &Args = CLI.getArgs();
1724  Type *RetTy = CLI.RetTy;
1725  const CallBase *CB = CLI.CB;
1726  const DataLayout &DL = DAG.getDataLayout();
1727
1728  bool isABI = (STI.getSmVersion() >= 20);
1729  assert(isABI && "Non-ABI compilation is not supported");
1730  if (!isABI)
1731    return Chain;
1732
1733  // Variadic arguments.
1734  //
1735  // Normally, for each argument, we declare a param scalar or a param
1736  // byte array in the .param space, and store the argument value to that
1737  // param scalar or array starting at offset 0.
1738  //
1739  // In the case of the first variadic argument, we declare a vararg byte array
1740  // with size 0. The exact size of this array isn't known at this point, so
1741  // it'll be patched later. All the variadic arguments will be stored to this
1742  // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1743  // initially set to 0, so it can be used for non-variadic arguments (which use
1744  // 0 offset) to simplify the code.
1745  //
1746  // After all vararg is processed, 'VAOffset' holds the size of the
1747  // vararg byte array.
1748
1749  SDValue VADeclareParam;                 // vararg byte array
1750  unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1751  unsigned VAOffset = 0;                  // current offset in the param array
1752
1753  unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1754  SDValue TempChain = Chain;
1755  Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1756  SDValue InGlue = Chain.getValue(1);
1757
1758  unsigned ParamCount = 0;
1759  // Args.size() and Outs.size() need not match.
1760  // Outs.size() will be larger
1761  //   * if there is an aggregate argument with multiple fields (each field
1762  //     showing up separately in Outs)
1763  //   * if there is a vector argument with more than typical vector-length
1764  //     elements (generally if more than 4) where each vector element is
1765  //     individually present in Outs.
1766  // So a different index should be used for indexing into Outs/OutVals.
1767  // See similar issue in LowerFormalArguments.
1768  unsigned OIdx = 0;
1769  // Declare the .params or .reg need to pass values
1770  // to the function
1771  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1772    EVT VT = Outs[OIdx].VT;
1773    Type *Ty = Args[i].Ty;
1774    bool IsVAArg = (i >= CLI.NumFixedArgs);
1775    bool IsByVal = Outs[OIdx].Flags.isByVal();
1776
1777    SmallVector<EVT, 16> VTs;
1778    SmallVector<uint64_t, 16> Offsets;
1779
1780    assert((!IsByVal || Args[i].IndirectType) &&
1781           "byval arg must have indirect type");
1782    Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1783    ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1784
1785    Align ArgAlign;
1786    if (IsByVal) {
1787      // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1788      // so we don't need to worry whether it's naturally aligned or not.
1789      // See TargetLowering::LowerCallTo().
1790      Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1791      ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1792                                            InitialAlign, DL);
1793      if (IsVAArg)
1794        VAOffset = alignTo(VAOffset, ArgAlign);
1795    } else {
1796      ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);
1797    }
1798
1799    unsigned TypeSize =
1800        (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1801    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1802
1803    bool NeedAlign; // Does argument declaration specify alignment?
1804    bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1805    if (IsVAArg) {
1806      if (ParamCount == FirstVAArg) {
1807        SDValue DeclareParamOps[] = {
1808            Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1809            DAG.getConstant(ParamCount, dl, MVT::i32),
1810            DAG.getConstant(1, dl, MVT::i32), InGlue};
1811        VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1812                                             DeclareParamVTs, DeclareParamOps);
1813      }
1814      NeedAlign = PassAsArray;
1815    } else if (PassAsArray) {
1816      // declare .param .align <align> .b8 .param<n>[<size>];
1817      SDValue DeclareParamOps[] = {
1818          Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1819          DAG.getConstant(ParamCount, dl, MVT::i32),
1820          DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1821      Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1822                          DeclareParamOps);
1823      NeedAlign = true;
1824    } else {
1825      // declare .param .b<size> .param<n>;
1826      if (VT.isInteger() || VT.isFloatingPoint()) {
1827        // PTX ABI requires integral types to be at least 32 bits in
1828        // size. FP16 is loaded/stored using i16, so it's handled
1829        // here as well.
1830        TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
1831      }
1832      SDValue DeclareScalarParamOps[] = {
1833          Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1834          DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1835          DAG.getConstant(0, dl, MVT::i32), InGlue};
1836      Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1837                          DeclareScalarParamOps);
1838      NeedAlign = false;
1839    }
1840    InGlue = Chain.getValue(1);
1841
1842    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1843    // than 32-bits are sign extended or zero extended, depending on
1844    // whether they are signed or unsigned types. This case applies
1845    // only to scalar parameters and not to aggregate values.
1846    bool ExtendIntegerParam =
1847        Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1848
1849    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1850    SmallVector<SDValue, 6> StoreOperands;
1851    for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1852      EVT EltVT = VTs[j];
1853      int CurOffset = Offsets[j];
1854      MaybeAlign PartAlign;
1855      if (NeedAlign)
1856        PartAlign = commonAlignment(ArgAlign, CurOffset);
1857
1858      // New store.
1859      if (VectorInfo[j] & PVF_FIRST) {
1860        assert(StoreOperands.empty() && "Unfinished preceding store.");
1861        StoreOperands.push_back(Chain);
1862        StoreOperands.push_back(
1863            DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1864        StoreOperands.push_back(DAG.getConstant(
1865            IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1866            dl, MVT::i32));
1867      }
1868
1869      SDValue StVal = OutVals[OIdx];
1870
1871      MVT PromotedVT;
1872      if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1873        EltVT = EVT(PromotedVT);
1874      }
1875      if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1876        llvm::ISD::NodeType Ext =
1877            Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1878        StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1879      }
1880
1881      if (IsByVal) {
1882        auto PtrVT = getPointerTy(DL);
1883        SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1884                                      DAG.getConstant(CurOffset, dl, PtrVT));
1885        StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1886                            PartAlign);
1887      } else if (ExtendIntegerParam) {
1888        assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1889        // zext/sext to i32
1890        StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1891                                                      : ISD::ZERO_EXTEND,
1892                            dl, MVT::i32, StVal);
1893      }
1894
1895      if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1896        // Use 16-bit registers for small stores as it's the
1897        // smallest general purpose register size supported by NVPTX.
1898        StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1899      }
1900
1901      // Record the value to store.
1902      StoreOperands.push_back(StVal);
1903
1904      if (VectorInfo[j] & PVF_LAST) {
1905        unsigned NumElts = StoreOperands.size() - 3;
1906        NVPTXISD::NodeType Op;
1907        switch (NumElts) {
1908        case 1:
1909          Op = NVPTXISD::StoreParam;
1910          break;
1911        case 2:
1912          Op = NVPTXISD::StoreParamV2;
1913          break;
1914        case 4:
1915          Op = NVPTXISD::StoreParamV4;
1916          break;
1917        default:
1918          llvm_unreachable("Invalid vector info.");
1919        }
1920
1921        StoreOperands.push_back(InGlue);
1922
1923        // Adjust type of the store op if we've extended the scalar
1924        // return value.
1925        EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1926
1927        Chain = DAG.getMemIntrinsicNode(
1928            Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1929            TheStoreType, MachinePointerInfo(), PartAlign,
1930            MachineMemOperand::MOStore);
1931        InGlue = Chain.getValue(1);
1932
1933        // Cleanup.
1934        StoreOperands.clear();
1935
1936        // TODO: We may need to support vector types that can be passed
1937        // as scalars in variadic arguments.
1938        if (!IsByVal && IsVAArg) {
1939          assert(NumElts == 1 &&
1940                 "Vectorization is expected to be disabled for variadics.");
1941          VAOffset += DL.getTypeAllocSize(
1942              TheStoreType.getTypeForEVT(*DAG.getContext()));
1943        }
1944      }
1945      if (!IsByVal)
1946        ++OIdx;
1947    }
1948    assert(StoreOperands.empty() && "Unfinished parameter store.");
1949    if (!IsByVal && VTs.size() > 0)
1950      --OIdx;
1951    ++ParamCount;
1952    if (IsByVal && IsVAArg)
1953      VAOffset += TypeSize;
1954  }
1955
1956  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1957  MaybeAlign retAlignment = std::nullopt;
1958
1959  // Handle Result
1960  if (Ins.size() > 0) {
1961    SmallVector<EVT, 16> resvtparts;
1962    ComputeValueVTs(*this, DL, RetTy, resvtparts);
1963
1964    // Declare
1965    //  .param .align N .b8 retval0[<size-in-bytes>], or
1966    //  .param .b<size-in-bits> retval0
1967    unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1968    if (!IsTypePassedAsArray(RetTy)) {
1969      resultsz = promoteScalarArgumentSize(resultsz);
1970      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1971      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1972                                  DAG.getConstant(resultsz, dl, MVT::i32),
1973                                  DAG.getConstant(0, dl, MVT::i32), InGlue };
1974      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1975                          DeclareRetOps);
1976      InGlue = Chain.getValue(1);
1977    } else {
1978      retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
1979      assert(retAlignment && "retAlignment is guaranteed to be set");
1980      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1981      SDValue DeclareRetOps[] = {
1982          Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1983          DAG.getConstant(resultsz / 8, dl, MVT::i32),
1984          DAG.getConstant(0, dl, MVT::i32), InGlue};
1985      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1986                          DeclareRetOps);
1987      InGlue = Chain.getValue(1);
1988    }
1989  }
1990
1991  bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1992  // Set the size of the vararg param byte array if the callee is a variadic
1993  // function and the variadic part is not empty.
1994  if (HasVAArgs) {
1995    SDValue DeclareParamOps[] = {
1996        VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1997        VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1998        VADeclareParam.getOperand(4)};
1999    DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
2000                    VADeclareParam->getVTList(), DeclareParamOps);
2001  }
2002
2003  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
2004  // between them we must rely on the call site value which is valid for
2005  // indirect calls but is always null for libcalls.
2006  bool isIndirectCall = !Func && CB;
2007
2008  if (isa<ExternalSymbolSDNode>(Callee)) {
2009    Function* CalleeFunc = nullptr;
2010
2011    // Try to find the callee in the current module.
2012    Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
2013    assert(CalleeFunc != nullptr && "Libcall callee must be set.");
2014
2015    // Set the "libcall callee" attribute to indicate that the function
2016    // must always have a declaration.
2017    CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
2018  }
2019
2020  if (isIndirectCall) {
2021    // This is indirect function call case : PTX requires a prototype of the
2022    // form
2023    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
2024    // to be emitted, and the label has to used as the last arg of call
2025    // instruction.
2026    // The prototype is embedded in a string and put as the operand for a
2027    // CallPrototype SDNode which will print out to the value of the string.
2028    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2029    std::string Proto = getPrototype(
2030        DL, RetTy, Args, Outs, retAlignment,
2031        HasVAArgs
2032            ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
2033                  CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
2034            : std::nullopt,
2035        *CB, UniqueCallSite);
2036    const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
2037    SDValue ProtoOps[] = {
2038        Chain,
2039        DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
2040        InGlue,
2041    };
2042    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
2043    InGlue = Chain.getValue(1);
2044  }
2045  // Op to just print "call"
2046  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2047  SDValue PrintCallOps[] = {
2048    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
2049  };
2050  // We model convergent calls as separate opcodes.
2051  unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
2052  if (CLI.IsConvergent)
2053    Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
2054                                              : NVPTXISD::PrintConvergentCall;
2055  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
2056  InGlue = Chain.getValue(1);
2057
2058  // Ops to print out the function name
2059  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2060  SDValue CallVoidOps[] = { Chain, Callee, InGlue };
2061  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
2062  InGlue = Chain.getValue(1);
2063
2064  // Ops to print out the param list
2065  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2066  SDValue CallArgBeginOps[] = { Chain, InGlue };
2067  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
2068                      CallArgBeginOps);
2069  InGlue = Chain.getValue(1);
2070
2071  for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
2072       ++i) {
2073    unsigned opcode;
2074    if (i == (e - 1))
2075      opcode = NVPTXISD::LastCallArg;
2076    else
2077      opcode = NVPTXISD::CallArg;
2078    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2079    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
2080                             DAG.getConstant(i, dl, MVT::i32), InGlue };
2081    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
2082    InGlue = Chain.getValue(1);
2083  }
2084  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2085  SDValue CallArgEndOps[] = { Chain,
2086                              DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
2087                              InGlue };
2088  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
2089  InGlue = Chain.getValue(1);
2090
2091  if (isIndirectCall) {
2092    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2093    SDValue PrototypeOps[] = {
2094        Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
2095    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2096    InGlue = Chain.getValue(1);
2097  }
2098
2099  SmallVector<SDValue, 16> ProxyRegOps;
2100  SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2101
2102  // Generate loads from param memory/moves from registers for result
2103  if (Ins.size() > 0) {
2104    SmallVector<EVT, 16> VTs;
2105    SmallVector<uint64_t, 16> Offsets;
2106    ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
2107    assert(VTs.size() == Ins.size() && "Bad value decomposition");
2108
2109    Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
2110    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
2111
2112    SmallVector<EVT, 6> LoadVTs;
2113    int VecIdx = -1; // Index of the first element of the vector.
2114
2115    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2116    // 32-bits are sign extended or zero extended, depending on whether
2117    // they are signed or unsigned types.
2118    bool ExtendIntegerRetVal =
2119        RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2120
2121    for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2122      bool needTruncate = false;
2123      EVT TheLoadType = VTs[i];
2124      EVT EltType = Ins[i].VT;
2125      Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
2126      MVT PromotedVT;
2127
2128      if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
2129        TheLoadType = EVT(PromotedVT);
2130        EltType = EVT(PromotedVT);
2131        needTruncate = true;
2132      }
2133
2134      if (ExtendIntegerRetVal) {
2135        TheLoadType = MVT::i32;
2136        EltType = MVT::i32;
2137        needTruncate = true;
2138      } else if (TheLoadType.getSizeInBits() < 16) {
2139        if (VTs[i].isInteger())
2140          needTruncate = true;
2141        EltType = MVT::i16;
2142      }
2143
2144      // Record index of the very first element of the vector.
2145      if (VectorInfo[i] & PVF_FIRST) {
2146        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2147        VecIdx = i;
2148      }
2149
2150      LoadVTs.push_back(EltType);
2151
2152      if (VectorInfo[i] & PVF_LAST) {
2153        unsigned NumElts = LoadVTs.size();
2154        LoadVTs.push_back(MVT::Other);
2155        LoadVTs.push_back(MVT::Glue);
2156        NVPTXISD::NodeType Op;
2157        switch (NumElts) {
2158        case 1:
2159          Op = NVPTXISD::LoadParam;
2160          break;
2161        case 2:
2162          Op = NVPTXISD::LoadParamV2;
2163          break;
2164        case 4:
2165          Op = NVPTXISD::LoadParamV4;
2166          break;
2167        default:
2168          llvm_unreachable("Invalid vector info.");
2169        }
2170
2171        SDValue LoadOperands[] = {
2172            Chain, DAG.getConstant(1, dl, MVT::i32),
2173            DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
2174        SDValue RetVal = DAG.getMemIntrinsicNode(
2175            Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
2176            MachinePointerInfo(), EltAlign,
2177            MachineMemOperand::MOLoad);
2178
2179        for (unsigned j = 0; j < NumElts; ++j) {
2180          ProxyRegOps.push_back(RetVal.getValue(j));
2181
2182          if (needTruncate)
2183            ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
2184          else
2185            ProxyRegTruncates.push_back(std::optional<MVT>());
2186        }
2187
2188        Chain = RetVal.getValue(NumElts);
2189        InGlue = RetVal.getValue(NumElts + 1);
2190
2191        // Cleanup
2192        VecIdx = -1;
2193        LoadVTs.clear();
2194      }
2195    }
2196  }
2197
2198  Chain =
2199      DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
2200  InGlue = Chain.getValue(1);
2201
2202  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
2203  // will not get lost. Otherwise, during libcalls expansion, the nodes can become
2204  // dangling.
2205  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
2206    SDValue Ret = DAG.getNode(
2207      NVPTXISD::ProxyReg, dl,
2208      DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
2209      { Chain, ProxyRegOps[i], InGlue }
2210    );
2211
2212    Chain = Ret.getValue(1);
2213    InGlue = Ret.getValue(2);
2214
2215    if (ProxyRegTruncates[i]) {
2216      Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
2217    }
2218
2219    InVals.push_back(Ret);
2220  }
2221
2222  // set isTailCall to false for now, until we figure out how to express
2223  // tail call optimization in PTX
2224  isTailCall = false;
2225  return Chain;
2226}
2227
2228SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
2229                                                     SelectionDAG &DAG) const {
2230  const Function &Fn = DAG.getMachineFunction().getFunction();
2231
2232  DiagnosticInfoUnsupported NoDynamicAlloca(
2233      Fn, "dynamic alloca unsupported by NVPTX backend",
2234      SDLoc(Op).getDebugLoc());
2235  DAG.getContext()->diagnose(NoDynamicAlloca);
2236  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
2237  return DAG.getMergeValues(Ops, SDLoc());
2238}
2239
2240// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2241// (see LegalizeDAG.cpp). This is slow and uses local memory.
2242// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
2243SDValue
2244NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2245  SDNode *Node = Op.getNode();
2246  SDLoc dl(Node);
2247  SmallVector<SDValue, 8> Ops;
2248  unsigned NumOperands = Node->getNumOperands();
2249  for (unsigned i = 0; i < NumOperands; ++i) {
2250    SDValue SubOp = Node->getOperand(i);
2251    EVT VVT = SubOp.getNode()->getValueType(0);
2252    EVT EltVT = VVT.getVectorElementType();
2253    unsigned NumSubElem = VVT.getVectorNumElements();
2254    for (unsigned j = 0; j < NumSubElem; ++j) {
2255      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2256                                DAG.getIntPtrConstant(j, dl)));
2257    }
2258  }
2259  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2260}
2261
2262// We can init constant f16x2/v2i16/v4i8 with a single .b32 move.  Normally it
2263// would get lowered as two constant loads and vector-packing move.
2264// Instead we want just a constant move:
2265//        mov.b32         %r2, 0x40003C00
2266SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2267                                               SelectionDAG &DAG) const {
2268  EVT VT = Op->getValueType(0);
2269  if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2270    return Op;
2271
2272  SDLoc DL(Op);
2273
2274  if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2275        return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2276               isa<ConstantFPSDNode>(Operand);
2277      })) {
2278    // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2279    // to optimize calculation of constant parts.
2280    if (VT == MVT::v4i8) {
2281      SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
2282      SDValue E01 = DAG.getNode(
2283          NVPTXISD::BFI, DL, MVT::i32,
2284          DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
2285          DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
2286      SDValue E012 =
2287          DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2288                      DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
2289                      E01, DAG.getConstant(16, DL, MVT::i32), C8);
2290      SDValue E0123 =
2291          DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2292                      DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
2293                      E012, DAG.getConstant(24, DL, MVT::i32), C8);
2294      return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
2295    }
2296    return Op;
2297  }
2298
2299  // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2300  auto GetOperand = [](SDValue Op, int N) -> APInt {
2301    const SDValue &Operand = Op->getOperand(N);
2302    EVT VT = Op->getValueType(0);
2303    if (Operand->isUndef())
2304      return APInt(32, 0);
2305    APInt Value;
2306    if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2307      Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2308    else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2309      Value = Operand->getAsAPIntVal();
2310    else
2311      llvm_unreachable("Unsupported type");
2312    // i8 values are carried around as i16, so we need to zero out upper bits,
2313    // so they do not get in the way of combining individual byte values
2314    if (VT == MVT::v4i8)
2315      Value = Value.trunc(8);
2316    return Value.zext(32);
2317  };
2318  APInt Value;
2319  if (Isv2x16VT(VT)) {
2320    Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2321  } else if (VT == MVT::v4i8) {
2322    Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2323            GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2324  } else {
2325    llvm_unreachable("Unsupported type");
2326  }
2327  SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
2328  return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
2329}
2330
2331SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2332                                                     SelectionDAG &DAG) const {
2333  SDValue Index = Op->getOperand(1);
2334  SDValue Vector = Op->getOperand(0);
2335  SDLoc DL(Op);
2336  EVT VectorVT = Vector.getValueType();
2337
2338  if (VectorVT == MVT::v4i8) {
2339    SDValue BFE =
2340        DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2341                    {Vector,
2342                     DAG.getNode(ISD::MUL, DL, MVT::i32,
2343                                 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2344                                 DAG.getConstant(8, DL, MVT::i32)),
2345                     DAG.getConstant(8, DL, MVT::i32)});
2346    return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2347  }
2348
2349  // Constant index will be matched by tablegen.
2350  if (isa<ConstantSDNode>(Index.getNode()))
2351    return Op;
2352
2353  // Extract individual elements and select one of them.
2354  assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2355  EVT EltVT = VectorVT.getVectorElementType();
2356
2357  SDLoc dl(Op.getNode());
2358  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2359                           DAG.getIntPtrConstant(0, dl));
2360  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2361                           DAG.getIntPtrConstant(1, dl));
2362  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2363                         ISD::CondCode::SETEQ);
2364}
2365
2366SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2367                                                    SelectionDAG &DAG) const {
2368  SDValue Vector = Op->getOperand(0);
2369  EVT VectorVT = Vector.getValueType();
2370
2371  if (VectorVT != MVT::v4i8)
2372    return Op;
2373  SDLoc DL(Op);
2374  SDValue Value = Op->getOperand(1);
2375  if (Value->isUndef())
2376    return Vector;
2377
2378  SDValue Index = Op->getOperand(2);
2379
2380  SDValue BFI =
2381      DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2382                  {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2383                   DAG.getNode(ISD::MUL, DL, MVT::i32,
2384                               DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2385                               DAG.getConstant(8, DL, MVT::i32)),
2386                   DAG.getConstant(8, DL, MVT::i32)});
2387  return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2388}
2389
2390SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2391                                                 SelectionDAG &DAG) const {
2392  SDValue V1 = Op.getOperand(0);
2393  EVT VectorVT = V1.getValueType();
2394  if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2395    return Op;
2396
2397  // Lower shuffle to PRMT instruction.
2398  const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2399  SDValue V2 = Op.getOperand(1);
2400  uint32_t Selector = 0;
2401  for (auto I : llvm::enumerate(SVN->getMask())) {
2402    if (I.value() != -1) // -1 is a placeholder for undef.
2403      Selector |= (I.value() << (I.index() * 4));
2404  }
2405
2406  SDLoc DL(Op);
2407  return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2408                     DAG.getConstant(Selector, DL, MVT::i32),
2409                     DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2410}
2411/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2412/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2413///    amount, or
2414/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2415///    amount.
2416SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2417                                                  SelectionDAG &DAG) const {
2418  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2419  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2420
2421  EVT VT = Op.getValueType();
2422  unsigned VTBits = VT.getSizeInBits();
2423  SDLoc dl(Op);
2424  SDValue ShOpLo = Op.getOperand(0);
2425  SDValue ShOpHi = Op.getOperand(1);
2426  SDValue ShAmt  = Op.getOperand(2);
2427  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2428
2429  if (VTBits == 32 && STI.getSmVersion() >= 35) {
2430    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2431    // {dHi, dLo} = {aHi, aLo} >> Amt
2432    //   dHi = aHi >> Amt
2433    //   dLo = shf.r.clamp aLo, aHi, Amt
2434
2435    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2436    SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2437                             ShAmt);
2438
2439    SDValue Ops[2] = { Lo, Hi };
2440    return DAG.getMergeValues(Ops, dl);
2441  }
2442  else {
2443    // {dHi, dLo} = {aHi, aLo} >> Amt
2444    // - if (Amt>=size) then
2445    //      dLo = aHi >> (Amt-size)
2446    //      dHi = aHi >> Amt (this is either all 0 or all 1)
2447    //   else
2448    //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2449    //      dHi = aHi >> Amt
2450
2451    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2452                                   DAG.getConstant(VTBits, dl, MVT::i32),
2453                                   ShAmt);
2454    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2455    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2456                                     DAG.getConstant(VTBits, dl, MVT::i32));
2457    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2458    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2459    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2460
2461    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2462                               DAG.getConstant(VTBits, dl, MVT::i32),
2463                               ISD::SETGE);
2464    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2465    SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2466
2467    SDValue Ops[2] = { Lo, Hi };
2468    return DAG.getMergeValues(Ops, dl);
2469  }
2470}
2471
2472/// LowerShiftLeftParts - Lower SHL_PARTS, which
2473/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2474///    amount, or
2475/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2476///    amount.
2477SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2478                                                 SelectionDAG &DAG) const {
2479  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2480  assert(Op.getOpcode() == ISD::SHL_PARTS);
2481
2482  EVT VT = Op.getValueType();
2483  unsigned VTBits = VT.getSizeInBits();
2484  SDLoc dl(Op);
2485  SDValue ShOpLo = Op.getOperand(0);
2486  SDValue ShOpHi = Op.getOperand(1);
2487  SDValue ShAmt  = Op.getOperand(2);
2488
2489  if (VTBits == 32 && STI.getSmVersion() >= 35) {
2490    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2491    // {dHi, dLo} = {aHi, aLo} << Amt
2492    //   dHi = shf.l.clamp aLo, aHi, Amt
2493    //   dLo = aLo << Amt
2494
2495    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2496                             ShAmt);
2497    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2498
2499    SDValue Ops[2] = { Lo, Hi };
2500    return DAG.getMergeValues(Ops, dl);
2501  }
2502  else {
2503    // {dHi, dLo} = {aHi, aLo} << Amt
2504    // - if (Amt>=size) then
2505    //      dLo = aLo << Amt (all 0)
2506    //      dLo = aLo << (Amt-size)
2507    //   else
2508    //      dLo = aLo << Amt
2509    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
2510
2511    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2512                                   DAG.getConstant(VTBits, dl, MVT::i32),
2513                                   ShAmt);
2514    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2515    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2516                                     DAG.getConstant(VTBits, dl, MVT::i32));
2517    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2518    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2519    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2520
2521    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2522                               DAG.getConstant(VTBits, dl, MVT::i32),
2523                               ISD::SETGE);
2524    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2525    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2526
2527    SDValue Ops[2] = { Lo, Hi };
2528    return DAG.getMergeValues(Ops, dl);
2529  }
2530}
2531
2532SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2533  EVT VT = Op.getValueType();
2534
2535  if (VT == MVT::f32)
2536    return LowerFROUND32(Op, DAG);
2537
2538  if (VT == MVT::f64)
2539    return LowerFROUND64(Op, DAG);
2540
2541  llvm_unreachable("unhandled type");
2542}
2543
2544// This is the the rounding method used in CUDA libdevice in C like code:
2545// float roundf(float A)
2546// {
2547//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2548//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2549//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2550// }
2551SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2552                                           SelectionDAG &DAG) const {
2553  SDLoc SL(Op);
2554  SDValue A = Op.getOperand(0);
2555  EVT VT = Op.getValueType();
2556
2557  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2558
2559  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2560  SDValue Bitcast  = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2561  const int SignBitMask = 0x80000000;
2562  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2563                             DAG.getConstant(SignBitMask, SL, MVT::i32));
2564  const int PointFiveInBits = 0x3F000000;
2565  SDValue PointFiveWithSignRaw =
2566      DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2567                  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2568  SDValue PointFiveWithSign =
2569      DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2570  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2571  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2572
2573  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2574  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2575  SDValue IsLarge =
2576      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2577                   ISD::SETOGT);
2578  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2579
2580  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2581  SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2582                                DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2583  SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2584  return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2585}
2586
2587// The implementation of round(double) is similar to that of round(float) in
2588// that they both separate the value range into three regions and use a method
2589// specific to the region to round the values. However, round(double) first
2590// calculates the round of the absolute value and then adds the sign back while
2591// round(float) directly rounds the value with sign.
2592SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2593                                           SelectionDAG &DAG) const {
2594  SDLoc SL(Op);
2595  SDValue A = Op.getOperand(0);
2596  EVT VT = Op.getValueType();
2597
2598  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2599
2600  // double RoundedA = (double) (int) (abs(A) + 0.5f);
2601  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2602                                  DAG.getConstantFP(0.5, SL, VT));
2603  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2604
2605  // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2606  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2607  SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2608                                DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2609  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2610                         DAG.getConstantFP(0, SL, VT),
2611                         RoundedA);
2612
2613  // Add sign to rounded_A
2614  RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2615  DAG.getNode(ISD::FTRUNC, SL, VT, A);
2616
2617  // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2618  SDValue IsLarge =
2619      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2620                   ISD::SETOGT);
2621  return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2622}
2623
2624SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2625                                            SelectionDAG &DAG) const {
2626  assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2627
2628  if (Op.getValueType() == MVT::bf16) {
2629    SDLoc Loc(Op);
2630    return DAG.getNode(
2631        ISD::FP_ROUND, Loc, MVT::bf16,
2632        DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2633        DAG.getIntPtrConstant(0, Loc));
2634  }
2635
2636  // Everything else is considered legal.
2637  return Op;
2638}
2639
2640SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2641                                            SelectionDAG &DAG) const {
2642  assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2643
2644  if (Op.getOperand(0).getValueType() == MVT::bf16) {
2645    SDLoc Loc(Op);
2646    return DAG.getNode(
2647        Op.getOpcode(), Loc, Op.getValueType(),
2648        DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2649  }
2650
2651  // Everything else is considered legal.
2652  return Op;
2653}
2654
2655static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2656  SDLoc DL(Op);
2657  if (Op.getValueType() != MVT::v2i16)
2658    return Op;
2659  EVT EltVT = Op.getValueType().getVectorElementType();
2660  SmallVector<SDValue> VecElements;
2661  for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2662    SmallVector<SDValue> ScalarArgs;
2663    llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2664                    [&](const SDUse &O) {
2665                      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2666                                         O.get(), DAG.getIntPtrConstant(I, DL));
2667                    });
2668    VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2669  }
2670  SDValue V =
2671      DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2672  return V;
2673}
2674
2675SDValue
2676NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2677  switch (Op.getOpcode()) {
2678  case ISD::RETURNADDR:
2679    return SDValue();
2680  case ISD::FRAMEADDR:
2681    return SDValue();
2682  case ISD::GlobalAddress:
2683    return LowerGlobalAddress(Op, DAG);
2684  case ISD::INTRINSIC_W_CHAIN:
2685    return Op;
2686  case ISD::BUILD_VECTOR:
2687    return LowerBUILD_VECTOR(Op, DAG);
2688  case ISD::EXTRACT_SUBVECTOR:
2689    return Op;
2690  case ISD::EXTRACT_VECTOR_ELT:
2691    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2692  case ISD::INSERT_VECTOR_ELT:
2693    return LowerINSERT_VECTOR_ELT(Op, DAG);
2694  case ISD::VECTOR_SHUFFLE:
2695    return LowerVECTOR_SHUFFLE(Op, DAG);
2696  case ISD::CONCAT_VECTORS:
2697    return LowerCONCAT_VECTORS(Op, DAG);
2698  case ISD::STORE:
2699    return LowerSTORE(Op, DAG);
2700  case ISD::LOAD:
2701    return LowerLOAD(Op, DAG);
2702  case ISD::SHL_PARTS:
2703    return LowerShiftLeftParts(Op, DAG);
2704  case ISD::SRA_PARTS:
2705  case ISD::SRL_PARTS:
2706    return LowerShiftRightParts(Op, DAG);
2707  case ISD::SELECT:
2708    return LowerSelect(Op, DAG);
2709  case ISD::FROUND:
2710    return LowerFROUND(Op, DAG);
2711  case ISD::SINT_TO_FP:
2712  case ISD::UINT_TO_FP:
2713    return LowerINT_TO_FP(Op, DAG);
2714  case ISD::FP_TO_SINT:
2715  case ISD::FP_TO_UINT:
2716    return LowerFP_TO_INT(Op, DAG);
2717  case ISD::VAARG:
2718    return LowerVAARG(Op, DAG);
2719  case ISD::VASTART:
2720    return LowerVASTART(Op, DAG);
2721  case ISD::ABS:
2722  case ISD::SMIN:
2723  case ISD::SMAX:
2724  case ISD::UMIN:
2725  case ISD::UMAX:
2726  case ISD::ADD:
2727  case ISD::SUB:
2728  case ISD::MUL:
2729  case ISD::SHL:
2730  case ISD::SREM:
2731  case ISD::UREM:
2732    return LowerVectorArith(Op, DAG);
2733  case ISD::DYNAMIC_STACKALLOC:
2734    return LowerDYNAMIC_STACKALLOC(Op, DAG);
2735  default:
2736    llvm_unreachable("Custom lowering not defined for operation");
2737  }
2738}
2739
2740// This function is almost a copy of SelectionDAG::expandVAArg().
2741// The only diff is that this one produces loads from local address space.
2742SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2743  const TargetLowering *TLI = STI.getTargetLowering();
2744  SDLoc DL(Op);
2745
2746  SDNode *Node = Op.getNode();
2747  const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2748  EVT VT = Node->getValueType(0);
2749  auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2750  SDValue Tmp1 = Node->getOperand(0);
2751  SDValue Tmp2 = Node->getOperand(1);
2752  const MaybeAlign MA(Node->getConstantOperandVal(3));
2753
2754  SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2755                                   Tmp1, Tmp2, MachinePointerInfo(V));
2756  SDValue VAList = VAListLoad;
2757
2758  if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2759    VAList = DAG.getNode(
2760        ISD::ADD, DL, VAList.getValueType(), VAList,
2761        DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2762
2763    VAList = DAG.getNode(
2764        ISD::AND, DL, VAList.getValueType(), VAList,
2765        DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2766  }
2767
2768  // Increment the pointer, VAList, to the next vaarg
2769  Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2770                     DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2771                                     DL, VAList.getValueType()));
2772
2773  // Store the incremented VAList to the legalized pointer
2774  Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2775                      MachinePointerInfo(V));
2776
2777  const Value *SrcV =
2778      Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));
2779
2780  // Load the actual argument out of the pointer VAList
2781  return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2782}
2783
2784SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2785  const TargetLowering *TLI = STI.getTargetLowering();
2786  SDLoc DL(Op);
2787  EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2788
2789  // Store the address of unsized array <function>_vararg[] in the ap object.
2790  SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2791  SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2792
2793  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2794  return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2795                      MachinePointerInfo(SV));
2796}
2797
2798SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2799  SDValue Op0 = Op->getOperand(0);
2800  SDValue Op1 = Op->getOperand(1);
2801  SDValue Op2 = Op->getOperand(2);
2802  SDLoc DL(Op.getNode());
2803
2804  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2805
2806  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2807  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2808  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2809  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2810
2811  return Trunc;
2812}
2813
2814SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2815  if (Op.getValueType() == MVT::i1)
2816    return LowerLOADi1(Op, DAG);
2817
2818  // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
2819  // unaligned loads and have to handle it here.
2820  EVT VT = Op.getValueType();
2821  if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2822    LoadSDNode *Load = cast<LoadSDNode>(Op);
2823    EVT MemVT = Load->getMemoryVT();
2824    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2825                                        MemVT, *Load->getMemOperand())) {
2826      SDValue Ops[2];
2827      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2828      return DAG.getMergeValues(Ops, SDLoc(Op));
2829    }
2830  }
2831
2832  return SDValue();
2833}
2834
2835// v = ld i1* addr
2836//   =>
2837// v1 = ld i8* addr (-> i16)
2838// v = trunc i16 to i1
2839SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2840  SDNode *Node = Op.getNode();
2841  LoadSDNode *LD = cast<LoadSDNode>(Node);
2842  SDLoc dl(Node);
2843  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2844  assert(Node->getValueType(0) == MVT::i1 &&
2845         "Custom lowering for i1 load only");
2846  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2847                              LD->getPointerInfo(), LD->getAlign(),
2848                              LD->getMemOperand()->getFlags());
2849  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2850  // The legalizer (the caller) is expecting two values from the legalized
2851  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2852  // in LegalizeDAG.cpp which also uses MergeValues.
2853  SDValue Ops[] = { result, LD->getChain() };
2854  return DAG.getMergeValues(Ops, dl);
2855}
2856
2857SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2858  StoreSDNode *Store = cast<StoreSDNode>(Op);
2859  EVT VT = Store->getMemoryVT();
2860
2861  if (VT == MVT::i1)
2862    return LowerSTOREi1(Op, DAG);
2863
2864  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2865  // stores and have to handle it here.
2866  if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2867      !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2868                                      VT, *Store->getMemOperand()))
2869    return expandUnalignedStore(Store, DAG);
2870
2871  // v2f16, v2bf16 and v2i16 don't need special handling.
2872  if (Isv2x16VT(VT) || VT == MVT::v4i8)
2873    return SDValue();
2874
2875  if (VT.isVector())
2876    return LowerSTOREVector(Op, DAG);
2877
2878  return SDValue();
2879}
2880
2881SDValue
2882NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2883  SDNode *N = Op.getNode();
2884  SDValue Val = N->getOperand(1);
2885  SDLoc DL(N);
2886  EVT ValVT = Val.getValueType();
2887
2888  if (ValVT.isVector()) {
2889    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2890    // legal.  We can (and should) split that into 2 stores of <2 x double> here
2891    // but I'm leaving that as a TODO for now.
2892    if (!ValVT.isSimple())
2893      return SDValue();
2894    switch (ValVT.getSimpleVT().SimpleTy) {
2895    default:
2896      return SDValue();
2897    case MVT::v2i8:
2898    case MVT::v2i16:
2899    case MVT::v2i32:
2900    case MVT::v2i64:
2901    case MVT::v2f16:
2902    case MVT::v2bf16:
2903    case MVT::v2f32:
2904    case MVT::v2f64:
2905    case MVT::v4i8:
2906    case MVT::v4i16:
2907    case MVT::v4i32:
2908    case MVT::v4f16:
2909    case MVT::v4bf16:
2910    case MVT::v4f32:
2911    case MVT::v8f16: // <4 x f16x2>
2912    case MVT::v8bf16: // <4 x bf16x2>
2913    case MVT::v8i16:  // <4 x i16x2>
2914      // This is a "native" vector type
2915      break;
2916    }
2917
2918    MemSDNode *MemSD = cast<MemSDNode>(N);
2919    const DataLayout &TD = DAG.getDataLayout();
2920
2921    Align Alignment = MemSD->getAlign();
2922    Align PrefAlign =
2923        TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2924    if (Alignment < PrefAlign) {
2925      // This store is not sufficiently aligned, so bail out and let this vector
2926      // store be scalarized.  Note that we may still be able to emit smaller
2927      // vector stores.  For example, if we are storing a <4 x float> with an
2928      // alignment of 8, this check will fail but the legalizer will try again
2929      // with 2 x <2 x float>, which will succeed with an alignment of 8.
2930      return SDValue();
2931    }
2932
2933    unsigned Opcode = 0;
2934    EVT EltVT = ValVT.getVectorElementType();
2935    unsigned NumElts = ValVT.getVectorNumElements();
2936
2937    // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2938    // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
2939    // stored type to i16 and propagate the "real" type as the memory type.
2940    bool NeedExt = false;
2941    if (EltVT.getSizeInBits() < 16)
2942      NeedExt = true;
2943
2944    bool StoreF16x2 = false;
2945    switch (NumElts) {
2946    default:
2947      return SDValue();
2948    case 2:
2949      Opcode = NVPTXISD::StoreV2;
2950      break;
2951    case 4:
2952      Opcode = NVPTXISD::StoreV4;
2953      break;
2954    case 8:
2955      // v8f16 is a special case. PTX doesn't have st.v8.f16
2956      // instruction. Instead, we split the vector into v2f16 chunks and
2957      // store them with st.v4.b32.
2958      assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
2959      Opcode = NVPTXISD::StoreV4;
2960      StoreF16x2 = true;
2961      break;
2962    }
2963
2964    SmallVector<SDValue, 8> Ops;
2965
2966    // First is the chain
2967    Ops.push_back(N->getOperand(0));
2968
2969    if (StoreF16x2) {
2970      // Combine f16,f16 -> v2f16
2971      NumElts /= 2;
2972      for (unsigned i = 0; i < NumElts; ++i) {
2973        SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2974                                 DAG.getIntPtrConstant(i * 2, DL));
2975        SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2976                                 DAG.getIntPtrConstant(i * 2 + 1, DL));
2977        EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
2978        SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
2979        Ops.push_back(V2);
2980      }
2981    } else {
2982      // Then the split values
2983      for (unsigned i = 0; i < NumElts; ++i) {
2984        SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2985                                     DAG.getIntPtrConstant(i, DL));
2986        if (NeedExt)
2987          ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2988        Ops.push_back(ExtVal);
2989      }
2990    }
2991
2992    // Then any remaining arguments
2993    Ops.append(N->op_begin() + 2, N->op_end());
2994
2995    SDValue NewSt =
2996        DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2997                                MemSD->getMemoryVT(), MemSD->getMemOperand());
2998
2999    // return DCI.CombineTo(N, NewSt, true);
3000    return NewSt;
3001  }
3002
3003  return SDValue();
3004}
3005
3006// st i1 v, addr
3007//    =>
3008// v1 = zxt v to i16
3009// st.u8 i16, addr
3010SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3011  SDNode *Node = Op.getNode();
3012  SDLoc dl(Node);
3013  StoreSDNode *ST = cast<StoreSDNode>(Node);
3014  SDValue Tmp1 = ST->getChain();
3015  SDValue Tmp2 = ST->getBasePtr();
3016  SDValue Tmp3 = ST->getValue();
3017  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3018  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3019  SDValue Result =
3020      DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3021                        ST->getAlign(), ST->getMemOperand()->getFlags());
3022  return Result;
3023}
3024
3025// This creates target external symbol for a function parameter.
3026// Name of the symbol is composed from its index and the function name.
3027// Negative index corresponds to special parameter (unsized array) used for
3028// passing variable arguments.
3029SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3030                                            EVT v) const {
3031  StringRef SavedStr = nvTM->getStrPool().save(
3032      getParamName(&DAG.getMachineFunction().getFunction(), idx));
3033  return DAG.getTargetExternalSymbol(SavedStr.data(), v);
3034}
3035
3036SDValue NVPTXTargetLowering::LowerFormalArguments(
3037    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3038    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3039    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3040  MachineFunction &MF = DAG.getMachineFunction();
3041  const DataLayout &DL = DAG.getDataLayout();
3042  auto PtrVT = getPointerTy(DAG.getDataLayout());
3043
3044  const Function *F = &MF.getFunction();
3045  const AttributeList &PAL = F->getAttributes();
3046  const TargetLowering *TLI = STI.getTargetLowering();
3047
3048  SDValue Root = DAG.getRoot();
3049  std::vector<SDValue> OutChains;
3050
3051  bool isABI = (STI.getSmVersion() >= 20);
3052  assert(isABI && "Non-ABI compilation is not supported");
3053  if (!isABI)
3054    return Chain;
3055
3056  std::vector<Type *> argTypes;
3057  std::vector<const Argument *> theArgs;
3058  for (const Argument &I : F->args()) {
3059    theArgs.push_back(&I);
3060    argTypes.push_back(I.getType());
3061  }
3062  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3063  // Ins.size() will be larger
3064  //   * if there is an aggregate argument with multiple fields (each field
3065  //     showing up separately in Ins)
3066  //   * if there is a vector argument with more than typical vector-length
3067  //     elements (generally if more than 4) where each vector element is
3068  //     individually present in Ins.
3069  // So a different index should be used for indexing into Ins.
3070  // See similar issue in LowerCall.
3071  unsigned InsIdx = 0;
3072
3073  int idx = 0;
3074  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
3075    Type *Ty = argTypes[i];
3076
3077    if (theArgs[i]->use_empty()) {
3078      // argument is dead
3079      if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3080        SmallVector<EVT, 16> vtparts;
3081
3082        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
3083        if (vtparts.empty())
3084          report_fatal_error("Empty parameter types are not supported");
3085
3086        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3087             ++parti) {
3088          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3089          ++InsIdx;
3090        }
3091        if (vtparts.size() > 0)
3092          --InsIdx;
3093        continue;
3094      }
3095      if (Ty->isVectorTy()) {
3096        EVT ObjectVT = getValueType(DL, Ty);
3097        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
3098        for (unsigned parti = 0; parti < NumRegs; ++parti) {
3099          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3100          ++InsIdx;
3101        }
3102        if (NumRegs > 0)
3103          --InsIdx;
3104        continue;
3105      }
3106      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3107      continue;
3108    }
3109
3110    // In the following cases, assign a node order of "idx+1"
3111    // to newly created nodes. The SDNodes for params have to
3112    // appear in the same order as their order of appearance
3113    // in the original function. "idx+1" holds that order.
3114    if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3115      bool aggregateIsPacked = false;
3116      if (StructType *STy = dyn_cast<StructType>(Ty))
3117        aggregateIsPacked = STy->isPacked();
3118
3119      SmallVector<EVT, 16> VTs;
3120      SmallVector<uint64_t, 16> Offsets;
3121      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
3122      if (VTs.empty())
3123        report_fatal_error("Empty parameter types are not supported");
3124
3125      auto VectorInfo =
3126          VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
3127
3128      SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
3129      int VecIdx = -1; // Index of the first element of the current vector.
3130      for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3131        if (VectorInfo[parti] & PVF_FIRST) {
3132          assert(VecIdx == -1 && "Orphaned vector.");
3133          VecIdx = parti;
3134        }
3135
3136        // That's the last element of this store op.
3137        if (VectorInfo[parti] & PVF_LAST) {
3138          unsigned NumElts = parti - VecIdx + 1;
3139          EVT EltVT = VTs[parti];
3140          // i1 is loaded/stored as i8.
3141          EVT LoadVT = EltVT;
3142          if (EltVT == MVT::i1)
3143            LoadVT = MVT::i8;
3144          else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3145            // getLoad needs a vector type, but it can't handle
3146            // vectors which contain v2f16 or v2bf16 elements. So we must load
3147            // using i32 here and then bitcast back.
3148            LoadVT = MVT::i32;
3149
3150          EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3151          SDValue VecAddr =
3152              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3153                          DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3154          Value *srcValue = Constant::getNullValue(PointerType::get(
3155              EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3156          SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3157                                  MachinePointerInfo(srcValue),
3158                                  MaybeAlign(aggregateIsPacked ? 1 : 0),
3159                                  MachineMemOperand::MODereferenceable |
3160                                      MachineMemOperand::MOInvariant);
3161          if (P.getNode())
3162            P.getNode()->setIROrder(idx + 1);
3163          for (unsigned j = 0; j < NumElts; ++j) {
3164            SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3165                                      DAG.getIntPtrConstant(j, dl));
3166            // We've loaded i1 as an i8 and now must truncate it back to i1
3167            if (EltVT == MVT::i1)
3168              Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3169            // v2f16 was loaded as an i32. Now we must bitcast it back.
3170            else if (EltVT != LoadVT)
3171              Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3172
3173            // If a promoted integer type is used, truncate down to the original
3174            MVT PromotedVT;
3175            if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3176              Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3177            }
3178
3179            // Extend the element if necessary (e.g. an i8 is loaded
3180            // into an i16 register)
3181            if (Ins[InsIdx].VT.isInteger() &&
3182                Ins[InsIdx].VT.getFixedSizeInBits() >
3183                    LoadVT.getFixedSizeInBits()) {
3184              unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3185                                                           : ISD::ZERO_EXTEND;
3186              Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3187            }
3188            InVals.push_back(Elt);
3189          }
3190
3191          // Reset vector tracking state.
3192          VecIdx = -1;
3193        }
3194        ++InsIdx;
3195      }
3196      if (VTs.size() > 0)
3197        --InsIdx;
3198      continue;
3199    }
3200
3201    // Param has ByVal attribute
3202    // Return MoveParam(param symbol).
3203    // Ideally, the param symbol can be returned directly,
3204    // but when SDNode builder decides to use it in a CopyToReg(),
3205    // machine instruction fails because TargetExternalSymbol
3206    // (not lowered) is target dependent, and CopyToReg assumes
3207    // the source is lowered.
3208    EVT ObjectVT = getValueType(DL, Ty);
3209    assert(ObjectVT == Ins[InsIdx].VT &&
3210           "Ins type did not match function type");
3211    SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
3212    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3213    if (p.getNode())
3214      p.getNode()->setIROrder(idx + 1);
3215    InVals.push_back(p);
3216  }
3217
3218  if (!OutChains.empty())
3219    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3220
3221  return Chain;
3222}
3223
3224SDValue
3225NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3226                                 bool isVarArg,
3227                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
3228                                 const SmallVectorImpl<SDValue> &OutVals,
3229                                 const SDLoc &dl, SelectionDAG &DAG) const {
3230  const MachineFunction &MF = DAG.getMachineFunction();
3231  const Function &F = MF.getFunction();
3232  Type *RetTy = MF.getFunction().getReturnType();
3233
3234  bool isABI = (STI.getSmVersion() >= 20);
3235  assert(isABI && "Non-ABI compilation is not supported");
3236  if (!isABI)
3237    return Chain;
3238
3239  const DataLayout &DL = DAG.getDataLayout();
3240  SmallVector<SDValue, 16> PromotedOutVals;
3241  SmallVector<EVT, 16> VTs;
3242  SmallVector<uint64_t, 16> Offsets;
3243  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3244  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3245
3246  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3247    SDValue PromotedOutVal = OutVals[i];
3248    MVT PromotedVT;
3249    if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3250      VTs[i] = EVT(PromotedVT);
3251    }
3252    if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3253      llvm::ISD::NodeType Ext =
3254          Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3255      PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3256    }
3257    PromotedOutVals.push_back(PromotedOutVal);
3258  }
3259
3260  auto VectorInfo = VectorizePTXValueVTs(
3261      VTs, Offsets,
3262      RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
3263                       : Align(1));
3264
3265  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3266  // 32-bits are sign extended or zero extended, depending on whether
3267  // they are signed or unsigned types.
3268  bool ExtendIntegerRetVal =
3269      RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3270
3271  SmallVector<SDValue, 6> StoreOperands;
3272  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3273    // New load/store. Record chain and offset operands.
3274    if (VectorInfo[i] & PVF_FIRST) {
3275      assert(StoreOperands.empty() && "Orphaned operand list.");
3276      StoreOperands.push_back(Chain);
3277      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3278    }
3279
3280    SDValue OutVal = OutVals[i];
3281    SDValue RetVal = PromotedOutVals[i];
3282
3283    if (ExtendIntegerRetVal) {
3284      RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3285                                                  : ISD::ZERO_EXTEND,
3286                           dl, MVT::i32, RetVal);
3287    } else if (OutVal.getValueSizeInBits() < 16) {
3288      // Use 16-bit registers for small load-stores as it's the
3289      // smallest general purpose register size supported by NVPTX.
3290      RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3291    }
3292
3293    // Record the value to return.
3294    StoreOperands.push_back(RetVal);
3295
3296    // That's the last element of this store op.
3297    if (VectorInfo[i] & PVF_LAST) {
3298      NVPTXISD::NodeType Op;
3299      unsigned NumElts = StoreOperands.size() - 2;
3300      switch (NumElts) {
3301      case 1:
3302        Op = NVPTXISD::StoreRetval;
3303        break;
3304      case 2:
3305        Op = NVPTXISD::StoreRetvalV2;
3306        break;
3307      case 4:
3308        Op = NVPTXISD::StoreRetvalV4;
3309        break;
3310      default:
3311        llvm_unreachable("Invalid vector info.");
3312      }
3313
3314      // Adjust type of load/store op if we've extended the scalar
3315      // return value.
3316      EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3317      Chain = DAG.getMemIntrinsicNode(
3318          Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3319          MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
3320      // Cleanup vector state.
3321      StoreOperands.clear();
3322    }
3323  }
3324
3325  return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3326}
3327
3328void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3329    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3330    SelectionDAG &DAG) const {
3331  if (Constraint.size() > 1)
3332    return;
3333  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3334}
3335
3336static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3337  switch (Intrinsic) {
3338  default:
3339    return 0;
3340
3341  case Intrinsic::nvvm_tex_1d_v4f32_s32:
3342    return NVPTXISD::Tex1DFloatS32;
3343  case Intrinsic::nvvm_tex_1d_v4f32_f32:
3344    return NVPTXISD::Tex1DFloatFloat;
3345  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3346    return NVPTXISD::Tex1DFloatFloatLevel;
3347  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3348    return NVPTXISD::Tex1DFloatFloatGrad;
3349  case Intrinsic::nvvm_tex_1d_v4s32_s32:
3350    return NVPTXISD::Tex1DS32S32;
3351  case Intrinsic::nvvm_tex_1d_v4s32_f32:
3352    return NVPTXISD::Tex1DS32Float;
3353  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3354    return NVPTXISD::Tex1DS32FloatLevel;
3355  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3356    return NVPTXISD::Tex1DS32FloatGrad;
3357  case Intrinsic::nvvm_tex_1d_v4u32_s32:
3358    return NVPTXISD::Tex1DU32S32;
3359  case Intrinsic::nvvm_tex_1d_v4u32_f32:
3360    return NVPTXISD::Tex1DU32Float;
3361  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3362    return NVPTXISD::Tex1DU32FloatLevel;
3363  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3364    return NVPTXISD::Tex1DU32FloatGrad;
3365
3366  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3367    return NVPTXISD::Tex1DArrayFloatS32;
3368  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3369    return NVPTXISD::Tex1DArrayFloatFloat;
3370  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3371    return NVPTXISD::Tex1DArrayFloatFloatLevel;
3372  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3373    return NVPTXISD::Tex1DArrayFloatFloatGrad;
3374  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3375    return NVPTXISD::Tex1DArrayS32S32;
3376  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3377    return NVPTXISD::Tex1DArrayS32Float;
3378  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3379    return NVPTXISD::Tex1DArrayS32FloatLevel;
3380  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3381    return NVPTXISD::Tex1DArrayS32FloatGrad;
3382  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3383    return NVPTXISD::Tex1DArrayU32S32;
3384  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3385    return NVPTXISD::Tex1DArrayU32Float;
3386  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3387    return NVPTXISD::Tex1DArrayU32FloatLevel;
3388  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3389    return NVPTXISD::Tex1DArrayU32FloatGrad;
3390
3391  case Intrinsic::nvvm_tex_2d_v4f32_s32:
3392    return NVPTXISD::Tex2DFloatS32;
3393  case Intrinsic::nvvm_tex_2d_v4f32_f32:
3394    return NVPTXISD::Tex2DFloatFloat;
3395  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3396    return NVPTXISD::Tex2DFloatFloatLevel;
3397  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3398    return NVPTXISD::Tex2DFloatFloatGrad;
3399  case Intrinsic::nvvm_tex_2d_v4s32_s32:
3400    return NVPTXISD::Tex2DS32S32;
3401  case Intrinsic::nvvm_tex_2d_v4s32_f32:
3402    return NVPTXISD::Tex2DS32Float;
3403  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3404    return NVPTXISD::Tex2DS32FloatLevel;
3405  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3406    return NVPTXISD::Tex2DS32FloatGrad;
3407  case Intrinsic::nvvm_tex_2d_v4u32_s32:
3408    return NVPTXISD::Tex2DU32S32;
3409  case Intrinsic::nvvm_tex_2d_v4u32_f32:
3410    return NVPTXISD::Tex2DU32Float;
3411  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3412    return NVPTXISD::Tex2DU32FloatLevel;
3413  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3414    return NVPTXISD::Tex2DU32FloatGrad;
3415
3416  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3417    return NVPTXISD::Tex2DArrayFloatS32;
3418  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3419    return NVPTXISD::Tex2DArrayFloatFloat;
3420  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3421    return NVPTXISD::Tex2DArrayFloatFloatLevel;
3422  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3423    return NVPTXISD::Tex2DArrayFloatFloatGrad;
3424  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3425    return NVPTXISD::Tex2DArrayS32S32;
3426  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3427    return NVPTXISD::Tex2DArrayS32Float;
3428  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3429    return NVPTXISD::Tex2DArrayS32FloatLevel;
3430  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3431    return NVPTXISD::Tex2DArrayS32FloatGrad;
3432  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3433    return NVPTXISD::Tex2DArrayU32S32;
3434  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3435    return NVPTXISD::Tex2DArrayU32Float;
3436  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3437    return NVPTXISD::Tex2DArrayU32FloatLevel;
3438  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3439    return NVPTXISD::Tex2DArrayU32FloatGrad;
3440
3441  case Intrinsic::nvvm_tex_3d_v4f32_s32:
3442    return NVPTXISD::Tex3DFloatS32;
3443  case Intrinsic::nvvm_tex_3d_v4f32_f32:
3444    return NVPTXISD::Tex3DFloatFloat;
3445  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3446    return NVPTXISD::Tex3DFloatFloatLevel;
3447  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3448    return NVPTXISD::Tex3DFloatFloatGrad;
3449  case Intrinsic::nvvm_tex_3d_v4s32_s32:
3450    return NVPTXISD::Tex3DS32S32;
3451  case Intrinsic::nvvm_tex_3d_v4s32_f32:
3452    return NVPTXISD::Tex3DS32Float;
3453  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3454    return NVPTXISD::Tex3DS32FloatLevel;
3455  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3456    return NVPTXISD::Tex3DS32FloatGrad;
3457  case Intrinsic::nvvm_tex_3d_v4u32_s32:
3458    return NVPTXISD::Tex3DU32S32;
3459  case Intrinsic::nvvm_tex_3d_v4u32_f32:
3460    return NVPTXISD::Tex3DU32Float;
3461  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3462    return NVPTXISD::Tex3DU32FloatLevel;
3463  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3464    return NVPTXISD::Tex3DU32FloatGrad;
3465
3466  case Intrinsic::nvvm_tex_cube_v4f32_f32:
3467    return NVPTXISD::TexCubeFloatFloat;
3468  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3469    return NVPTXISD::TexCubeFloatFloatLevel;
3470  case Intrinsic::nvvm_tex_cube_v4s32_f32:
3471    return NVPTXISD::TexCubeS32Float;
3472  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3473    return NVPTXISD::TexCubeS32FloatLevel;
3474  case Intrinsic::nvvm_tex_cube_v4u32_f32:
3475    return NVPTXISD::TexCubeU32Float;
3476  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3477    return NVPTXISD::TexCubeU32FloatLevel;
3478
3479  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3480    return NVPTXISD::TexCubeArrayFloatFloat;
3481  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3482    return NVPTXISD::TexCubeArrayFloatFloatLevel;
3483  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3484    return NVPTXISD::TexCubeArrayS32Float;
3485  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3486    return NVPTXISD::TexCubeArrayS32FloatLevel;
3487  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3488    return NVPTXISD::TexCubeArrayU32Float;
3489  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3490    return NVPTXISD::TexCubeArrayU32FloatLevel;
3491
3492  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3493    return NVPTXISD::Tld4R2DFloatFloat;
3494  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3495    return NVPTXISD::Tld4G2DFloatFloat;
3496  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3497    return NVPTXISD::Tld4B2DFloatFloat;
3498  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3499    return NVPTXISD::Tld4A2DFloatFloat;
3500  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3501    return NVPTXISD::Tld4R2DS64Float;
3502  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3503    return NVPTXISD::Tld4G2DS64Float;
3504  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3505    return NVPTXISD::Tld4B2DS64Float;
3506  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3507    return NVPTXISD::Tld4A2DS64Float;
3508  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3509    return NVPTXISD::Tld4R2DU64Float;
3510  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3511    return NVPTXISD::Tld4G2DU64Float;
3512  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3513    return NVPTXISD::Tld4B2DU64Float;
3514  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3515    return NVPTXISD::Tld4A2DU64Float;
3516
3517  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3518    return NVPTXISD::TexUnified1DFloatS32;
3519  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3520    return NVPTXISD::TexUnified1DFloatFloat;
3521  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3522    return NVPTXISD::TexUnified1DFloatFloatLevel;
3523  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3524    return NVPTXISD::TexUnified1DFloatFloatGrad;
3525  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3526    return NVPTXISD::TexUnified1DS32S32;
3527  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3528    return NVPTXISD::TexUnified1DS32Float;
3529  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3530    return NVPTXISD::TexUnified1DS32FloatLevel;
3531  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3532    return NVPTXISD::TexUnified1DS32FloatGrad;
3533  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3534    return NVPTXISD::TexUnified1DU32S32;
3535  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3536    return NVPTXISD::TexUnified1DU32Float;
3537  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3538    return NVPTXISD::TexUnified1DU32FloatLevel;
3539  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3540    return NVPTXISD::TexUnified1DU32FloatGrad;
3541
3542  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3543    return NVPTXISD::TexUnified1DArrayFloatS32;
3544  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3545    return NVPTXISD::TexUnified1DArrayFloatFloat;
3546  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3547    return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
3548  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3549    return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
3550  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3551    return NVPTXISD::TexUnified1DArrayS32S32;
3552  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3553    return NVPTXISD::TexUnified1DArrayS32Float;
3554  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3555    return NVPTXISD::TexUnified1DArrayS32FloatLevel;
3556  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3557    return NVPTXISD::TexUnified1DArrayS32FloatGrad;
3558  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3559    return NVPTXISD::TexUnified1DArrayU32S32;
3560  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3561    return NVPTXISD::TexUnified1DArrayU32Float;
3562  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3563    return NVPTXISD::TexUnified1DArrayU32FloatLevel;
3564  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3565    return NVPTXISD::TexUnified1DArrayU32FloatGrad;
3566
3567  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3568    return NVPTXISD::TexUnified2DFloatS32;
3569  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3570    return NVPTXISD::TexUnified2DFloatFloat;
3571  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3572    return NVPTXISD::TexUnified2DFloatFloatLevel;
3573  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3574    return NVPTXISD::TexUnified2DFloatFloatGrad;
3575  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3576    return NVPTXISD::TexUnified2DS32S32;
3577  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3578    return NVPTXISD::TexUnified2DS32Float;
3579  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3580    return NVPTXISD::TexUnified2DS32FloatLevel;
3581  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3582    return NVPTXISD::TexUnified2DS32FloatGrad;
3583  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3584    return NVPTXISD::TexUnified2DU32S32;
3585  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3586    return NVPTXISD::TexUnified2DU32Float;
3587  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3588    return NVPTXISD::TexUnified2DU32FloatLevel;
3589  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3590    return NVPTXISD::TexUnified2DU32FloatGrad;
3591
3592  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3593    return NVPTXISD::TexUnified2DArrayFloatS32;
3594  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3595    return NVPTXISD::TexUnified2DArrayFloatFloat;
3596  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3597    return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
3598  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3599    return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
3600  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3601    return NVPTXISD::TexUnified2DArrayS32S32;
3602  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3603    return NVPTXISD::TexUnified2DArrayS32Float;
3604  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3605    return NVPTXISD::TexUnified2DArrayS32FloatLevel;
3606  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3607    return NVPTXISD::TexUnified2DArrayS32FloatGrad;
3608  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3609    return NVPTXISD::TexUnified2DArrayU32S32;
3610  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3611    return NVPTXISD::TexUnified2DArrayU32Float;
3612  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3613    return NVPTXISD::TexUnified2DArrayU32FloatLevel;
3614  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3615    return NVPTXISD::TexUnified2DArrayU32FloatGrad;
3616
3617  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3618    return NVPTXISD::TexUnified3DFloatS32;
3619  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3620    return NVPTXISD::TexUnified3DFloatFloat;
3621  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3622    return NVPTXISD::TexUnified3DFloatFloatLevel;
3623  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3624    return NVPTXISD::TexUnified3DFloatFloatGrad;
3625  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3626    return NVPTXISD::TexUnified3DS32S32;
3627  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3628    return NVPTXISD::TexUnified3DS32Float;
3629  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3630    return NVPTXISD::TexUnified3DS32FloatLevel;
3631  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3632    return NVPTXISD::TexUnified3DS32FloatGrad;
3633  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3634    return NVPTXISD::TexUnified3DU32S32;
3635  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3636    return NVPTXISD::TexUnified3DU32Float;
3637  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3638    return NVPTXISD::TexUnified3DU32FloatLevel;
3639  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3640    return NVPTXISD::TexUnified3DU32FloatGrad;
3641
3642  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3643    return NVPTXISD::TexUnifiedCubeFloatFloat;
3644  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3645    return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
3646  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3647    return NVPTXISD::TexUnifiedCubeS32Float;
3648  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3649    return NVPTXISD::TexUnifiedCubeS32FloatLevel;
3650  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3651    return NVPTXISD::TexUnifiedCubeU32Float;
3652  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3653    return NVPTXISD::TexUnifiedCubeU32FloatLevel;
3654
3655  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3656    return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
3657  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3658    return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
3659  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3660    return NVPTXISD::TexUnifiedCubeArrayS32Float;
3661  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3662    return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
3663  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3664    return NVPTXISD::TexUnifiedCubeArrayU32Float;
3665  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3666    return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
3667
3668  case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3669    return NVPTXISD::TexUnifiedCubeFloatFloatGrad;
3670  case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
3671    return NVPTXISD::TexUnifiedCubeS32FloatGrad;
3672  case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
3673    return NVPTXISD::TexUnifiedCubeU32FloatGrad;
3674  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3675    return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad;
3676  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
3677    return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad;
3678  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
3679    return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad;
3680
3681  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3682    return NVPTXISD::Tld4UnifiedR2DFloatFloat;
3683  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3684    return NVPTXISD::Tld4UnifiedG2DFloatFloat;
3685  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3686    return NVPTXISD::Tld4UnifiedB2DFloatFloat;
3687  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3688    return NVPTXISD::Tld4UnifiedA2DFloatFloat;
3689  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3690    return NVPTXISD::Tld4UnifiedR2DS64Float;
3691  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3692    return NVPTXISD::Tld4UnifiedG2DS64Float;
3693  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3694    return NVPTXISD::Tld4UnifiedB2DS64Float;
3695  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3696    return NVPTXISD::Tld4UnifiedA2DS64Float;
3697  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3698    return NVPTXISD::Tld4UnifiedR2DU64Float;
3699  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3700    return NVPTXISD::Tld4UnifiedG2DU64Float;
3701  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3702    return NVPTXISD::Tld4UnifiedB2DU64Float;
3703  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3704    return NVPTXISD::Tld4UnifiedA2DU64Float;
3705  }
3706}
3707
3708static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3709  switch (Intrinsic) {
3710  default:
3711    return 0;
3712  case Intrinsic::nvvm_suld_1d_i8_clamp:
3713    return NVPTXISD::Suld1DI8Clamp;
3714  case Intrinsic::nvvm_suld_1d_i16_clamp:
3715    return NVPTXISD::Suld1DI16Clamp;
3716  case Intrinsic::nvvm_suld_1d_i32_clamp:
3717    return NVPTXISD::Suld1DI32Clamp;
3718  case Intrinsic::nvvm_suld_1d_i64_clamp:
3719    return NVPTXISD::Suld1DI64Clamp;
3720  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3721    return NVPTXISD::Suld1DV2I8Clamp;
3722  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3723    return NVPTXISD::Suld1DV2I16Clamp;
3724  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3725    return NVPTXISD::Suld1DV2I32Clamp;
3726  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3727    return NVPTXISD::Suld1DV2I64Clamp;
3728  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3729    return NVPTXISD::Suld1DV4I8Clamp;
3730  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3731    return NVPTXISD::Suld1DV4I16Clamp;
3732  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3733    return NVPTXISD::Suld1DV4I32Clamp;
3734  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3735    return NVPTXISD::Suld1DArrayI8Clamp;
3736  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3737    return NVPTXISD::Suld1DArrayI16Clamp;
3738  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3739    return NVPTXISD::Suld1DArrayI32Clamp;
3740  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3741    return NVPTXISD::Suld1DArrayI64Clamp;
3742  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3743    return NVPTXISD::Suld1DArrayV2I8Clamp;
3744  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3745    return NVPTXISD::Suld1DArrayV2I16Clamp;
3746  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3747    return NVPTXISD::Suld1DArrayV2I32Clamp;
3748  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3749    return NVPTXISD::Suld1DArrayV2I64Clamp;
3750  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3751    return NVPTXISD::Suld1DArrayV4I8Clamp;
3752  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3753    return NVPTXISD::Suld1DArrayV4I16Clamp;
3754  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3755    return NVPTXISD::Suld1DArrayV4I32Clamp;
3756  case Intrinsic::nvvm_suld_2d_i8_clamp:
3757    return NVPTXISD::Suld2DI8Clamp;
3758  case Intrinsic::nvvm_suld_2d_i16_clamp:
3759    return NVPTXISD::Suld2DI16Clamp;
3760  case Intrinsic::nvvm_suld_2d_i32_clamp:
3761    return NVPTXISD::Suld2DI32Clamp;
3762  case Intrinsic::nvvm_suld_2d_i64_clamp:
3763    return NVPTXISD::Suld2DI64Clamp;
3764  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3765    return NVPTXISD::Suld2DV2I8Clamp;
3766  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3767    return NVPTXISD::Suld2DV2I16Clamp;
3768  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3769    return NVPTXISD::Suld2DV2I32Clamp;
3770  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3771    return NVPTXISD::Suld2DV2I64Clamp;
3772  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3773    return NVPTXISD::Suld2DV4I8Clamp;
3774  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3775    return NVPTXISD::Suld2DV4I16Clamp;
3776  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3777    return NVPTXISD::Suld2DV4I32Clamp;
3778  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3779    return NVPTXISD::Suld2DArrayI8Clamp;
3780  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3781    return NVPTXISD::Suld2DArrayI16Clamp;
3782  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3783    return NVPTXISD::Suld2DArrayI32Clamp;
3784  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3785    return NVPTXISD::Suld2DArrayI64Clamp;
3786  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3787    return NVPTXISD::Suld2DArrayV2I8Clamp;
3788  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3789    return NVPTXISD::Suld2DArrayV2I16Clamp;
3790  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3791    return NVPTXISD::Suld2DArrayV2I32Clamp;
3792  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3793    return NVPTXISD::Suld2DArrayV2I64Clamp;
3794  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3795    return NVPTXISD::Suld2DArrayV4I8Clamp;
3796  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3797    return NVPTXISD::Suld2DArrayV4I16Clamp;
3798  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3799    return NVPTXISD::Suld2DArrayV4I32Clamp;
3800  case Intrinsic::nvvm_suld_3d_i8_clamp:
3801    return NVPTXISD::Suld3DI8Clamp;
3802  case Intrinsic::nvvm_suld_3d_i16_clamp:
3803    return NVPTXISD::Suld3DI16Clamp;
3804  case Intrinsic::nvvm_suld_3d_i32_clamp:
3805    return NVPTXISD::Suld3DI32Clamp;
3806  case Intrinsic::nvvm_suld_3d_i64_clamp:
3807    return NVPTXISD::Suld3DI64Clamp;
3808  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3809    return NVPTXISD::Suld3DV2I8Clamp;
3810  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3811    return NVPTXISD::Suld3DV2I16Clamp;
3812  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3813    return NVPTXISD::Suld3DV2I32Clamp;
3814  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3815    return NVPTXISD::Suld3DV2I64Clamp;
3816  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3817    return NVPTXISD::Suld3DV4I8Clamp;
3818  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3819    return NVPTXISD::Suld3DV4I16Clamp;
3820  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3821    return NVPTXISD::Suld3DV4I32Clamp;
3822  case Intrinsic::nvvm_suld_1d_i8_trap:
3823    return NVPTXISD::Suld1DI8Trap;
3824  case Intrinsic::nvvm_suld_1d_i16_trap:
3825    return NVPTXISD::Suld1DI16Trap;
3826  case Intrinsic::nvvm_suld_1d_i32_trap:
3827    return NVPTXISD::Suld1DI32Trap;
3828  case Intrinsic::nvvm_suld_1d_i64_trap:
3829    return NVPTXISD::Suld1DI64Trap;
3830  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3831    return NVPTXISD::Suld1DV2I8Trap;
3832  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3833    return NVPTXISD::Suld1DV2I16Trap;
3834  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3835    return NVPTXISD::Suld1DV2I32Trap;
3836  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3837    return NVPTXISD::Suld1DV2I64Trap;
3838  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3839    return NVPTXISD::Suld1DV4I8Trap;
3840  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3841    return NVPTXISD::Suld1DV4I16Trap;
3842  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3843    return NVPTXISD::Suld1DV4I32Trap;
3844  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3845    return NVPTXISD::Suld1DArrayI8Trap;
3846  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3847    return NVPTXISD::Suld1DArrayI16Trap;
3848  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3849    return NVPTXISD::Suld1DArrayI32Trap;
3850  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3851    return NVPTXISD::Suld1DArrayI64Trap;
3852  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3853    return NVPTXISD::Suld1DArrayV2I8Trap;
3854  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3855    return NVPTXISD::Suld1DArrayV2I16Trap;
3856  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3857    return NVPTXISD::Suld1DArrayV2I32Trap;
3858  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3859    return NVPTXISD::Suld1DArrayV2I64Trap;
3860  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3861    return NVPTXISD::Suld1DArrayV4I8Trap;
3862  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3863    return NVPTXISD::Suld1DArrayV4I16Trap;
3864  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3865    return NVPTXISD::Suld1DArrayV4I32Trap;
3866  case Intrinsic::nvvm_suld_2d_i8_trap:
3867    return NVPTXISD::Suld2DI8Trap;
3868  case Intrinsic::nvvm_suld_2d_i16_trap:
3869    return NVPTXISD::Suld2DI16Trap;
3870  case Intrinsic::nvvm_suld_2d_i32_trap:
3871    return NVPTXISD::Suld2DI32Trap;
3872  case Intrinsic::nvvm_suld_2d_i64_trap:
3873    return NVPTXISD::Suld2DI64Trap;
3874  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3875    return NVPTXISD::Suld2DV2I8Trap;
3876  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3877    return NVPTXISD::Suld2DV2I16Trap;
3878  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3879    return NVPTXISD::Suld2DV2I32Trap;
3880  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3881    return NVPTXISD::Suld2DV2I64Trap;
3882  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3883    return NVPTXISD::Suld2DV4I8Trap;
3884  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3885    return NVPTXISD::Suld2DV4I16Trap;
3886  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3887    return NVPTXISD::Suld2DV4I32Trap;
3888  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3889    return NVPTXISD::Suld2DArrayI8Trap;
3890  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3891    return NVPTXISD::Suld2DArrayI16Trap;
3892  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3893    return NVPTXISD::Suld2DArrayI32Trap;
3894  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3895    return NVPTXISD::Suld2DArrayI64Trap;
3896  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3897    return NVPTXISD::Suld2DArrayV2I8Trap;
3898  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3899    return NVPTXISD::Suld2DArrayV2I16Trap;
3900  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3901    return NVPTXISD::Suld2DArrayV2I32Trap;
3902  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3903    return NVPTXISD::Suld2DArrayV2I64Trap;
3904  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3905    return NVPTXISD::Suld2DArrayV4I8Trap;
3906  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3907    return NVPTXISD::Suld2DArrayV4I16Trap;
3908  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3909    return NVPTXISD::Suld2DArrayV4I32Trap;
3910  case Intrinsic::nvvm_suld_3d_i8_trap:
3911    return NVPTXISD::Suld3DI8Trap;
3912  case Intrinsic::nvvm_suld_3d_i16_trap:
3913    return NVPTXISD::Suld3DI16Trap;
3914  case Intrinsic::nvvm_suld_3d_i32_trap:
3915    return NVPTXISD::Suld3DI32Trap;
3916  case Intrinsic::nvvm_suld_3d_i64_trap:
3917    return NVPTXISD::Suld3DI64Trap;
3918  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3919    return NVPTXISD::Suld3DV2I8Trap;
3920  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3921    return NVPTXISD::Suld3DV2I16Trap;
3922  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3923    return NVPTXISD::Suld3DV2I32Trap;
3924  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3925    return NVPTXISD::Suld3DV2I64Trap;
3926  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3927    return NVPTXISD::Suld3DV4I8Trap;
3928  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3929    return NVPTXISD::Suld3DV4I16Trap;
3930  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3931    return NVPTXISD::Suld3DV4I32Trap;
3932  case Intrinsic::nvvm_suld_1d_i8_zero:
3933    return NVPTXISD::Suld1DI8Zero;
3934  case Intrinsic::nvvm_suld_1d_i16_zero:
3935    return NVPTXISD::Suld1DI16Zero;
3936  case Intrinsic::nvvm_suld_1d_i32_zero:
3937    return NVPTXISD::Suld1DI32Zero;
3938  case Intrinsic::nvvm_suld_1d_i64_zero:
3939    return NVPTXISD::Suld1DI64Zero;
3940  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3941    return NVPTXISD::Suld1DV2I8Zero;
3942  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3943    return NVPTXISD::Suld1DV2I16Zero;
3944  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3945    return NVPTXISD::Suld1DV2I32Zero;
3946  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3947    return NVPTXISD::Suld1DV2I64Zero;
3948  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3949    return NVPTXISD::Suld1DV4I8Zero;
3950  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3951    return NVPTXISD::Suld1DV4I16Zero;
3952  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3953    return NVPTXISD::Suld1DV4I32Zero;
3954  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3955    return NVPTXISD::Suld1DArrayI8Zero;
3956  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3957    return NVPTXISD::Suld1DArrayI16Zero;
3958  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3959    return NVPTXISD::Suld1DArrayI32Zero;
3960  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3961    return NVPTXISD::Suld1DArrayI64Zero;
3962  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3963    return NVPTXISD::Suld1DArrayV2I8Zero;
3964  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3965    return NVPTXISD::Suld1DArrayV2I16Zero;
3966  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3967    return NVPTXISD::Suld1DArrayV2I32Zero;
3968  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3969    return NVPTXISD::Suld1DArrayV2I64Zero;
3970  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3971    return NVPTXISD::Suld1DArrayV4I8Zero;
3972  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3973    return NVPTXISD::Suld1DArrayV4I16Zero;
3974  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3975    return NVPTXISD::Suld1DArrayV4I32Zero;
3976  case Intrinsic::nvvm_suld_2d_i8_zero:
3977    return NVPTXISD::Suld2DI8Zero;
3978  case Intrinsic::nvvm_suld_2d_i16_zero:
3979    return NVPTXISD::Suld2DI16Zero;
3980  case Intrinsic::nvvm_suld_2d_i32_zero:
3981    return NVPTXISD::Suld2DI32Zero;
3982  case Intrinsic::nvvm_suld_2d_i64_zero:
3983    return NVPTXISD::Suld2DI64Zero;
3984  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3985    return NVPTXISD::Suld2DV2I8Zero;
3986  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3987    return NVPTXISD::Suld2DV2I16Zero;
3988  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3989    return NVPTXISD::Suld2DV2I32Zero;
3990  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3991    return NVPTXISD::Suld2DV2I64Zero;
3992  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3993    return NVPTXISD::Suld2DV4I8Zero;
3994  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3995    return NVPTXISD::Suld2DV4I16Zero;
3996  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3997    return NVPTXISD::Suld2DV4I32Zero;
3998  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3999    return NVPTXISD::Suld2DArrayI8Zero;
4000  case Intrinsic::nvvm_suld_2d_array_i16_zero:
4001    return NVPTXISD::Suld2DArrayI16Zero;
4002  case Intrinsic::nvvm_suld_2d_array_i32_zero:
4003    return NVPTXISD::Suld2DArrayI32Zero;
4004  case Intrinsic::nvvm_suld_2d_array_i64_zero:
4005    return NVPTXISD::Suld2DArrayI64Zero;
4006  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4007    return NVPTXISD::Suld2DArrayV2I8Zero;
4008  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4009    return NVPTXISD::Suld2DArrayV2I16Zero;
4010  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4011    return NVPTXISD::Suld2DArrayV2I32Zero;
4012  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4013    return NVPTXISD::Suld2DArrayV2I64Zero;
4014  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4015    return NVPTXISD::Suld2DArrayV4I8Zero;
4016  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4017    return NVPTXISD::Suld2DArrayV4I16Zero;
4018  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4019    return NVPTXISD::Suld2DArrayV4I32Zero;
4020  case Intrinsic::nvvm_suld_3d_i8_zero:
4021    return NVPTXISD::Suld3DI8Zero;
4022  case Intrinsic::nvvm_suld_3d_i16_zero:
4023    return NVPTXISD::Suld3DI16Zero;
4024  case Intrinsic::nvvm_suld_3d_i32_zero:
4025    return NVPTXISD::Suld3DI32Zero;
4026  case Intrinsic::nvvm_suld_3d_i64_zero:
4027    return NVPTXISD::Suld3DI64Zero;
4028  case Intrinsic::nvvm_suld_3d_v2i8_zero:
4029    return NVPTXISD::Suld3DV2I8Zero;
4030  case Intrinsic::nvvm_suld_3d_v2i16_zero:
4031    return NVPTXISD::Suld3DV2I16Zero;
4032  case Intrinsic::nvvm_suld_3d_v2i32_zero:
4033    return NVPTXISD::Suld3DV2I32Zero;
4034  case Intrinsic::nvvm_suld_3d_v2i64_zero:
4035    return NVPTXISD::Suld3DV2I64Zero;
4036  case Intrinsic::nvvm_suld_3d_v4i8_zero:
4037    return NVPTXISD::Suld3DV4I8Zero;
4038  case Intrinsic::nvvm_suld_3d_v4i16_zero:
4039    return NVPTXISD::Suld3DV4I16Zero;
4040  case Intrinsic::nvvm_suld_3d_v4i32_zero:
4041    return NVPTXISD::Suld3DV4I32Zero;
4042  }
4043}
4044
4045// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4046// TgtMemIntrinsic
4047// because we need the information that is only available in the "Value" type
4048// of destination
4049// pointer. In particular, the address space information.
4050bool NVPTXTargetLowering::getTgtMemIntrinsic(
4051    IntrinsicInfo &Info, const CallInst &I,
4052    MachineFunction &MF, unsigned Intrinsic) const {
4053  switch (Intrinsic) {
4054  default:
4055    return false;
4056  case Intrinsic::nvvm_match_all_sync_i32p:
4057  case Intrinsic::nvvm_match_all_sync_i64p:
4058    Info.opc = ISD::INTRINSIC_W_CHAIN;
4059    // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4060    // in order to model data exchange with other threads, but perform no real
4061    // memory accesses.
4062    Info.memVT = MVT::i1;
4063
4064    // Our result depends on both our and other thread's arguments.
4065    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4066    return true;
4067  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4068  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4069  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4070  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4071  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4072  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4073  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4074  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4075  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4076  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4077  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4078  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4079  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4080  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4081  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4082  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4083  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4084  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4085  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4086  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4087  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4088  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4089  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4090  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4091    Info.opc = ISD::INTRINSIC_W_CHAIN;
4092    Info.memVT = MVT::v8f16;
4093    Info.ptrVal = I.getArgOperand(0);
4094    Info.offset = 0;
4095    Info.flags = MachineMemOperand::MOLoad;
4096    Info.align = Align(16);
4097    return true;
4098  }
4099  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4100  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4101  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4102  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4103  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4104  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4105  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4106  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4107  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4108  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4109  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4110  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4111  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4112  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4113  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4114  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4115  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4116  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4117  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4118  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4119  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4120  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4121  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4122  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4123    Info.opc = ISD::INTRINSIC_W_CHAIN;
4124    Info.memVT = MVT::v2i32;
4125    Info.ptrVal = I.getArgOperand(0);
4126    Info.offset = 0;
4127    Info.flags = MachineMemOperand::MOLoad;
4128    Info.align = Align(8);
4129    return true;
4130  }
4131
4132  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4133  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4134  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4135  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4136  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4137  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4138  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4139  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4140  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4141  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4142  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4143  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4144  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4145  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4146  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4147  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4148
4149  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4150  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4151  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4152  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4153  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4154  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4155  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4156  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4157  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4158  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4159  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4160  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4161  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4162  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4163  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4164  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4165  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4166  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
4167    Info.opc = ISD::INTRINSIC_W_CHAIN;
4168    Info.memVT = MVT::v4i32;
4169    Info.ptrVal = I.getArgOperand(0);
4170    Info.offset = 0;
4171    Info.flags = MachineMemOperand::MOLoad;
4172    Info.align = Align(16);
4173    return true;
4174  }
4175
4176  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4177  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4178  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4179  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4180  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4181  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4182  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4183  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4184
4185  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4186  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4187  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4188  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4189  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4190  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4191  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4192  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4193  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4194  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4195  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4196  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4197  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4198  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4199  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4200  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4201  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4202  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4203  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4204  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4205  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4206  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
4207    Info.opc = ISD::INTRINSIC_W_CHAIN;
4208    Info.memVT = MVT::i32;
4209    Info.ptrVal = I.getArgOperand(0);
4210    Info.offset = 0;
4211    Info.flags = MachineMemOperand::MOLoad;
4212    Info.align = Align(4);
4213    return true;
4214  }
4215
4216  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4217  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4218  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4219  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4220  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4221  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4222  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4223  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4224  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4225  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4226  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4227  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4228    Info.opc = ISD::INTRINSIC_W_CHAIN;
4229    Info.memVT = MVT::v4f16;
4230    Info.ptrVal = I.getArgOperand(0);
4231    Info.offset = 0;
4232    Info.flags = MachineMemOperand::MOLoad;
4233    Info.align = Align(16);
4234    return true;
4235  }
4236
4237  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4238  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4239  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4240  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4241  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4242  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4243  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4244  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4245  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4246  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4247  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4248  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4249  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4250  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4251  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4252  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4253    Info.opc = ISD::INTRINSIC_W_CHAIN;
4254    Info.memVT = MVT::v8f32;
4255    Info.ptrVal = I.getArgOperand(0);
4256    Info.offset = 0;
4257    Info.flags = MachineMemOperand::MOLoad;
4258    Info.align = Align(16);
4259    return true;
4260  }
4261
4262  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4263  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4264  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4265  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4266
4267  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4268  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4269  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4270  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4271
4272  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4273  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4274  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4275  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4276  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4277  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4278  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4279  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4280  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4281  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4282  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4283  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4284    Info.opc = ISD::INTRINSIC_W_CHAIN;
4285    Info.memVT = MVT::v8i32;
4286    Info.ptrVal = I.getArgOperand(0);
4287    Info.offset = 0;
4288    Info.flags = MachineMemOperand::MOLoad;
4289    Info.align = Align(16);
4290    return true;
4291  }
4292
4293  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4294  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4295  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4296  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4297  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4298  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4299  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4300  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4301  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4302  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4303    Info.opc = ISD::INTRINSIC_W_CHAIN;
4304    Info.memVT = MVT::v2i32;
4305    Info.ptrVal = I.getArgOperand(0);
4306    Info.offset = 0;
4307    Info.flags = MachineMemOperand::MOLoad;
4308    Info.align = Align(8);
4309    return true;
4310  }
4311
4312  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4313  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4314  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4315  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4316
4317  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4318  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4319  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4320  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4321    Info.opc = ISD::INTRINSIC_W_CHAIN;
4322    Info.memVT = MVT::f64;
4323    Info.ptrVal = I.getArgOperand(0);
4324    Info.offset = 0;
4325    Info.flags = MachineMemOperand::MOLoad;
4326    Info.align = Align(8);
4327    return true;
4328  }
4329
4330  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4331  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4332  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4333  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4334    Info.opc = ISD::INTRINSIC_W_CHAIN;
4335    Info.memVT = MVT::v2f64;
4336    Info.ptrVal = I.getArgOperand(0);
4337    Info.offset = 0;
4338    Info.flags = MachineMemOperand::MOLoad;
4339    Info.align = Align(16);
4340    return true;
4341  }
4342
4343  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4344  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4345  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4346  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4347  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4348  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4349  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4350  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4351  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4352  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4353  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4354  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4355    Info.opc = ISD::INTRINSIC_VOID;
4356    Info.memVT = MVT::v4f16;
4357    Info.ptrVal = I.getArgOperand(0);
4358    Info.offset = 0;
4359    Info.flags = MachineMemOperand::MOStore;
4360    Info.align = Align(16);
4361    return true;
4362  }
4363
4364  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4365  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4366  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4367  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4368  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4369  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4370  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4371  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4372  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4373  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4374  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4375  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4376  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4377  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4378  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4379  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4380    Info.opc = ISD::INTRINSIC_VOID;
4381    Info.memVT = MVT::v8f32;
4382    Info.ptrVal = I.getArgOperand(0);
4383    Info.offset = 0;
4384    Info.flags = MachineMemOperand::MOStore;
4385    Info.align = Align(16);
4386    return true;
4387  }
4388
4389  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4390  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4391  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4392  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4393  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4394  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4395  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4396  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4397  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4398  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4399  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4400  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4401    Info.opc = ISD::INTRINSIC_VOID;
4402    Info.memVT = MVT::v8i32;
4403    Info.ptrVal = I.getArgOperand(0);
4404    Info.offset = 0;
4405    Info.flags = MachineMemOperand::MOStore;
4406    Info.align = Align(16);
4407    return true;
4408  }
4409
4410  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4411  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4412  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4413  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4414  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4415  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4416  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4417  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4418    Info.opc = ISD::INTRINSIC_VOID;
4419    Info.memVT = MVT::v2i32;
4420    Info.ptrVal = I.getArgOperand(0);
4421    Info.offset = 0;
4422    Info.flags = MachineMemOperand::MOStore;
4423    Info.align = Align(8);
4424    return true;
4425  }
4426
4427  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4428  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4429  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4430  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4431    Info.opc = ISD::INTRINSIC_VOID;
4432    Info.memVT = MVT::v2f64;
4433    Info.ptrVal = I.getArgOperand(0);
4434    Info.offset = 0;
4435    Info.flags = MachineMemOperand::MOStore;
4436    Info.align = Align(16);
4437    return true;
4438  }
4439
4440  case Intrinsic::nvvm_atomic_load_inc_32:
4441  case Intrinsic::nvvm_atomic_load_dec_32:
4442
4443  case Intrinsic::nvvm_atomic_add_gen_f_cta:
4444  case Intrinsic::nvvm_atomic_add_gen_f_sys:
4445  case Intrinsic::nvvm_atomic_add_gen_i_cta:
4446  case Intrinsic::nvvm_atomic_add_gen_i_sys:
4447  case Intrinsic::nvvm_atomic_and_gen_i_cta:
4448  case Intrinsic::nvvm_atomic_and_gen_i_sys:
4449  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4450  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4451  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4452  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4453  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4454  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4455  case Intrinsic::nvvm_atomic_max_gen_i_cta:
4456  case Intrinsic::nvvm_atomic_max_gen_i_sys:
4457  case Intrinsic::nvvm_atomic_min_gen_i_cta:
4458  case Intrinsic::nvvm_atomic_min_gen_i_sys:
4459  case Intrinsic::nvvm_atomic_or_gen_i_cta:
4460  case Intrinsic::nvvm_atomic_or_gen_i_sys:
4461  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4462  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4463  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4464  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4465    auto &DL = I.getModule()->getDataLayout();
4466    Info.opc = ISD::INTRINSIC_W_CHAIN;
4467    Info.memVT = getValueType(DL, I.getType());
4468    Info.ptrVal = I.getArgOperand(0);
4469    Info.offset = 0;
4470    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4471    Info.align.reset();
4472    return true;
4473  }
4474
4475  case Intrinsic::nvvm_ldu_global_i:
4476  case Intrinsic::nvvm_ldu_global_f:
4477  case Intrinsic::nvvm_ldu_global_p: {
4478    auto &DL = I.getModule()->getDataLayout();
4479    Info.opc = ISD::INTRINSIC_W_CHAIN;
4480    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4481      Info.memVT = getValueType(DL, I.getType());
4482    else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
4483      Info.memVT = getPointerTy(DL);
4484    else
4485      Info.memVT = getValueType(DL, I.getType());
4486    Info.ptrVal = I.getArgOperand(0);
4487    Info.offset = 0;
4488    Info.flags = MachineMemOperand::MOLoad;
4489    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4490
4491    return true;
4492  }
4493  case Intrinsic::nvvm_ldg_global_i:
4494  case Intrinsic::nvvm_ldg_global_f:
4495  case Intrinsic::nvvm_ldg_global_p: {
4496    auto &DL = I.getModule()->getDataLayout();
4497
4498    Info.opc = ISD::INTRINSIC_W_CHAIN;
4499    if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4500      Info.memVT = getValueType(DL, I.getType());
4501    else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
4502      Info.memVT = getPointerTy(DL);
4503    else
4504      Info.memVT = getValueType(DL, I.getType());
4505    Info.ptrVal = I.getArgOperand(0);
4506    Info.offset = 0;
4507    Info.flags = MachineMemOperand::MOLoad;
4508    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4509
4510    return true;
4511  }
4512
4513  case Intrinsic::nvvm_tex_1d_v4f32_s32:
4514  case Intrinsic::nvvm_tex_1d_v4f32_f32:
4515  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4516  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4517  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4518  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4519  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4520  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4521  case Intrinsic::nvvm_tex_2d_v4f32_s32:
4522  case Intrinsic::nvvm_tex_2d_v4f32_f32:
4523  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4524  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4525  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4526  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4527  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4528  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4529  case Intrinsic::nvvm_tex_3d_v4f32_s32:
4530  case Intrinsic::nvvm_tex_3d_v4f32_f32:
4531  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4532  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4533  case Intrinsic::nvvm_tex_cube_v4f32_f32:
4534  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4535  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4536  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4537  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4538  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4539  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4540  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4541  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4542  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4543  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4544  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4545  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4546  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4547  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4548  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4549  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4550  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4551  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4552  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4553  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4554  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4555  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4556  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4557  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4558  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4559  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4560  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4561  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4562  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4563  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4564  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4565  case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4566  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4567  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4568  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4569  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4570  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4571    Info.opc = getOpcForTextureInstr(Intrinsic);
4572    Info.memVT = MVT::v4f32;
4573    Info.ptrVal = nullptr;
4574    Info.offset = 0;
4575    Info.flags = MachineMemOperand::MOLoad;
4576    Info.align = Align(16);
4577    return true;
4578
4579  case Intrinsic::nvvm_tex_1d_v4s32_s32:
4580  case Intrinsic::nvvm_tex_1d_v4s32_f32:
4581  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4582  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4583  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4584  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4585  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4586  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4587  case Intrinsic::nvvm_tex_2d_v4s32_s32:
4588  case Intrinsic::nvvm_tex_2d_v4s32_f32:
4589  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4590  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4591  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4592  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4593  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4594  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4595  case Intrinsic::nvvm_tex_3d_v4s32_s32:
4596  case Intrinsic::nvvm_tex_3d_v4s32_f32:
4597  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4598  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4599  case Intrinsic::nvvm_tex_cube_v4s32_f32:
4600  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4601  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4602  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4603  case Intrinsic::nvvm_tex_cube_v4u32_f32:
4604  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4605  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4606  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4607  case Intrinsic::nvvm_tex_1d_v4u32_s32:
4608  case Intrinsic::nvvm_tex_1d_v4u32_f32:
4609  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4610  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4611  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4612  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4613  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4614  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4615  case Intrinsic::nvvm_tex_2d_v4u32_s32:
4616  case Intrinsic::nvvm_tex_2d_v4u32_f32:
4617  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4618  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4619  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4620  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4621  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4622  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4623  case Intrinsic::nvvm_tex_3d_v4u32_s32:
4624  case Intrinsic::nvvm_tex_3d_v4u32_f32:
4625  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4626  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4627  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4628  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4629  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4630  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4631  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4632  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4633  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4634  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4635  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4636  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4637  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4638  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4639  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4640  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4641  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4642  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4643  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4644  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4645  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4646  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4647  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4648  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4649  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4650  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4651  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4652  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4653  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4654  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4655  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4656  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4657  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4658  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4659  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4660  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4661  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4662  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4663  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4664  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4665  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4666  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4667  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4668  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4669  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4670  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4671  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4672  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4673  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4674  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4675  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4676  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4677  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4678  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4679  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4680  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4681  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4682  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4683  case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4684  case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4685  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4686  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4687  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4688  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4689  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4690  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4691  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4692  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4693  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4694  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4695    Info.opc = getOpcForTextureInstr(Intrinsic);
4696    Info.memVT = MVT::v4i32;
4697    Info.ptrVal = nullptr;
4698    Info.offset = 0;
4699    Info.flags = MachineMemOperand::MOLoad;
4700    Info.align = Align(16);
4701    return true;
4702
4703  case Intrinsic::nvvm_suld_1d_i8_clamp:
4704  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4705  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4706  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4707  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4708  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4709  case Intrinsic::nvvm_suld_2d_i8_clamp:
4710  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4711  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4712  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4713  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4714  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4715  case Intrinsic::nvvm_suld_3d_i8_clamp:
4716  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4717  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4718  case Intrinsic::nvvm_suld_1d_i8_trap:
4719  case Intrinsic::nvvm_suld_1d_v2i8_trap:
4720  case Intrinsic::nvvm_suld_1d_v4i8_trap:
4721  case Intrinsic::nvvm_suld_1d_array_i8_trap:
4722  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4723  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4724  case Intrinsic::nvvm_suld_2d_i8_trap:
4725  case Intrinsic::nvvm_suld_2d_v2i8_trap:
4726  case Intrinsic::nvvm_suld_2d_v4i8_trap:
4727  case Intrinsic::nvvm_suld_2d_array_i8_trap:
4728  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4729  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4730  case Intrinsic::nvvm_suld_3d_i8_trap:
4731  case Intrinsic::nvvm_suld_3d_v2i8_trap:
4732  case Intrinsic::nvvm_suld_3d_v4i8_trap:
4733  case Intrinsic::nvvm_suld_1d_i8_zero:
4734  case Intrinsic::nvvm_suld_1d_v2i8_zero:
4735  case Intrinsic::nvvm_suld_1d_v4i8_zero:
4736  case Intrinsic::nvvm_suld_1d_array_i8_zero:
4737  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4738  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4739  case Intrinsic::nvvm_suld_2d_i8_zero:
4740  case Intrinsic::nvvm_suld_2d_v2i8_zero:
4741  case Intrinsic::nvvm_suld_2d_v4i8_zero:
4742  case Intrinsic::nvvm_suld_2d_array_i8_zero:
4743  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4744  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4745  case Intrinsic::nvvm_suld_3d_i8_zero:
4746  case Intrinsic::nvvm_suld_3d_v2i8_zero:
4747  case Intrinsic::nvvm_suld_3d_v4i8_zero:
4748    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4749    Info.memVT = MVT::i8;
4750    Info.ptrVal = nullptr;
4751    Info.offset = 0;
4752    Info.flags = MachineMemOperand::MOLoad;
4753    Info.align = Align(16);
4754    return true;
4755
4756  case Intrinsic::nvvm_suld_1d_i16_clamp:
4757  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4758  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4759  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4760  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4761  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4762  case Intrinsic::nvvm_suld_2d_i16_clamp:
4763  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4764  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4765  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4766  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4767  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4768  case Intrinsic::nvvm_suld_3d_i16_clamp:
4769  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4770  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4771  case Intrinsic::nvvm_suld_1d_i16_trap:
4772  case Intrinsic::nvvm_suld_1d_v2i16_trap:
4773  case Intrinsic::nvvm_suld_1d_v4i16_trap:
4774  case Intrinsic::nvvm_suld_1d_array_i16_trap:
4775  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4776  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4777  case Intrinsic::nvvm_suld_2d_i16_trap:
4778  case Intrinsic::nvvm_suld_2d_v2i16_trap:
4779  case Intrinsic::nvvm_suld_2d_v4i16_trap:
4780  case Intrinsic::nvvm_suld_2d_array_i16_trap:
4781  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4782  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4783  case Intrinsic::nvvm_suld_3d_i16_trap:
4784  case Intrinsic::nvvm_suld_3d_v2i16_trap:
4785  case Intrinsic::nvvm_suld_3d_v4i16_trap:
4786  case Intrinsic::nvvm_suld_1d_i16_zero:
4787  case Intrinsic::nvvm_suld_1d_v2i16_zero:
4788  case Intrinsic::nvvm_suld_1d_v4i16_zero:
4789  case Intrinsic::nvvm_suld_1d_array_i16_zero:
4790  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4791  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4792  case Intrinsic::nvvm_suld_2d_i16_zero:
4793  case Intrinsic::nvvm_suld_2d_v2i16_zero:
4794  case Intrinsic::nvvm_suld_2d_v4i16_zero:
4795  case Intrinsic::nvvm_suld_2d_array_i16_zero:
4796  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4797  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4798  case Intrinsic::nvvm_suld_3d_i16_zero:
4799  case Intrinsic::nvvm_suld_3d_v2i16_zero:
4800  case Intrinsic::nvvm_suld_3d_v4i16_zero:
4801    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4802    Info.memVT = MVT::i16;
4803    Info.ptrVal = nullptr;
4804    Info.offset = 0;
4805    Info.flags = MachineMemOperand::MOLoad;
4806    Info.align = Align(16);
4807    return true;
4808
4809  case Intrinsic::nvvm_suld_1d_i32_clamp:
4810  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4811  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4812  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4813  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4814  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4815  case Intrinsic::nvvm_suld_2d_i32_clamp:
4816  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4817  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4818  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4819  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4820  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4821  case Intrinsic::nvvm_suld_3d_i32_clamp:
4822  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4823  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4824  case Intrinsic::nvvm_suld_1d_i32_trap:
4825  case Intrinsic::nvvm_suld_1d_v2i32_trap:
4826  case Intrinsic::nvvm_suld_1d_v4i32_trap:
4827  case Intrinsic::nvvm_suld_1d_array_i32_trap:
4828  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4829  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4830  case Intrinsic::nvvm_suld_2d_i32_trap:
4831  case Intrinsic::nvvm_suld_2d_v2i32_trap:
4832  case Intrinsic::nvvm_suld_2d_v4i32_trap:
4833  case Intrinsic::nvvm_suld_2d_array_i32_trap:
4834  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4835  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4836  case Intrinsic::nvvm_suld_3d_i32_trap:
4837  case Intrinsic::nvvm_suld_3d_v2i32_trap:
4838  case Intrinsic::nvvm_suld_3d_v4i32_trap:
4839  case Intrinsic::nvvm_suld_1d_i32_zero:
4840  case Intrinsic::nvvm_suld_1d_v2i32_zero:
4841  case Intrinsic::nvvm_suld_1d_v4i32_zero:
4842  case Intrinsic::nvvm_suld_1d_array_i32_zero:
4843  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4844  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4845  case Intrinsic::nvvm_suld_2d_i32_zero:
4846  case Intrinsic::nvvm_suld_2d_v2i32_zero:
4847  case Intrinsic::nvvm_suld_2d_v4i32_zero:
4848  case Intrinsic::nvvm_suld_2d_array_i32_zero:
4849  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4850  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4851  case Intrinsic::nvvm_suld_3d_i32_zero:
4852  case Intrinsic::nvvm_suld_3d_v2i32_zero:
4853  case Intrinsic::nvvm_suld_3d_v4i32_zero:
4854    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4855    Info.memVT = MVT::i32;
4856    Info.ptrVal = nullptr;
4857    Info.offset = 0;
4858    Info.flags = MachineMemOperand::MOLoad;
4859    Info.align = Align(16);
4860    return true;
4861
4862  case Intrinsic::nvvm_suld_1d_i64_clamp:
4863  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4864  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4865  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4866  case Intrinsic::nvvm_suld_2d_i64_clamp:
4867  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4868  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4869  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4870  case Intrinsic::nvvm_suld_3d_i64_clamp:
4871  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4872  case Intrinsic::nvvm_suld_1d_i64_trap:
4873  case Intrinsic::nvvm_suld_1d_v2i64_trap:
4874  case Intrinsic::nvvm_suld_1d_array_i64_trap:
4875  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4876  case Intrinsic::nvvm_suld_2d_i64_trap:
4877  case Intrinsic::nvvm_suld_2d_v2i64_trap:
4878  case Intrinsic::nvvm_suld_2d_array_i64_trap:
4879  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4880  case Intrinsic::nvvm_suld_3d_i64_trap:
4881  case Intrinsic::nvvm_suld_3d_v2i64_trap:
4882  case Intrinsic::nvvm_suld_1d_i64_zero:
4883  case Intrinsic::nvvm_suld_1d_v2i64_zero:
4884  case Intrinsic::nvvm_suld_1d_array_i64_zero:
4885  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4886  case Intrinsic::nvvm_suld_2d_i64_zero:
4887  case Intrinsic::nvvm_suld_2d_v2i64_zero:
4888  case Intrinsic::nvvm_suld_2d_array_i64_zero:
4889  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4890  case Intrinsic::nvvm_suld_3d_i64_zero:
4891  case Intrinsic::nvvm_suld_3d_v2i64_zero:
4892    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4893    Info.memVT = MVT::i64;
4894    Info.ptrVal = nullptr;
4895    Info.offset = 0;
4896    Info.flags = MachineMemOperand::MOLoad;
4897    Info.align = Align(16);
4898    return true;
4899  }
4900  return false;
4901}
4902
4903/// getFunctionParamOptimizedAlign - since function arguments are passed via
4904/// .param space, we may want to increase their alignment in a way that
4905/// ensures that we can effectively vectorize their loads & stores. We can
4906/// increase alignment only if the function has internal or has private
4907/// linkage as for other linkage types callers may already rely on default
4908/// alignment. To allow using 128-bit vectorized loads/stores, this function
4909/// ensures that alignment is 16 or greater.
4910Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
4911    const Function *F, Type *ArgTy, const DataLayout &DL) const {
4912  const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();
4913
4914  // If a function has linkage different from internal or private, we
4915  // must use default ABI alignment as external users rely on it. Same
4916  // for a function that may be called from a function pointer.
4917  if (!F || !F->hasLocalLinkage() ||
4918      F->hasAddressTaken(/*Users=*/nullptr,
4919                         /*IgnoreCallbackUses=*/false,
4920                         /*IgnoreAssumeLikeCalls=*/true,
4921                         /*IgnoreLLVMUsed=*/true))
4922    return Align(ABITypeAlign);
4923
4924  assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
4925  return Align(std::max(uint64_t(16), ABITypeAlign));
4926}
4927
4928/// Helper for computing alignment of a device function byval parameter.
4929Align NVPTXTargetLowering::getFunctionByValParamAlign(
4930    const Function *F, Type *ArgTy, Align InitialAlign,
4931    const DataLayout &DL) const {
4932  Align ArgAlign = InitialAlign;
4933  // Try to increase alignment to enhance vectorization options.
4934  if (F)
4935    ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
4936
4937  // Old ptx versions have a bug. When PTX code takes address of
4938  // byval parameter with alignment < 4, ptxas generates code to
4939  // spill argument into memory. Alas on sm_50+ ptxas generates
4940  // SASS code that fails with misaligned access. To work around
4941  // the problem, make sure that we align byval parameters by at
4942  // least 4. This bug seems to be fixed at least starting from
4943  // ptxas > 9.0.
4944  // TODO: remove this after verifying the bug is not reproduced
4945  // on non-deprecated ptxas versions.
4946  if (ForceMinByValParamAlign)
4947    ArgAlign = std::max(ArgAlign, Align(4));
4948
4949  return ArgAlign;
4950}
4951
4952// Helper for getting a function parameter name. Name is composed from
4953// its index and the function name. Negative index corresponds to special
4954// parameter (unsized array) used for passing variable arguments.
4955std::string NVPTXTargetLowering::getParamName(const Function *F,
4956                                              int Idx) const {
4957  std::string ParamName;
4958  raw_string_ostream ParamStr(ParamName);
4959
4960  ParamStr << getTargetMachine().getSymbol(F)->getName();
4961  if (Idx < 0)
4962    ParamStr << "_vararg";
4963  else
4964    ParamStr << "_param_" << Idx;
4965
4966  return ParamName;
4967}
4968
4969/// isLegalAddressingMode - Return true if the addressing mode represented
4970/// by AM is legal for this target, for a load/store of the specified type.
4971/// Used to guide target specific optimizations, like loop strength reduction
4972/// (LoopStrengthReduce.cpp) and memory optimization for address mode
4973/// (CodeGenPrepare.cpp)
4974bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
4975                                                const AddrMode &AM, Type *Ty,
4976                                                unsigned AS, Instruction *I) const {
4977  // AddrMode - This represents an addressing mode of:
4978  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4979  //
4980  // The legal address modes are
4981  // - [avar]
4982  // - [areg]
4983  // - [areg+immoff]
4984  // - [immAddr]
4985
4986  if (AM.BaseGV) {
4987    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4988  }
4989
4990  switch (AM.Scale) {
4991  case 0: // "r", "r+i" or "i" is allowed
4992    break;
4993  case 1:
4994    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4995      return false;
4996    // Otherwise we have r+i.
4997    break;
4998  default:
4999    // No scale > 1 is allowed
5000    return false;
5001  }
5002  return true;
5003}
5004
5005//===----------------------------------------------------------------------===//
5006//                         NVPTX Inline Assembly Support
5007//===----------------------------------------------------------------------===//
5008
5009/// getConstraintType - Given a constraint letter, return the type of
5010/// constraint it is for this target.
5011NVPTXTargetLowering::ConstraintType
5012NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5013  if (Constraint.size() == 1) {
5014    switch (Constraint[0]) {
5015    default:
5016      break;
5017    case 'b':
5018    case 'r':
5019    case 'h':
5020    case 'c':
5021    case 'l':
5022    case 'f':
5023    case 'd':
5024    case '0':
5025    case 'N':
5026      return C_RegisterClass;
5027    }
5028  }
5029  return TargetLowering::getConstraintType(Constraint);
5030}
5031
5032std::pair<unsigned, const TargetRegisterClass *>
5033NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5034                                                  StringRef Constraint,
5035                                                  MVT VT) const {
5036  if (Constraint.size() == 1) {
5037    switch (Constraint[0]) {
5038    case 'b':
5039      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
5040    case 'c':
5041      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5042    case 'h':
5043      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5044    case 'r':
5045      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
5046    case 'l':
5047    case 'N':
5048      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
5049    case 'f':
5050      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
5051    case 'd':
5052      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
5053    }
5054  }
5055  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5056}
5057
5058//===----------------------------------------------------------------------===//
5059//                         NVPTX DAG Combining
5060//===----------------------------------------------------------------------===//
5061
5062bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5063                                   CodeGenOptLevel OptLevel) const {
5064  // Always honor command-line argument
5065  if (FMAContractLevelOpt.getNumOccurrences() > 0)
5066    return FMAContractLevelOpt > 0;
5067
5068  // Do not contract if we're not optimizing the code.
5069  if (OptLevel == CodeGenOptLevel::None)
5070    return false;
5071
5072  // Honor TargetOptions flags that explicitly say fusion is okay.
5073  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5074    return true;
5075
5076  return allowUnsafeFPMath(MF);
5077}
5078
5079bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
5080  // Honor TargetOptions flags that explicitly say unsafe math is okay.
5081  if (MF.getTarget().Options.UnsafeFPMath)
5082    return true;
5083
5084  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
5085  const Function &F = MF.getFunction();
5086  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
5087}
5088
5089/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5090/// operands N0 and N1.  This is a helper for PerformADDCombine that is
5091/// called with the default operands, and if that fails, with commuted
5092/// operands.
5093static SDValue PerformADDCombineWithOperands(
5094    SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI,
5095    const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) {
5096  SelectionDAG  &DAG = DCI.DAG;
5097  // Skip non-integer, non-scalar case
5098  EVT VT=N0.getValueType();
5099  if (VT.isVector())
5100    return SDValue();
5101
5102  // fold (add (mul a, b), c) -> (mad a, b, c)
5103  //
5104  if (N0.getOpcode() == ISD::MUL) {
5105    assert (VT.isInteger());
5106    // For integer:
5107    // Since integer multiply-add costs the same as integer multiply
5108    // but is more costly than integer add, do the fusion only when
5109    // the mul is only used in the add.
5110    if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 ||
5111        !N0.getNode()->hasOneUse())
5112      return SDValue();
5113
5114    // Do the folding
5115    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
5116                       N0.getOperand(0), N0.getOperand(1), N1);
5117  }
5118  else if (N0.getOpcode() == ISD::FMUL) {
5119    if (VT == MVT::f32 || VT == MVT::f64) {
5120      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5121          &DAG.getTargetLoweringInfo());
5122      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
5123        return SDValue();
5124
5125      // For floating point:
5126      // Do the fusion only when the mul has less than 5 uses and all
5127      // are add.
5128      // The heuristic is that if a use is not an add, then that use
5129      // cannot be fused into fma, therefore mul is still needed anyway.
5130      // If there are more than 4 uses, even if they are all add, fusing
5131      // them will increase register pressue.
5132      //
5133      int numUses = 0;
5134      int nonAddCount = 0;
5135      for (const SDNode *User : N0.getNode()->uses()) {
5136        numUses++;
5137        if (User->getOpcode() != ISD::FADD)
5138          ++nonAddCount;
5139      }
5140      if (numUses >= 5)
5141        return SDValue();
5142      if (nonAddCount) {
5143        int orderNo = N->getIROrder();
5144        int orderNo2 = N0.getNode()->getIROrder();
5145        // simple heuristics here for considering potential register
5146        // pressure, the logics here is that the differnce are used
5147        // to measure the distance between def and use, the longer distance
5148        // more likely cause register pressure.
5149        if (orderNo - orderNo2 < 500)
5150          return SDValue();
5151
5152        // Now, check if at least one of the FMUL's operands is live beyond the node N,
5153        // which guarantees that the FMA will not increase register pressure at node N.
5154        bool opIsLive = false;
5155        const SDNode *left = N0.getOperand(0).getNode();
5156        const SDNode *right = N0.getOperand(1).getNode();
5157
5158        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5159          opIsLive = true;
5160
5161        if (!opIsLive)
5162          for (const SDNode *User : left->uses()) {
5163            int orderNo3 = User->getIROrder();
5164            if (orderNo3 > orderNo) {
5165              opIsLive = true;
5166              break;
5167            }
5168          }
5169
5170        if (!opIsLive)
5171          for (const SDNode *User : right->uses()) {
5172            int orderNo3 = User->getIROrder();
5173            if (orderNo3 > orderNo) {
5174              opIsLive = true;
5175              break;
5176            }
5177          }
5178
5179        if (!opIsLive)
5180          return SDValue();
5181      }
5182
5183      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
5184                         N0.getOperand(0), N0.getOperand(1), N1);
5185    }
5186  }
5187
5188  return SDValue();
5189}
5190
5191static SDValue PerformStoreRetvalCombine(SDNode *N) {
5192  // Operands from the 2nd to the last one are the values to be stored
5193  for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I)
5194    if (!N->getOperand(I).isUndef())
5195      return SDValue();
5196
5197  // Operand 0 is the previous value in the chain. Cannot return EntryToken
5198  // as the previous value will become unused and eliminated later.
5199  return N->getOperand(0);
5200}
5201
5202/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5203///
5204static SDValue PerformADDCombine(SDNode *N,
5205                                 TargetLowering::DAGCombinerInfo &DCI,
5206                                 const NVPTXSubtarget &Subtarget,
5207                                 CodeGenOptLevel OptLevel) {
5208  SDValue N0 = N->getOperand(0);
5209  SDValue N1 = N->getOperand(1);
5210
5211  // First try with the default operand order.
5212  if (SDValue Result =
5213          PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
5214    return Result;
5215
5216  // If that didn't work, try again with the operands commuted.
5217  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
5218}
5219
5220static SDValue PerformANDCombine(SDNode *N,
5221                                 TargetLowering::DAGCombinerInfo &DCI) {
5222  // The type legalizer turns a vector load of i8 values into a zextload to i16
5223  // registers, optionally ANY_EXTENDs it (if target type is integer),
5224  // and ANDs off the high 8 bits. Since we turn this load into a
5225  // target-specific DAG node, the DAG combiner fails to eliminate these AND
5226  // nodes. Do that here.
5227  SDValue Val = N->getOperand(0);
5228  SDValue Mask = N->getOperand(1);
5229
5230  if (isa<ConstantSDNode>(Val)) {
5231    std::swap(Val, Mask);
5232  }
5233
5234  SDValue AExt;
5235
5236  // Convert BFE-> truncate i16 -> and 255
5237  // To just BFE-> truncate i16, as the value already has all the bits in the
5238  // right places.
5239  if (Val.getOpcode() == ISD::TRUNCATE) {
5240    SDValue BFE = Val.getOperand(0);
5241    if (BFE.getOpcode() != NVPTXISD::BFE)
5242      return SDValue();
5243
5244    ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
5245    if (!BFEBits)
5246      return SDValue();
5247    uint64_t BFEBitsVal = BFEBits->getZExtValue();
5248
5249    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5250    if (!MaskCnst) {
5251      // Not an AND with a constant
5252      return SDValue();
5253    }
5254    uint64_t MaskVal = MaskCnst->getZExtValue();
5255
5256    if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
5257      return SDValue();
5258    // If we get here, the AND is unnecessary.  Just replace it with the trunc
5259    DCI.CombineTo(N, Val, false);
5260  }
5261  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
5262  if (Val.getOpcode() == ISD::ANY_EXTEND) {
5263    AExt = Val;
5264    Val = Val->getOperand(0);
5265  }
5266
5267  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
5268    Val = Val->getOperand(0);
5269  }
5270
5271  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
5272      Val->getOpcode() == NVPTXISD::LoadV4) {
5273    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5274    if (!MaskCnst) {
5275      // Not an AND with a constant
5276      return SDValue();
5277    }
5278
5279    uint64_t MaskVal = MaskCnst->getZExtValue();
5280    if (MaskVal != 0xff) {
5281      // Not an AND that chops off top 8 bits
5282      return SDValue();
5283    }
5284
5285    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
5286    if (!Mem) {
5287      // Not a MemSDNode?!?
5288      return SDValue();
5289    }
5290
5291    EVT MemVT = Mem->getMemoryVT();
5292    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
5293      // We only handle the i8 case
5294      return SDValue();
5295    }
5296
5297    unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);
5298    if (ExtType == ISD::SEXTLOAD) {
5299      // If for some reason the load is a sextload, the and is needed to zero
5300      // out the high 8 bits
5301      return SDValue();
5302    }
5303
5304    bool AddTo = false;
5305    if (AExt.getNode() != nullptr) {
5306      // Re-insert the ext as a zext.
5307      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
5308                            AExt.getValueType(), Val);
5309      AddTo = true;
5310    }
5311
5312    // If we get here, the AND is unnecessary.  Just replace it with the load
5313    DCI.CombineTo(N, Val, AddTo);
5314  }
5315
5316  return SDValue();
5317}
5318
5319static SDValue PerformREMCombine(SDNode *N,
5320                                 TargetLowering::DAGCombinerInfo &DCI,
5321                                 CodeGenOptLevel OptLevel) {
5322  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5323
5324  // Don't do anything at less than -O2.
5325  if (OptLevel < CodeGenOptLevel::Default)
5326    return SDValue();
5327
5328  SelectionDAG &DAG = DCI.DAG;
5329  SDLoc DL(N);
5330  EVT VT = N->getValueType(0);
5331  bool IsSigned = N->getOpcode() == ISD::SREM;
5332  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5333
5334  const SDValue &Num = N->getOperand(0);
5335  const SDValue &Den = N->getOperand(1);
5336
5337  for (const SDNode *U : Num->uses()) {
5338    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5339        U->getOperand(1) == Den) {
5340      // Num % Den -> Num - (Num / Den) * Den
5341      return DAG.getNode(ISD::SUB, DL, VT, Num,
5342                         DAG.getNode(ISD::MUL, DL, VT,
5343                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
5344                                     Den));
5345    }
5346  }
5347  return SDValue();
5348}
5349
5350enum OperandSignedness {
5351  Signed = 0,
5352  Unsigned,
5353  Unknown
5354};
5355
5356/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5357/// that can be demoted to \p OptSize bits without loss of information. The
5358/// signedness of the operand, if determinable, is placed in \p S.
5359static bool IsMulWideOperandDemotable(SDValue Op,
5360                                      unsigned OptSize,
5361                                      OperandSignedness &S) {
5362  S = Unknown;
5363
5364  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5365      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5366    EVT OrigVT = Op.getOperand(0).getValueType();
5367    if (OrigVT.getFixedSizeInBits() <= OptSize) {
5368      S = Signed;
5369      return true;
5370    }
5371  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5372    EVT OrigVT = Op.getOperand(0).getValueType();
5373    if (OrigVT.getFixedSizeInBits() <= OptSize) {
5374      S = Unsigned;
5375      return true;
5376    }
5377  }
5378
5379  return false;
5380}
5381
5382/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5383/// be demoted to \p OptSize bits without loss of information. If the operands
5384/// contain a constant, it should appear as the RHS operand. The signedness of
5385/// the operands is placed in \p IsSigned.
5386static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5387                                        unsigned OptSize,
5388                                        bool &IsSigned) {
5389  OperandSignedness LHSSign;
5390
5391  // The LHS operand must be a demotable op
5392  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5393    return false;
5394
5395  // We should have been able to determine the signedness from the LHS
5396  if (LHSSign == Unknown)
5397    return false;
5398
5399  IsSigned = (LHSSign == Signed);
5400
5401  // The RHS can be a demotable op or a constant
5402  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5403    const APInt &Val = CI->getAPIntValue();
5404    if (LHSSign == Unsigned) {
5405      return Val.isIntN(OptSize);
5406    } else {
5407      return Val.isSignedIntN(OptSize);
5408    }
5409  } else {
5410    OperandSignedness RHSSign;
5411    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5412      return false;
5413
5414    return LHSSign == RHSSign;
5415  }
5416}
5417
5418/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5419/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5420/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5421/// amount.
5422static SDValue TryMULWIDECombine(SDNode *N,
5423                                 TargetLowering::DAGCombinerInfo &DCI) {
5424  EVT MulType = N->getValueType(0);
5425  if (MulType != MVT::i32 && MulType != MVT::i64) {
5426    return SDValue();
5427  }
5428
5429  SDLoc DL(N);
5430  unsigned OptSize = MulType.getSizeInBits() >> 1;
5431  SDValue LHS = N->getOperand(0);
5432  SDValue RHS = N->getOperand(1);
5433
5434  // Canonicalize the multiply so the constant (if any) is on the right
5435  if (N->getOpcode() == ISD::MUL) {
5436    if (isa<ConstantSDNode>(LHS)) {
5437      std::swap(LHS, RHS);
5438    }
5439  }
5440
5441  // If we have a SHL, determine the actual multiply amount
5442  if (N->getOpcode() == ISD::SHL) {
5443    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5444    if (!ShlRHS) {
5445      return SDValue();
5446    }
5447
5448    APInt ShiftAmt = ShlRHS->getAPIntValue();
5449    unsigned BitWidth = MulType.getSizeInBits();
5450    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5451      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5452      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5453    } else {
5454      return SDValue();
5455    }
5456  }
5457
5458  bool Signed;
5459  // Verify that our operands are demotable
5460  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5461    return SDValue();
5462  }
5463
5464  EVT DemotedVT;
5465  if (MulType == MVT::i32) {
5466    DemotedVT = MVT::i16;
5467  } else {
5468    DemotedVT = MVT::i32;
5469  }
5470
5471  // Truncate the operands to the correct size. Note that these are just for
5472  // type consistency and will (likely) be eliminated in later phases.
5473  SDValue TruncLHS =
5474    DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5475  SDValue TruncRHS =
5476    DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5477
5478  unsigned Opc;
5479  if (Signed) {
5480    Opc = NVPTXISD::MUL_WIDE_SIGNED;
5481  } else {
5482    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5483  }
5484
5485  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5486}
5487
5488/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5489static SDValue PerformMULCombine(SDNode *N,
5490                                 TargetLowering::DAGCombinerInfo &DCI,
5491                                 CodeGenOptLevel OptLevel) {
5492  if (OptLevel > CodeGenOptLevel::None) {
5493    // Try mul.wide combining at OptLevel > 0
5494    if (SDValue Ret = TryMULWIDECombine(N, DCI))
5495      return Ret;
5496  }
5497
5498  return SDValue();
5499}
5500
5501/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5502static SDValue PerformSHLCombine(SDNode *N,
5503                                 TargetLowering::DAGCombinerInfo &DCI,
5504                                 CodeGenOptLevel OptLevel) {
5505  if (OptLevel > CodeGenOptLevel::None) {
5506    // Try mul.wide combining at OptLevel > 0
5507    if (SDValue Ret = TryMULWIDECombine(N, DCI))
5508      return Ret;
5509  }
5510
5511  return SDValue();
5512}
5513
5514static SDValue PerformSETCCCombine(SDNode *N,
5515                                   TargetLowering::DAGCombinerInfo &DCI,
5516                                   unsigned int SmVersion) {
5517  EVT CCType = N->getValueType(0);
5518  SDValue A = N->getOperand(0);
5519  SDValue B = N->getOperand(1);
5520
5521  EVT AType = A.getValueType();
5522  if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5523    return SDValue();
5524
5525  if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5526    return SDValue();
5527
5528  SDLoc DL(N);
5529  // setp.f16x2 returns two scalar predicates, which we need to
5530  // convert back to v2i1. The returned result will be scalarized by
5531  // the legalizer, but the comparison will remain a single vector
5532  // instruction.
5533  SDValue CCNode = DCI.DAG.getNode(
5534      A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5535                                     : NVPTXISD::SETP_BF16X2,
5536      DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5537  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5538                         CCNode.getValue(1));
5539}
5540
5541static SDValue PerformEXTRACTCombine(SDNode *N,
5542                                     TargetLowering::DAGCombinerInfo &DCI) {
5543  SDValue Vector = N->getOperand(0);
5544  SDLoc DL(N);
5545  EVT VectorVT = Vector.getValueType();
5546  if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5547      IsPTXVectorType(VectorVT.getSimpleVT()))
5548    return SDValue(); // Native vector loads already combine nicely w/
5549                      // extract_vector_elt, except for v4i8.
5550  // Don't mess with singletons or v2*16 types, we already handle them OK.
5551  if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5552      VectorVT == MVT::v4i8)
5553    return SDValue();
5554
5555  uint64_t VectorBits = VectorVT.getSizeInBits();
5556  // We only handle the types we can extract in-register.
5557  if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5558    return SDValue();
5559
5560  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5561  // Index == 0 is handled by generic DAG combiner.
5562  if (!Index || Index->getZExtValue() == 0)
5563    return SDValue();
5564
5565  MVT IVT = MVT::getIntegerVT(VectorBits);
5566  EVT EltVT = VectorVT.getVectorElementType();
5567  EVT EltIVT = EltVT.changeTypeToInteger();
5568  uint64_t EltBits = EltVT.getScalarSizeInBits();
5569
5570  SDValue Result = DCI.DAG.getNode(
5571      ISD::TRUNCATE, DL, EltIVT,
5572      DCI.DAG.getNode(
5573          ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5574          DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5575
5576  // If element has non-integer type, bitcast it back to the expected type.
5577  if (EltVT != EltIVT)
5578    Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5579  // Past legalizer, we may need to extent i8 -> i16 to match the register type.
5580  if (EltVT != N->getValueType(0))
5581    Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5582
5583  return Result;
5584}
5585
5586static SDValue PerformVSELECTCombine(SDNode *N,
5587                                     TargetLowering::DAGCombinerInfo &DCI) {
5588  SDValue VA = N->getOperand(1);
5589  EVT VectorVT = VA.getValueType();
5590  if (VectorVT != MVT::v4i8)
5591    return SDValue();
5592
5593  // We need to split vselect into individual per-element operations Because we
5594  // use BFE/BFI instruction for byte extraction/insertion, we do end up with
5595  // 32-bit values, so we may as well do comparison as i32 to avoid conversions
5596  // to/from i16 normally used for i8 values.
5597  SmallVector<SDValue, 4> E;
5598  SDLoc DL(N);
5599  SDValue VCond = N->getOperand(0);
5600  SDValue VB = N->getOperand(2);
5601  for (int I = 0; I < 4; ++I) {
5602    SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5603                                DCI.DAG.getConstant(I, DL, MVT::i32));
5604    SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5605        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5606                        DCI.DAG.getConstant(I, DL, MVT::i32)),
5607        DL, MVT::i32);
5608    SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5609        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5610                        DCI.DAG.getConstant(I, DL, MVT::i32)),
5611        DL, MVT::i32);
5612    E.push_back(DCI.DAG.getAnyExtOrTrunc(
5613        DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5614  }
5615  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5616}
5617
5618static SDValue PerformLOADCombine(SDNode *N,
5619                                  TargetLowering::DAGCombinerInfo &DCI) {
5620  SelectionDAG &DAG = DCI.DAG;
5621  LoadSDNode *LD = cast<LoadSDNode>(N);
5622
5623  // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
5624  // letting ReplaceLoadVector split it into smaller loads during legalization.
5625  // This is done at dag-combine1 time, so that vector operations with i8
5626  // elements can be optimised away instead of being needlessly split during
5627  // legalization, which involves storing to the stack and loading it back.
5628  EVT VT = N->getValueType(0);
5629  if (VT != MVT::v16i8)
5630    return SDValue();
5631
5632  SDLoc DL(N);
5633
5634  // Create a v4i32 vector load operation, effectively <4 x v4i8>.
5635  unsigned Opc = NVPTXISD::LoadV4;
5636  EVT NewVT = MVT::v4i32;
5637  EVT EltVT = NewVT.getVectorElementType();
5638  unsigned NumElts = NewVT.getVectorNumElements();
5639  EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
5640  SDVTList RetVTList = DAG.getVTList(RetVTs);
5641  SmallVector<SDValue, 8> Ops(N->ops());
5642  Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5643  SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
5644                                            LD->getMemOperand());
5645  SDValue NewChain = NewLoad.getValue(NumElts);
5646
5647  // Create a vector of the same type returned by the original load.
5648  SmallVector<SDValue, 4> Elts;
5649  for (unsigned i = 0; i < NumElts; i++)
5650    Elts.push_back(NewLoad.getValue(i));
5651  return DCI.DAG.getMergeValues(
5652      {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
5653       NewChain},
5654      DL);
5655}
5656
5657SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
5658                                               DAGCombinerInfo &DCI) const {
5659  CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
5660  switch (N->getOpcode()) {
5661    default: break;
5662    case ISD::ADD:
5663    case ISD::FADD:
5664      return PerformADDCombine(N, DCI, STI, OptLevel);
5665    case ISD::MUL:
5666      return PerformMULCombine(N, DCI, OptLevel);
5667    case ISD::SHL:
5668      return PerformSHLCombine(N, DCI, OptLevel);
5669    case ISD::AND:
5670      return PerformANDCombine(N, DCI);
5671    case ISD::UREM:
5672    case ISD::SREM:
5673      return PerformREMCombine(N, DCI, OptLevel);
5674    case ISD::SETCC:
5675      return PerformSETCCCombine(N, DCI, STI.getSmVersion());
5676    case ISD::LOAD:
5677      return PerformLOADCombine(N, DCI);
5678    case NVPTXISD::StoreRetval:
5679    case NVPTXISD::StoreRetvalV2:
5680    case NVPTXISD::StoreRetvalV4:
5681      return PerformStoreRetvalCombine(N);
5682    case ISD::EXTRACT_VECTOR_ELT:
5683      return PerformEXTRACTCombine(N, DCI);
5684    case ISD::VSELECT:
5685      return PerformVSELECTCombine(N, DCI);
5686  }
5687  return SDValue();
5688}
5689
5690/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
5691static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
5692                              SmallVectorImpl<SDValue> &Results) {
5693  EVT ResVT = N->getValueType(0);
5694  SDLoc DL(N);
5695
5696  assert(ResVT.isVector() && "Vector load must have vector type");
5697
5698  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
5699  // legal.  We can (and should) split that into 2 loads of <2 x double> here
5700  // but I'm leaving that as a TODO for now.
5701  assert(ResVT.isSimple() && "Can only handle simple types");
5702  switch (ResVT.getSimpleVT().SimpleTy) {
5703  default:
5704    return;
5705  case MVT::v2i8:
5706  case MVT::v2i16:
5707  case MVT::v2i32:
5708  case MVT::v2i64:
5709  case MVT::v2f16:
5710  case MVT::v2f32:
5711  case MVT::v2f64:
5712  case MVT::v4i8:
5713  case MVT::v4i16:
5714  case MVT::v4i32:
5715  case MVT::v4f16:
5716  case MVT::v4f32:
5717  case MVT::v8f16:  // <4 x f16x2>
5718  case MVT::v8bf16: // <4 x bf16x2>
5719  case MVT::v8i16:  // <4 x i16x2>
5720    // This is a "native" vector type
5721    break;
5722  }
5723
5724  LoadSDNode *LD = cast<LoadSDNode>(N);
5725
5726  Align Alignment = LD->getAlign();
5727  auto &TD = DAG.getDataLayout();
5728  Align PrefAlign =
5729      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
5730  if (Alignment < PrefAlign) {
5731    // This load is not sufficiently aligned, so bail out and let this vector
5732    // load be scalarized.  Note that we may still be able to emit smaller
5733    // vector loads.  For example, if we are loading a <4 x float> with an
5734    // alignment of 8, this check will fail but the legalizer will try again
5735    // with 2 x <2 x float>, which will succeed with an alignment of 8.
5736    return;
5737  }
5738
5739  EVT EltVT = ResVT.getVectorElementType();
5740  unsigned NumElts = ResVT.getVectorNumElements();
5741
5742  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
5743  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
5744  // loaded type to i16 and propagate the "real" type as the memory type.
5745  bool NeedTrunc = false;
5746  if (EltVT.getSizeInBits() < 16) {
5747    EltVT = MVT::i16;
5748    NeedTrunc = true;
5749  }
5750
5751  unsigned Opcode = 0;
5752  SDVTList LdResVTs;
5753  bool Load16x2 = false;
5754
5755  switch (NumElts) {
5756  default:
5757    return;
5758  case 2:
5759    Opcode = NVPTXISD::LoadV2;
5760    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5761    break;
5762  case 4: {
5763    Opcode = NVPTXISD::LoadV4;
5764    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5765    LdResVTs = DAG.getVTList(ListVTs);
5766    break;
5767  }
5768  case 8: {
5769    // v8f16 is a special case. PTX doesn't have ld.v8.f16
5770    // instruction. Instead, we split the vector into v2f16 chunks and
5771    // load them with ld.v4.b32.
5772    assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
5773    Load16x2 = true;
5774    Opcode = NVPTXISD::LoadV4;
5775    EVT VVT;
5776    switch (EltVT.getSimpleVT().SimpleTy) {
5777    case MVT::f16:
5778      VVT = MVT::v2f16;
5779      break;
5780    case MVT::bf16:
5781      VVT = MVT::v2bf16;
5782      break;
5783    case MVT::i16:
5784      VVT = MVT::v2i16;
5785      break;
5786    default:
5787      llvm_unreachable("Unsupported v8 vector type.");
5788    }
5789    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
5790    LdResVTs = DAG.getVTList(ListVTs);
5791    break;
5792  }
5793  }
5794
5795  // Copy regular operands
5796  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
5797
5798  // The select routine does not have access to the LoadSDNode instance, so
5799  // pass along the extension information
5800  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5801
5802  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5803                                          LD->getMemoryVT(),
5804                                          LD->getMemOperand());
5805
5806  SmallVector<SDValue, 8> ScalarRes;
5807  if (Load16x2) {
5808    // Split v2f16 subvectors back into individual elements.
5809    NumElts /= 2;
5810    for (unsigned i = 0; i < NumElts; ++i) {
5811      SDValue SubVector = NewLD.getValue(i);
5812      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
5813                               DAG.getIntPtrConstant(0, DL));
5814      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
5815                               DAG.getIntPtrConstant(1, DL));
5816      ScalarRes.push_back(E0);
5817      ScalarRes.push_back(E1);
5818    }
5819  } else {
5820    for (unsigned i = 0; i < NumElts; ++i) {
5821      SDValue Res = NewLD.getValue(i);
5822      if (NeedTrunc)
5823        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5824      ScalarRes.push_back(Res);
5825    }
5826  }
5827
5828  SDValue LoadChain = NewLD.getValue(NumElts);
5829
5830  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
5831
5832  Results.push_back(BuildVec);
5833  Results.push_back(LoadChain);
5834}
5835
5836static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
5837                                     SmallVectorImpl<SDValue> &Results) {
5838  SDValue Chain = N->getOperand(0);
5839  SDValue Intrin = N->getOperand(1);
5840  SDLoc DL(N);
5841
5842  // Get the intrinsic ID
5843  unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
5844  switch (IntrinNo) {
5845  default:
5846    return;
5847  case Intrinsic::nvvm_ldg_global_i:
5848  case Intrinsic::nvvm_ldg_global_f:
5849  case Intrinsic::nvvm_ldg_global_p:
5850  case Intrinsic::nvvm_ldu_global_i:
5851  case Intrinsic::nvvm_ldu_global_f:
5852  case Intrinsic::nvvm_ldu_global_p: {
5853    EVT ResVT = N->getValueType(0);
5854
5855    if (ResVT.isVector()) {
5856      // Vector LDG/LDU
5857
5858      unsigned NumElts = ResVT.getVectorNumElements();
5859      EVT EltVT = ResVT.getVectorElementType();
5860
5861      // Since LDU/LDG are target nodes, we cannot rely on DAG type
5862      // legalization.
5863      // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
5864      // loaded type to i16 and propagate the "real" type as the memory type.
5865      bool NeedTrunc = false;
5866      if (EltVT.getSizeInBits() < 16) {
5867        EltVT = MVT::i16;
5868        NeedTrunc = true;
5869      }
5870
5871      unsigned Opcode = 0;
5872      SDVTList LdResVTs;
5873
5874      switch (NumElts) {
5875      default:
5876        return;
5877      case 2:
5878        switch (IntrinNo) {
5879        default:
5880          return;
5881        case Intrinsic::nvvm_ldg_global_i:
5882        case Intrinsic::nvvm_ldg_global_f:
5883        case Intrinsic::nvvm_ldg_global_p:
5884          Opcode = NVPTXISD::LDGV2;
5885          break;
5886        case Intrinsic::nvvm_ldu_global_i:
5887        case Intrinsic::nvvm_ldu_global_f:
5888        case Intrinsic::nvvm_ldu_global_p:
5889          Opcode = NVPTXISD::LDUV2;
5890          break;
5891        }
5892        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5893        break;
5894      case 4: {
5895        switch (IntrinNo) {
5896        default:
5897          return;
5898        case Intrinsic::nvvm_ldg_global_i:
5899        case Intrinsic::nvvm_ldg_global_f:
5900        case Intrinsic::nvvm_ldg_global_p:
5901          Opcode = NVPTXISD::LDGV4;
5902          break;
5903        case Intrinsic::nvvm_ldu_global_i:
5904        case Intrinsic::nvvm_ldu_global_f:
5905        case Intrinsic::nvvm_ldu_global_p:
5906          Opcode = NVPTXISD::LDUV4;
5907          break;
5908        }
5909        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5910        LdResVTs = DAG.getVTList(ListVTs);
5911        break;
5912      }
5913      }
5914
5915      SmallVector<SDValue, 8> OtherOps;
5916
5917      // Copy regular operands
5918
5919      OtherOps.push_back(Chain); // Chain
5920                                 // Skip operand 1 (intrinsic ID)
5921      // Others
5922      OtherOps.append(N->op_begin() + 2, N->op_end());
5923
5924      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5925
5926      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5927                                              MemSD->getMemoryVT(),
5928                                              MemSD->getMemOperand());
5929
5930      SmallVector<SDValue, 4> ScalarRes;
5931
5932      for (unsigned i = 0; i < NumElts; ++i) {
5933        SDValue Res = NewLD.getValue(i);
5934        if (NeedTrunc)
5935          Res =
5936              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5937        ScalarRes.push_back(Res);
5938      }
5939
5940      SDValue LoadChain = NewLD.getValue(NumElts);
5941
5942      SDValue BuildVec =
5943          DAG.getBuildVector(ResVT, DL, ScalarRes);
5944
5945      Results.push_back(BuildVec);
5946      Results.push_back(LoadChain);
5947    } else {
5948      // i8 LDG/LDU
5949      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
5950             "Custom handling of non-i8 ldu/ldg?");
5951
5952      // Just copy all operands as-is
5953      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
5954
5955      // Force output to i16
5956      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
5957
5958      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5959
5960      // We make sure the memory type is i8, which will be used during isel
5961      // to select the proper instruction.
5962      SDValue NewLD =
5963          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
5964                                  MVT::i8, MemSD->getMemOperand());
5965
5966      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
5967                                    NewLD.getValue(0)));
5968      Results.push_back(NewLD.getValue(1));
5969    }
5970  }
5971  }
5972}
5973
5974void NVPTXTargetLowering::ReplaceNodeResults(
5975    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
5976  switch (N->getOpcode()) {
5977  default:
5978    report_fatal_error("Unhandled custom legalization");
5979  case ISD::LOAD:
5980    ReplaceLoadVector(N, DAG, Results);
5981    return;
5982  case ISD::INTRINSIC_W_CHAIN:
5983    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
5984    return;
5985  }
5986}
5987
5988NVPTXTargetLowering::AtomicExpansionKind
5989NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
5990  Type *Ty = AI->getValOperand()->getType();
5991
5992  if (AI->isFloatingPointOperation()) {
5993    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
5994      if (Ty->isFloatTy())
5995        return AtomicExpansionKind::None;
5996      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
5997        return AtomicExpansionKind::None;
5998    }
5999    return AtomicExpansionKind::CmpXChg;
6000  }
6001
6002  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6003  auto ITy = cast<llvm::IntegerType>(Ty);
6004
6005  switch (AI->getOperation()) {
6006  default:
6007    return AtomicExpansionKind::CmpXChg;
6008  case AtomicRMWInst::BinOp::And:
6009  case AtomicRMWInst::BinOp::Or:
6010  case AtomicRMWInst::BinOp::Xor:
6011  case AtomicRMWInst::BinOp::Xchg:
6012    switch (ITy->getBitWidth()) {
6013    case 8:
6014    case 16:
6015      return AtomicExpansionKind::CmpXChg;
6016    case 32:
6017      return AtomicExpansionKind::None;
6018    case 64:
6019      if (STI.hasAtomBitwise64())
6020        return AtomicExpansionKind::None;
6021      return AtomicExpansionKind::CmpXChg;
6022    default:
6023      llvm_unreachable("unsupported width encountered");
6024    }
6025  case AtomicRMWInst::BinOp::Add:
6026  case AtomicRMWInst::BinOp::Sub:
6027  case AtomicRMWInst::BinOp::Max:
6028  case AtomicRMWInst::BinOp::Min:
6029  case AtomicRMWInst::BinOp::UMax:
6030  case AtomicRMWInst::BinOp::UMin:
6031    switch (ITy->getBitWidth()) {
6032    case 8:
6033    case 16:
6034      return AtomicExpansionKind::CmpXChg;
6035    case 32:
6036      return AtomicExpansionKind::None;
6037    case 64:
6038      if (STI.hasAtomMinMax64())
6039        return AtomicExpansionKind::None;
6040      return AtomicExpansionKind::CmpXChg;
6041    default:
6042      llvm_unreachable("unsupported width encountered");
6043    }
6044  }
6045
6046  return AtomicExpansionKind::CmpXChg;
6047}
6048
6049// Pin NVPTXTargetObjectFile's vtables to this file.
6050NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
6051
6052MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6053    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6054  return getDataSection();
6055}
6056