NVPTXInstrInfo.td revision 360784
11590Srgrimes//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
21590Srgrimes//
31590Srgrimes// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
41590Srgrimes// See https://llvm.org/LICENSE.txt for license information.
51590Srgrimes// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
61590Srgrimes//
71590Srgrimes//===----------------------------------------------------------------------===//
81590Srgrimes//
91590Srgrimes// This file describes the PTX instructions in TableGen format.
101590Srgrimes//
111590Srgrimes//===----------------------------------------------------------------------===//
121590Srgrimes
131590Srgrimesinclude "NVPTXInstrFormats.td"
141590Srgrimes
// NOP: an instruction with no outputs, no inputs, an empty assembly string
// and no selection pattern.  It is marked as having no side effects.
let hasSideEffects = 0 in
def NOP : NVPTXInst<(outs), (ins), "", []>;
191590Srgrimes
// Immediate operand of type f16.
def f16imm : Operand<f16> {
  let OperandType = "OPERAND_IMMEDIATE";
}
231590Srgrimes
// Vector-specific instruction properties.  Each record wraps a distinct
// integer in VecInstTypeEnum: 1 through 7 for the specific kinds, and 15 for
// isVecOther.
def isVecLD : VecInstTypeEnum<1>;
def isVecST : VecInstTypeEnum<2>;
def isVecBuild : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert : VecInstTypeEnum<6>;
def isVecDest : VecInstTypeEnum<7>;
def isVecOther : VecInstTypeEnum<15>;
3374769Smikeh
341590Srgrimes//===----------------------------------------------------------------------===//
3599112Sobrien// NVPTX Operand Definitions.
3699112Sobrien//===----------------------------------------------------------------------===//
371590Srgrimes
// Branch-target operand, typed OtherVT.
def brtarget : Operand<OtherVT>;
391590Srgrimes
// CVT conversion modes.
// These must match the enum in NVPTX.h.
//
// Layout visible in the values below: the low nibble holds the base mode
// (0x0-0x8), each *_FTZ leaf is its base value with 0x10 added, CvtSAT is
// 0x20 and CvtSAT_FTZ is 0x30.

// Base modes.
def CvtNONE : PatLeaf<(i32 0x00)>;
def CvtRNI : PatLeaf<(i32 0x01)>;
def CvtRZI : PatLeaf<(i32 0x02)>;
def CvtRMI : PatLeaf<(i32 0x03)>;
def CvtRPI : PatLeaf<(i32 0x04)>;
def CvtRN : PatLeaf<(i32 0x05)>;
def CvtRZ : PatLeaf<(i32 0x06)>;
def CvtRM : PatLeaf<(i32 0x07)>;
def CvtRP : PatLeaf<(i32 0x08)>;

// FTZ counterparts of the base modes.
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
def CvtRP_FTZ : PatLeaf<(i32 0x18)>;

// Saturating modes.
def CvtSAT : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;

// i32 operand carrying one of the mode values above; rendered by the
// "printCvtMode" print method.
def CvtMode : Operand<i32> {
  let PrintMethod = "printCvtMode";
}
681590Srgrimes
// Compare modes.
// These must match the enum in NVPTX.h.
//
// The base codes are written in hex here so that the relationship to the
// *_FTZ leaves is visible: every *_FTZ value is its base value plus 0x100.

// Base compare codes.
def CmpEQ : PatLeaf<(i32 0x00)>;
def CmpNE : PatLeaf<(i32 0x01)>;
def CmpLT : PatLeaf<(i32 0x02)>;
def CmpLE : PatLeaf<(i32 0x03)>;
def CmpGT : PatLeaf<(i32 0x04)>;
def CmpGE : PatLeaf<(i32 0x05)>;
def CmpEQU : PatLeaf<(i32 0x0A)>;
def CmpNEU : PatLeaf<(i32 0x0B)>;
def CmpLTU : PatLeaf<(i32 0x0C)>;
def CmpLEU : PatLeaf<(i32 0x0D)>;
def CmpGTU : PatLeaf<(i32 0x0E)>;
def CmpGEU : PatLeaf<(i32 0x0F)>;
def CmpNUM : PatLeaf<(i32 0x10)>;
def CmpNAN : PatLeaf<(i32 0x11)>;

// FTZ counterparts.
def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;

// i32 operand carrying a compare-mode value; rendered by "printCmpMode".
def CmpMode : Operand<i32> {
  let PrintMethod = "printCmpMode";
}

// i32 operand rendered by "printVecElement".
def VecElement : Operand<i32> {
  let PrintMethod = "printVecElement";
}
10774769Smikeh
10874769Smikeh//===----------------------------------------------------------------------===//
10974769Smikeh// NVPTX Instruction Predicate Definitions
1101590Srgrimes//===----------------------------------------------------------------------===//
1111590Srgrimes
11277274Smikeh
// Feature queries forwarded to the subtarget.
def hasAtomAddF64    : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope     : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64  : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote          : Predicate<"Subtarget->hasVote()">;
def hasDouble        : Predicate<"Subtarget->hasDouble()">;
def hasLDG           : Predicate<"Subtarget->hasLDG()">;
def hasLDU           : Predicate<"Subtarget->hasLDU()">;

// f32 flush-to-zero selection (complementary pair).
def doF32FTZ   : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;

def doMulWide : Predicate<"doMulWide">;

// FMA contraction (complementary pair) and unsafe FP math.
def allowFMA          : Predicate<"allowFMA()">;
def noFMA             : Predicate<"!allowFMA()">;
def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;

// f32 division level: 0 selects the APPROX predicate, 1 the FULL predicate.
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL   : Predicate<"getDivF32Level()==1">;

// f32 square root (complementary pair).
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN     : Predicate<"usePrecSqrtF32()">;

// 32-bit hardware rotate (complementary pair).
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32  : Predicate<"!Subtarget->hasHWROT32()">;

// Predicate that always holds.
def true : Predicate<"true">;

// Minimum PTX version checks.
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;

// Minimum SM version checks.
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;

// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
// (the condition is written as two adjacent string literals).
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
                          "&& Subtarget->getPTXVersion() >= 64)">;

def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math : Predicate<"Subtarget->allowFP16Math()">;
1591590Srgrimes
1601590Srgrimes//===----------------------------------------------------------------------===//
16177274Smikeh// Some Common Instruction Class Templates
16277274Smikeh//===----------------------------------------------------------------------===//
1631590Srgrimes
// Two-input integer template over 64-, 32- and 16-bit registers.
//
// For each width it defines a register/register record (i<W>rr) and a
// register/immediate record (i<W>ri).  The assembly mnemonic is OpcStr
// immediately followed by the width, e.g. OpcStr "add.s" yields "add.s64".
// Each record's pattern applies OpNode to $a and $b and sets $dst.
multiclass I3<string OpcStr, SDNode OpNode> {
  // 64-bit forms.
  def i64rr : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, Int64Regs:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, i64imm:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, imm:$b))]>;

  // 32-bit forms.
  def i32rr : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, imm:$b))]>;

  // 16-bit forms.
  def i16rr : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, Int16Regs:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def i16ri : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, i16imm:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, imm:$b))]>;
}
1921590Srgrimes
// Two-input 32-bit integer template.  The mnemonic is OpcStr followed by
// ".s32" (e.g. "addc.cc.s32").  Defines a register/register record (i32rr)
// and a register/immediate record (i32ri).
multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
  def i32rr : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # ".s32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # ".s32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, imm:$b))]>;
}
2051590Srgrimes
// Two-input floating-point template for f64 and f32.  Mnemonics are
// "<OpcStr>.f<Width>" (e.g. "min.f64").
//
// The f32 forms come in two flavours: ".ftz" records (flush subnormal inputs
// and results to sign-preserving zero), gated on doF32FTZ, and plain records
// with no predicate.
//
// Use this multiclass for nodes that cannot be folded into FMAs.  Nodes that
// can be folded (i.e. adds and muls) should use F3_fma_component instead.
multiclass F3<string OpcStr, SDNode OpNode> {
  // f64, register and immediate second operand.
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;

  // f32 with flush-to-zero.
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[doF32FTZ]>;

  // f32 without flush-to-zero.
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
}
24932189Sjoerg
// Two-input floating-point template for nodes that can be folded to make fma
// ops.  Mnemonics are "<OpcStr>.f<Width>" (e.g. "add.f64"); f64, f32, f16 and
// f16x2 forms are provided, with ".ftz" variants (flush subnormal inputs and
// results to sign-preserving zero) for f32, f16 and f16x2.
//
// Every record exists twice:
//   * the plain form, gated on allowFMA;
//   * an ".rn" form (record name prefixed "_rn"), gated on noFMA.  The ".rn"
//     op behaves just like the non-".rn" op but prevents ptxas from creating
//     FMAs.
// All ".ftz" records, including the f16/f16x2 ones, additionally require
// doF32FTZ; all f16/f16x2 records additionally require useFP16Math.
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  //===--- Plain forms: FMA contraction allowed -------------------------===//
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[allowFMA]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  def f16rr_ftz :
    NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst,
                    (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst,
                    (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, allowFMA]>;

  def f16x2rr_ftz :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst,
                    (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst,
                    (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, allowFMA]>;

  //===--- ".rn" forms: FMA contraction disabled ------------------------===//
  // These have strange names so we don't perturb existing mir tests.
  def _rnf64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;
  def _rnf32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;
  def _rnf16rr_ftz :
    NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst,
                    (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr :
    NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".rn.f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst,
                    (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, noFMA]>;
  def _rnf16x2rr_ftz :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst,
                    (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst,
                    (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, noFMA]>;
}
3851590Srgrimes
// Single-input floating-point template (one source $a, one result $dst) for
// f64 and f32.  Provides three instructions: <OpcStr>.f64, <OpcStr>.ftz.f32
// (flush subnormal inputs and results to zero; gated on doF32FTZ) and
// <OpcStr>.f32.
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
              OpcStr # ".f64 \t$dst, $a;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
  def f32_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".ftz.f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
    Requires<[doF32FTZ]>;
  def f32 :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}
4011590Srgrimes
4021590Srgrimes//===----------------------------------------------------------------------===//
4031590Srgrimes// NVPTX Instructions.
4041590Srgrimes//===----------------------------------------------------------------------===//
4051590Srgrimes
4061590Srgrimes//-----------------------------------
4071590Srgrimes// Type Conversion
4081590Srgrimes//-----------------------------------
4091590Srgrimes
let hasSideEffects = 0 in {
  // Generate a cvt *to* one destination type *from* every source type.
  //
  //   DstName - type suffix of the destination; it is printed first after
  //             "cvt", ahead of the source suffix.
  //   RC      - register class of the result.
  //
  // Each record is named after its source type (_s8, _u8, ... _f64) and takes
  // a CvtMode immediate that defines the conversion mode to use; it can be
  // CvtNONE to omit a conversion mode.  The 8- and 16-bit integer sources
  // are both read from Int16Regs.  None of the records carries a selection
  // pattern.
  multiclass CVT_FROM_ALL<string DstName, RegisterClass RC> {
    // Integer sources.
    def _s8 :
      NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # DstName
                  # ".s8 \t$dst, $src;",
                []>;
    def _u8 :
      NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # DstName
                  # ".u8 \t$dst, $src;",
                []>;
    def _s16 :
      NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # DstName
                  # ".s16 \t$dst, $src;",
                []>;
    def _u16 :
      NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # DstName
                  # ".u16 \t$dst, $src;",
                []>;
    def _s32 :
      NVPTXInst<(outs RC:$dst), (ins Int32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # DstName
                  # ".s32 \t$dst, $src;",
                []>;
    def _u32 :
      NVPTXInst<(outs RC:$dst), (ins Int32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # DstName
                  # ".u32 \t$dst, $src;",
                []>;
    def _s64 :
      NVPTXInst<(outs RC:$dst), (ins Int64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # DstName
                  # ".s64 \t$dst, $src;",
                []>;
    def _u64 :
      NVPTXInst<(outs RC:$dst), (ins Int64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # DstName
                  # ".u64 \t$dst, $src;",
                []>;

    // Floating-point sources.
    def _f16 :
      NVPTXInst<(outs RC:$dst), (ins Float16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # DstName
                  # ".f16 \t$dst, $src;",
                []>;
    def _f32 :
      NVPTXInst<(outs RC:$dst), (ins Float32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # DstName
                  # ".f32 \t$dst, $src;",
                []>;
    def _f64 :
      NVPTXInst<(outs RC:$dst), (ins Float64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # DstName
                  # ".f64 \t$dst, $src;",
                []>;
  }

  // Generate cvts from all types to all types.  The 8- and 16-bit integer
  // destinations both live in Int16Regs.
  defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
  defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
  defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;

  // These cvts are different from those above: the source and dest registers
  // are of the same type, and there is no CvtMode operand.
  def CVT_INREG_s16_s8 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
              "cvt.s16.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s8 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s16 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s8 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s8 \t$dst, $src;", []>;
  def CVT_INREG_s64_s16 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s32 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s32 \t$dst, $src;", []>;
}
5001590Srgrimes
5011590Srgrimes//-----------------------------------
5021590Srgrimes// Integer Arithmetic
5031590Srgrimes//-----------------------------------
5041590Srgrimes
50577274Smikeh// Template for xor masquerading as int1 arithmetic.
5061590Srgrimesmulticlass ADD_SUB_i1<SDNode OpNode> {
50774769Smikeh   def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
5081590Srgrimes                      "xor.pred \t$dst, $a, $b;",
5091590Srgrimes                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
5101590Srgrimes   def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
51177274Smikeh                      "xor.pred \t$dst, $a, $b;",
5121590Srgrimes                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
5131590Srgrimes}
51477274Smikeh
51577274Smikeh// int1 addition and subtraction are both just xor.
5161590Srgrimesdefm ADD_i1 : ADD_SUB_i1<add>;
5171590Srgrimesdefm SUB_i1 : ADD_SUB_i1<sub>;
5181590Srgrimes
// Signed add/sub for int16, int32 and int64.  Since nvptx is 2's complement,
// the same instructions also serve unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;

// 32-bit add/sub producing a carry-out.
// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;

// 32-bit add/sub consuming a carry-in and producing a carry-out.
defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;

// Low half of the product.
defm MULT : I3<"mul.lo.s", mul>;

// High half of the product, signed and unsigned.
defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

// Division, signed and unsigned.
defm SDIV : I3<"div.s", sdiv>;
defm UDIV : I3<"div.u", udiv>;

// Remainder, signed and unsigned.  The ri versions of rem.s and rem.u won't
// be selected; DAGCombiner::visitSREM will lower it.
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;
5451590Srgrimes
// Integer absolute value, selected from the `abs` node and emitted as a
// single "abs<SizeName>" instruction.
//
//   RC       - register class used for both the source and the result.
//   SizeName - type suffix appended to "abs", including the leading dot
//              (e.g. ".s32").
//
// The def inside the multiclass is anonymous, so the defm names below do not
// become instruction record names.
multiclass ABS<RegisterClass RC, string SizeName> {
  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
                  !strconcat("abs", SizeName, " \t$dst, $a;"),
                  [(set RC:$dst, (abs RC:$a))]>;
}
defm ABS_16 : ABS<Int16Regs, ".s16">;
defm ABS_32 : ABS<Int32Regs, ".s32">;
defm ABS_64 : ABS<Int64Regs, ".s64">;
5571590Srgrimes
// Integer maximum (signed, unsigned) and minimum (signed, unsigned).
defm SMAX : I3<"max.s", smax>;
defm UMAX : I3<"max.u", umax>;

defm SMIN : I3<"min.s", smin>;
defm UMIN : I3<"min.u", umin>;
5631590Srgrimes
//
// Wide multiplication
//

// Shape shared by every mul.wide instruction below: $dst (in DstRC) is
// produced from $a (in SrcRC) and $b, where $b is either a register class or
// an immediate operand.  TypeStr is the PTX type suffix placed after
// "mul.wide.".  No selection pattern is attached to these instructions here.
class MUL_WIDE_INST<string TypeStr, RegisterClass DstRC, RegisterClass SrcRC,
                    DAGOperand RhsTy>
  : NVPTXInst<(outs DstRC:$dst), (ins SrcRC:$a, RhsTy:$b),
              !strconcat("mul.wide.", TypeStr, " \t$dst, $a, $b;"), []>;

// 32-bit sources, 64-bit result, signed.
def MULWIDES64      : MUL_WIDE_INST<"s32", Int64Regs, Int32Regs, Int32Regs>;
def MULWIDES64Imm   : MUL_WIDE_INST<"s32", Int64Regs, Int32Regs, i32imm>;
def MULWIDES64Imm64 : MUL_WIDE_INST<"s32", Int64Regs, Int32Regs, i64imm>;

// 32-bit sources, 64-bit result, unsigned.
def MULWIDEU64      : MUL_WIDE_INST<"u32", Int64Regs, Int32Regs, Int32Regs>;
def MULWIDEU64Imm   : MUL_WIDE_INST<"u32", Int64Regs, Int32Regs, i32imm>;
def MULWIDEU64Imm64 : MUL_WIDE_INST<"u32", Int64Regs, Int32Regs, i64imm>;

// 16-bit sources, 32-bit result, signed.
def MULWIDES32      : MUL_WIDE_INST<"s16", Int32Regs, Int16Regs, Int16Regs>;
def MULWIDES32Imm   : MUL_WIDE_INST<"s16", Int32Regs, Int16Regs, i16imm>;
def MULWIDES32Imm32 : MUL_WIDE_INST<"s16", Int32Regs, Int16Regs, i32imm>;

// 16-bit sources, 32-bit result, unsigned.
def MULWIDEU32      : MUL_WIDE_INST<"u16", Int32Regs, Int16Regs, Int16Regs>;
def MULWIDEU32Imm   : MUL_WIDE_INST<"u16", Int32Regs, Int16Regs, i16imm>;
def MULWIDEU32Imm32 : MUL_WIDE_INST<"u16", Int32Regs, Int16Regs, i32imm>;
6061590Srgrimes
// Type profile of the mul.wide nodes: one result, two operands, and both
// operands constrained to the same type.
def SDTMulWide        : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
def mul_wide_signed   : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;

// Selection of the signed/unsigned mul.wide nodes.  Every pattern in this
// group is gated on doMulWide.
let Predicates = [doMulWide] in {
  // i16 operands, i32 result.
  def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
            (MULWIDES32Imm Int16Regs:$a, imm:$b)>;
  def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
            (MULWIDEU32Imm Int16Regs:$a, imm:$b)>;

  // i32 operands, i64 result.
  def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
            (MULWIDES64Imm Int32Regs:$a, imm:$b)>;
  def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
            (MULWIDEU64Imm Int32Regs:$a, imm:$b)>;
}
6371590Srgrimes
// Immediate predicates used when converting some patterns to mul.wide.

// Immediate representable as a signed 32-bit value.
def SInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(32);
}]>;

// Immediate representable as an unsigned 32-bit value.
def UInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(32);
}]>;

// Immediate representable as a signed 16-bit value.
def SInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(16);
}]>;

// Immediate representable as an unsigned 16-bit value.
def UInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(16);
}]>;

// Shift amount in [0, 32): only then will the result of (x << v) be an int32.
def Int5Const : PatLeaf<(imm), [{
  const APInt &Amt = N->getAPIntValue();
  return Amt.sge(0) && Amt.slt(32);
}]>;

// Shift amount in [0, 16): only then will the result of (x << v) be an int16.
def Int4Const : PatLeaf<(imm), [{
  const APInt &Amt = N->getAPIntValue();
  return Amt.sge(0) && Amt.slt(16);
}]>;
6701590Srgrimes
// Turn a shift-amount immediate into the equivalent multiplier, i.e. the
// 32-bit value 1 shifted left by that amount, as an i32 target constant.
def SHL2MUL32 : SDNodeXForm<imm, [{
  APInt One(32, 1);
  return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                   MVT::i32);
}]>;

// Same as SHL2MUL32, but computed in 16 bits and returned as an i16 target
// constant.
def SHL2MUL16 : SDNodeXForm<imm, [{
  APInt One(16, 1);
  return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                   MVT::i16);
}]>;
6821590Srgrimes
// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
//
// The shift is rewritten as a multiplication by (1 << amt), with the
// multiplier materialised by SHL2MUL32 / SHL2MUL16 in the *narrow* type.  For
// the signed forms the largest shift amount must be excluded: 1 << 31 (resp.
// 1 << 15) is the sign bit of the narrow type, so a signed mul.wide would
// multiply by -2^31 (resp. -2^15) and produce the negated result.  The
// unsigned forms can use the full range accepted by Int5Const / Int4Const.

// Shift amount in [0, 31): 1 << amt is a positive signed 32-bit value.
def SMulWideShAmt32 : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(31);
}]>;

// Shift amount in [0, 15): 1 << amt is a positive signed 16-bit value.
def SMulWideShAmt16 : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(15);
}]>;

def : Pat<(shl (sext Int32Regs:$a), (i32 SMulWideShAmt32:$b)),
          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
      Requires<[doMulWide]>;

def : Pat<(shl (sext Int16Regs:$a), (i16 SMulWideShAmt16:$b)),
          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
      Requires<[doMulWide]>;
6971590Srgrimes
// Convert "sign/zero-extend then multiply" to mul.wide.  All of these
// patterns are gated on doMulWide.  The immediate forms require the constant
// to fit the narrow source type (signed or unsigned to match the extension).
let Predicates = [doMulWide] in {
  // i32 sources, signed.
  def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
            (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>;

  // i32 sources, unsigned.
  def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
            (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>;

  // i16 sources, signed.
  def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
            (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>;

  // i16 sources, unsigned.
  def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
            (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>;
}
7261590Srgrimes
//
// Integer multiply-add
//

// imad has one result and three operands; the result and all three operands
// share one integer type.
def SDTIMAD : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
                                   SDTCisInt<0>,
                                   SDTCisInt<2>,
                                   SDTCisSameAs<0, 2>,
                                   SDTCisSameAs<0, 3>]>;
def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;

// Emits the four operand-kind variants of one mad.lo instruction.  The suffix
// letters describe operands $a, $b, $c in order: r = register in RC,
// i = immediate of kind ImmTy.  $a is always a register.
multiclass MAD_LO<string Opc, RegisterClass RC, Operand ImmTy> {
  def rrr :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
              !strconcat(Opc, " \t$dst, $a, $b, $c;"),
              [(set RC:$dst, (imad RC:$a, RC:$b, RC:$c))]>;
  def rri :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, ImmTy:$c),
              !strconcat(Opc, " \t$dst, $a, $b, $c;"),
              [(set RC:$dst, (imad RC:$a, RC:$b, imm:$c))]>;
  def rir :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmTy:$b, RC:$c),
              !strconcat(Opc, " \t$dst, $a, $b, $c;"),
              [(set RC:$dst, (imad RC:$a, imm:$b, RC:$c))]>;
  def rii :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmTy:$b, ImmTy:$c),
              !strconcat(Opc, " \t$dst, $a, $b, $c;"),
              [(set RC:$dst, (imad RC:$a, imm:$b, imm:$c))]>;
}

// Produces MAD16rrr/rri/rir/rii, MAD32rrr/..., MAD64rrr/...
defm MAD16 : MAD_LO<"mad.lo.s16", Int16Regs, i16imm>;
defm MAD32 : MAD_LO<"mad.lo.s32", Int32Regs, i32imm>;
defm MAD64 : MAD_LO<"mad.lo.s64", Int64Regs, i64imm>;
797
// Integer negation: "neg.<TypeStr> $dst, $src" on register class RC, selected
// for (ineg RC:$src).
class INEG_INST<string TypeStr, RegisterClass RC>
  : NVPTXInst<(outs RC:$dst), (ins RC:$src),
              !strconcat("neg.", TypeStr, " \t$dst, $src;"),
              [(set RC:$dst, (ineg RC:$src))]>;

def INEG16 : INEG_INST<"s16", Int16Regs>;
def INEG32 : INEG_INST<"s32", Int32Regs>;
def INEG64 : INEG_INST<"s64", Int64Regs>;
810
811//-----------------------------------
812// Floating Point Arithmetic
813//-----------------------------------
814
// Matches an FP immediate that is an IEEE single with value 1.0f.  The
// semantics check comes first so the float conversion is only attempted on
// single-precision values.
def FloatConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEsingle() &&
         Val.convertToFloat() == 1.0f;
}]>;

// Matches an FP immediate that is an IEEE double with value 1.0.
def DoubleConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEdouble() &&
         Val.convertToDouble() == 1.0;
}]>;
825
// Materialises an FP16 constant in a register.
//
// ptxas has no hex representation for fp16, so fp16 immediates cannot be used
// directly in .f16 instructions; the constant is moved into a register with
// mov.b16 instead.  No selection pattern is attached here.
def LOAD_CONST_F16 : NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a),
                               "mov.b16 \t$dst, $a;",
                               []>;

// Binary FP arithmetic.
defm FADD  : F3_fma_component<"add", fadd>;
defm FSUB  : F3_fma_component<"sub", fsub>;
defm FMUL  : F3_fma_component<"mul", fmul>;

// FP minimum / maximum.
defm FMIN  : F3<"min", fminnum>;
defm FMAX  : F3<"max", fmaxnum>;

// Unary FP operations.
defm FABS  : F2<"abs", fabs>;
defm FNEG  : F2<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;
845
//
// F64 division
//

// 1.0 / $b: emitted as a reciprocal; the constant operand $a is matched but
// does not appear in the assembly string.
def FDIV641r : NVPTXInst<(outs Float64Regs:$dst),
                         (ins f64imm:$a, Float64Regs:$b),
                         "rcp.rn.f64 \t$dst, $b;",
                         [(set Float64Regs:$dst,
                               (fdiv DoubleConst1:$a, Float64Regs:$b))]>;

// Register / register.
def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst),
                         (ins Float64Regs:$a, Float64Regs:$b),
                         "div.rn.f64 \t$dst, $a, $b;",
                         [(set Float64Regs:$dst,
                               (fdiv Float64Regs:$a, Float64Regs:$b))]>;

// Register / immediate.
def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst),
                         (ins Float64Regs:$a, f64imm:$b),
                         "div.rn.f64 \t$dst, $a, $b;",
                         [(set Float64Regs:$dst,
                               (fdiv Float64Regs:$a, fpimm:$b))]>;
864
//
// F32 Approximate reciprocal
//
// Definition order is kept as-is: several of these instructions carry the
// same selection pattern and differ only in their predicates.
let Predicates = [do_DIVF32_APPROX, doF32FTZ] in
def FDIV321r_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;

let Predicates = [do_DIVF32_APPROX] in
def FDIV321r :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;

//
// F32 Approximate division
//
let Predicates = [do_DIVF32_APPROX, doF32FTZ] in {
  def FDIV32approxrr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.approx.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32approxri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.approx.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}

let Predicates = [do_DIVF32_APPROX] in {
  def FDIV32approxrr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.approx.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32approxri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.approx.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
//
// F32 Semi-accurate reciprocal
//
// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
//
// Definition order is kept as-is: several of these instructions carry the
// same selection pattern and differ only in their predicates.
let Predicates = [do_DIVF32_FULL, doF32FTZ] in
def FDIV321r_approx_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;

let Predicates = [do_DIVF32_FULL] in
def FDIV321r_approx :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;

//
// F32 Semi-accurate division
//
let Predicates = [do_DIVF32_FULL, doF32FTZ] in {
  def FDIV32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.full.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.full.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}

let Predicates = [do_DIVF32_FULL] in {
  def FDIV32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.full.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.full.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
//
// F32 Accurate reciprocal
//
// Definition order is kept as-is: the FTZ variant of each form precedes the
// unpredicated one that carries the same selection pattern.
let Predicates = [doF32FTZ] in
def FDIV321r_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;

def FDIV321r_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;

//
// F32 Accurate division
//
let Predicates = [doF32FTZ] in {
  def FDIV32rr_prec_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri_prec_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}

def FDIV32rr_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
def FDIV32ri_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
990
991//
992// FMA
993//
994
// Fused multiply-add on RC with every operand-kind combination.  The suffix
// letters describe $a, $b, $c in order: r = register in RC, i = immediate of
// kind ImmCls.  All variants are gated on Pred.
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
  let Predicates = [Pred] in {
    def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>;
    def rri : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, ImmCls:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>;
    def rir : NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, RC:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>;
    def rii : NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, ImmCls:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>;
  }
}

// Register-only fused multiply-add for the f16 register classes.  Gated on
// useFP16Math in addition to Pred.
multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
  let Predicates = [useFP16Math, Pred] in
  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>;
}

// FTZ variants are defined ahead of their non-FTZ counterparts.
defm FMA16_ftz   : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
defm FMA16       : FMA_F16<"fma.rn.f16", Float16Regs, true>;
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
defm FMA16x2     : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>;
defm FMA32_ftz   : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32       : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
defm FMA64       : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
1031
// f32 sine and cosine via the PTX approximate instructions.  Both are only
// selectable under allowUnsafeFPMath.
let Predicates = [allowUnsafeFPMath] in {
  def SINF : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                       "sin.approx.f32 \t$dst, $src;",
                       [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
  def COSF : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                       "cos.approx.f32 \t$dst, $src;",
                       [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
}
1041
// Lower (frem x, y) into (sub x, (mul (trunc (div x, y)) y)),
// i.e. "poor man's fmod()".
//
// The quotient must be rounded toward zero (CvtRZI), not toward negative
// infinity (CvtRMI): frem follows fmod, whose result takes the sign of the
// dividend.  With floor, operands of opposite sign give the wrong answer,
// e.g. frem(-5.5, 2.0) would compute -5.5 - floor(-2.75) * 2.0 = 0.5 instead
// of -5.5 - trunc(-2.75) * 2.0 = -1.5.

// frem - f32 FTZ
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
             Float32Regs:$y))>,
          Requires<[doF32FTZ]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
             fpimm:$y))>,
          Requires<[doF32FTZ]>;

// frem - f32
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
             Float32Regs:$y))>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
             fpimm:$y))>;

// frem - f64
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
             Float64Regs:$y))>;
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
             fpimm:$y))>;
1076
1077//-----------------------------------
1078// Bitwise operations
1079//-----------------------------------
1080
// Template for two-source bitwise operations (destination plus two sources).
// Creates .pred (predicate registers -- i.e., i1), .b16, .b32 and .b64
// versions of OpcStr, each in a register/register (rr) and a
// register/immediate (ri) form.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
  // i1
  def b1rr  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
                        OpcStr # ".pred  \t$dst, $a, $b;",
                        [(set Int1Regs:$dst,
                              (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def b1ri  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
                        OpcStr # ".pred  \t$dst, $a, $b;",
                        [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
  // i16
  def b16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
                        OpcStr # ".b16  \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def b16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
                        OpcStr # ".b16  \t$dst, $a, $b;",
                        [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
  // i32
  def b32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # ".b32  \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def b32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # ".b32  \t$dst, $a, $b;",
                        [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
  // i64
  def b64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
                        OpcStr # ".b64  \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def b64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
                        OpcStr # ".b64  \t$dst, $a, $b;",
                        [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
}

defm OR  : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;
1121
// Bitwise complement for i1 (predicate), i16, i32 and i64 registers.
def NOT1 :
  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
            "not.pred \t$dst, $src;",
            [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
def NOT16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "not.b16 \t$dst, $src;",
            [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
def NOT32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
            "not.b32 \t$dst, $src;",
            [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
def NOT64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
            "not.b64 \t$dst, $src;",
            [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
1134
// Template for left/right shifts.  Each instruction has three operands:
//   [dest (reg), src (reg), shift (reg or imm)].
// dest and src may be int64, int32, or int16; the shift amount is always
// int32.
//
// In addition, a 32-bit (imm, imm) form is defined.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
  // 64-bit value.
  def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, Int32Regs:$b))]>;
  def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
  // 32-bit value.
  def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
  def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
  // 16-bit value.
  def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, Int32Regs:$b))]>;
  def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}

defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;
1174
// Bit reversal of a 32-bit or 64-bit register, selected for bitreverse.
def BREV32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
                       "brev.b32 \t$dst, $a;",
                       [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>;
def BREV64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
                       "brev.b64 \t$dst, $a;",
                       [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
1184
//
// Rotate: Use ptx shf instruction if available.
//
// All four instructions below are gated on hasHWROT32 and pass $src as both
// data operands of shf with the .wrap modifier.
let Predicates = [hasHWROT32] in {
  // 32 bit r2 = rotl r1, n
  //    =>
  //        r2 = shf.l r1, r1, n
  def ROTL32imm_hw :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>;
  def ROTL32reg_hw :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>;

  // 32 bit r2 = rotr r1, n
  //    =>
  //        r2 = shf.r r1, r1, n
  def ROTR32imm_hw :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
              "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>;
  def ROTR32reg_hw :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
              "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>;
}
1218
// 32-bit software rotate by immediate: shifts $src left by $amt1 and right by
// $amt2 into two scratch registers and adds them.  $amt2 should equal
// 32 - $amt1.  The instruction itself has no selection pattern; it is only
// reached through the two Pats below.
def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
                            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
                            "{{\n\t"
                            ".reg .b32 %lhs;\n\t"
                            ".reg .b32 %rhs;\n\t"
                            "shl.b32 \t%lhs, $src, $amt1;\n\t"
                            "shr.b32 \t%rhs, $src, $amt2;\n\t"
                            "add.u32 \t$dst, %lhs, %rhs;\n\t"
                            "}}",
                            []>;

// Maps an immediate N to the i32 target constant 32 - N.
def SUB_FRM_32 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N),
                                   MVT::i32);
}]>;

// Without hardware rotate support, both rotate directions by an immediate use
// ROT32imm_sw; rotr simply swaps the two shift amounts.
let Predicates = [noHWROT32] in {
  def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
            (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>;
  def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
            (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>;
}
1242
// 32-bit software rotates by a register amount, selected only under the
// noHWROT32 predicate.  Both expand to the same sequence: shift the source
// by $amt in the rotate direction, compute %amt2 = 32 - $amt, shift the
// source by %amt2 in the opposite direction, and add the two halves.
let Predicates = [noHWROT32] in {
  // Rotate left: shl by $amt, shr by 32 - $amt.
  def ROTL32reg_sw
    : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
                "{{\n\t"
                ".reg .b32 %lhs;\n\t"
                ".reg .b32 %rhs;\n\t"
                ".reg .b32 %amt2;\n\t"
                "shl.b32 \t%lhs, $src, $amt;\n\t"
                "sub.s32 \t%amt2, 32, $amt;\n\t"
                "shr.b32 \t%rhs, $src, %amt2;\n\t"
                "add.u32 \t$dst, %lhs, %rhs;\n\t"
                "}}",
                [(set Int32Regs:$dst,
                      (rotl Int32Regs:$src, Int32Regs:$amt))]>;

  // Rotate right: shr by $amt, shl by 32 - $amt.
  def ROTR32reg_sw
    : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
                "{{\n\t"
                ".reg .b32 %lhs;\n\t"
                ".reg .b32 %rhs;\n\t"
                ".reg .b32 %amt2;\n\t"
                "shr.b32 \t%lhs, $src, $amt;\n\t"
                "sub.s32 \t%amt2, 32, $amt;\n\t"
                "shl.b32 \t%rhs, $src, %amt2;\n\t"
                "add.u32 \t$dst, %lhs, %rhs;\n\t"
                "}}",
                [(set Int32Regs:$dst,
                      (rotr Int32Regs:$src, Int32Regs:$amt))]>;
}
1272
// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
//
// Same shl/shr/add expansion as the 32-bit version, on .b64 temporaries.
// The shift amounts stay 32-bit immediates.  No pattern is attached here;
// the rotl/rotr patterns below produce this instruction.
def ROT64imm_sw :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            "shl.b64 \t%lhs, $src, $amt1;\n\t"
            "shr.b64 \t%rhs, $src, $amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;

// Transforms an immediate N into the i32 target constant (64 - N), i.e. the
// complementary shift amount for a 64-bit rotate.
def SUB_FRM_64 : SDNodeXForm<imm, [{
    return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// 64-bit rotate-by-immediate.  Unlike the 32-bit patterns these carry no
// predicate, so they apply regardless of hasHWROT32/noHWROT32.
def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;

// 64-bit software rotate left by register.
// The rotate amount is a 32-bit register; %amt2 = 64 - $amt is computed in a
// scratch .u32 register and used for the opposite-direction shift.
def ROTL64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "shl.b64 \t%lhs, $src, $amt;\n\t"
            "sub.u32 \t%amt2, 64, $amt;\n\t"
            "shr.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;

// 64-bit software rotate right by register.
// Mirror image of ROTL64reg_sw: shr by $amt, shl by 64 - $amt.
def ROTR64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "shr.b64 \t%lhs, $src, $amt;\n\t"
            "sub.u32 \t%amt2, 64, $amt;\n\t"
            "shl.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
1321
1322//
// Funnel shift in clamp mode
1324//
1325
// Create SDNodes so they can be used in the DAG code, e.g.
// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
// Both nodes use the SDTIntShiftDOp type profile and carry no node
// properties.
def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;

// Left funnel shift in clamp mode: selects FUN_SHFL_CLAMP to
// "shf.l.clamp.b32 dst, lo, hi, amt" with all operands in 32-bit registers.
def FUNSHFLCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
              (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;

// Right funnel shift in clamp mode: selects FUN_SHFR_CLAMP to
// "shf.r.clamp.b32 dst, lo, hi, amt" with all operands in 32-bit registers.
def FUNSHFRCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
             (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
1344
1345//
1346// BFE - bit-field extract
1347//
1348
// Template for BFE instructions.  Takes four args,
//   [dest (reg), src (reg), start (reg or imm), len (reg or imm)].
// Per the PTX ISA, the third operand of bfe is the starting bit position of
// the field and the fourth is the field length in bits (not an end
// position).  Start may be an imm only if len is also an imm.  FIXME: Is
// this a restriction in PTX?
//
// dest and src may be int32 or int64, but start and len are always int32.
//
// Variants (suffix gives the kind of $a, $b, $c):
//   rrr - start and len both in registers
//   rri - start in a register, len an immediate
//   rii - start and len both immediates
// None of the variants has a selection pattern.
multiclass BFE<string TyStr, RegisterClass RC> {
  def rrr
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
  def rri
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, i32imm:$c),
                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
  def rii
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, i32imm:$b, i32imm:$c),
                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
}

// Signed and unsigned bit-field extract for 32- and 64-bit registers.
let hasSideEffects = 0 in {
  defm BFE_S32 : BFE<"s32", Int32Regs>;
  defm BFE_U32 : BFE<"u32", Int32Regs>;
  defm BFE_S64 : BFE<"s64", Int64Regs>;
  defm BFE_U64 : BFE<"u64", Int64Regs>;
}
1376
1377//-----------------------------------
1378// Comparison instructions (setp, set)
1379//-----------------------------------
1380
1381// FIXME: This doesn't cover versions of set and setp that combine with a
1382// boolean predicate, e.g. setp.eq.and.b16.
1383
1384let hasSideEffects = 0 in {
1385  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
1386    def rr :
1387      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
1388                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1389                           " \t$dst, $a, $b;"), []>;
1390    def ri :
1391      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1392                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1393                           " \t$dst, $a, $b;"), []>;
1394    def ir :
1395      NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1396                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1397                           " \t$dst, $a, $b;"), []>;
1398  }
1399}
1400
1401defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
1402defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
1403defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
1404defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
1405defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
1406defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
1407defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
1408defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
1409defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
1410defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
1411defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
1412def SETP_f16rr :
1413      NVPTXInst<(outs Int1Regs:$dst),
1414                (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
1415                "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
1416                []>, Requires<[useFP16Math]>;
1417
1418def SETP_f16x2rr :
1419      NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
1420                (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
1421                "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
1422                []>,
1423                Requires<[useFP16Math]>;
1424
1425
1426// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
1427// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
1428// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
1429
1430let hasSideEffects = 0 in {
1431  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
1432    def rr : NVPTXInst<(outs Int32Regs:$dst),
1433                       (ins RC:$a, RC:$b, CmpMode:$cmp),
1434                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
1435    def ri : NVPTXInst<(outs Int32Regs:$dst),
1436                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1437                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
1438    def ir : NVPTXInst<(outs Int32Regs:$dst),
1439                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1440                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
1441  }
1442}
1443
1444defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
1445defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
1446defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
1447defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
1448defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
1449defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
1450defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
1451defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
1452defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
1453defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
1454defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
1455defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
1456
1457//-----------------------------------
1458// Selection instructions (selp)
1459//-----------------------------------
1460
1461// FIXME: Missing slct
1462
1463// selp instructions that don't have any pattern matches; we explicitly use
1464// them within this file.
1465let hasSideEffects = 0 in {
1466  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
1467    def rr : NVPTXInst<(outs RC:$dst),
1468                       (ins RC:$a, RC:$b, Int1Regs:$p),
1469                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
1470    def ri : NVPTXInst<(outs RC:$dst),
1471                       (ins RC:$a, ImmCls:$b, Int1Regs:$p),
1472                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
1473    def ir : NVPTXInst<(outs RC:$dst),
1474                       (ins ImmCls:$a, RC:$b, Int1Regs:$p),
1475                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
1476    def ii : NVPTXInst<(outs RC:$dst),
1477                       (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
1478                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
1479  }
1480
1481  multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
1482                          SDNode ImmNode> {
1483    def rr :
1484      NVPTXInst<(outs RC:$dst),
1485                (ins RC:$a, RC:$b, Int1Regs:$p),
1486                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
1487                [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
1488    def ri :
1489      NVPTXInst<(outs RC:$dst),
1490                (ins RC:$a, ImmCls:$b, Int1Regs:$p),
1491                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
1492                [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
1493    def ir :
1494      NVPTXInst<(outs RC:$dst),
1495                (ins ImmCls:$a, RC:$b, Int1Regs:$p),
1496                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
1497                [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
1498    def ii :
1499      NVPTXInst<(outs RC:$dst),
1500                (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
1501                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
1502                [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
1503  }
1504}
1505
1506// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
1507// good.
1508defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
1509defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
1510defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
1511defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
1512defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
1513defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
1514defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
1515defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
1516defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
1517defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;
1518defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
1519defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
1520
1521def SELP_f16x2rr :
1522    NVPTXInst<(outs Float16x2Regs:$dst),
1523              (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
1524              "selp.b32 \t$dst, $a, $b, $p;",
1525              [(set Float16x2Regs:$dst,
1526                    (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>;
1527
1528//-----------------------------------
1529// Data Movement (Load / Store, Move)
1530//-----------------------------------
1531
1532def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
1533                            [SDNPWantRoot]>;
1534def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
1535                              [SDNPWantRoot]>;
1536def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
1537
1538def MEMri : Operand<i32> {
1539  let PrintMethod = "printMemOperand";
1540  let MIOperandInfo = (ops Int32Regs, i32imm);
1541}
1542def MEMri64 : Operand<i64> {
1543  let PrintMethod = "printMemOperand";
1544  let MIOperandInfo = (ops Int64Regs, i64imm);
1545}
1546
1547def imem : Operand<iPTR> {
1548  let PrintMethod = "printOperand";
1549}
1550
1551def imemAny : Operand<iPTRAny> {
1552  let PrintMethod = "printOperand";
1553}
1554
1555def LdStCode : Operand<i32> {
1556  let PrintMethod = "printLdStCode";
1557}
1558
1559def MmaCode : Operand<i32> {
1560  let PrintMethod = "printMmaCode";
1561}
1562
1563def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
1564def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
1565
1566// Load a memory address into a u32 or u64 register.
1567def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
1568                         "mov.u32 \t$dst, $a;",
1569                         [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
1570def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
1571                           "mov.u64 \t$dst, $a;",
1572                           [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
1573
1574// Get pointer to local stack.
1575let hasSideEffects = 0 in {
1576  def MOV_DEPOT_ADDR :    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
1577                                     "mov.u32 \t$d, __local_depot$num;", []>;
1578  def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
1579                                    "mov.u64 \t$d, __local_depot$num;", []>;
1580}
1581
1582
1583// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
1584let IsSimpleMove=1, hasSideEffects=0 in {
1585  def IMOV1rr :  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
1586                           "mov.pred \t$dst, $sss;", []>;
1587  def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
1588                           "mov.u16 \t$dst, $sss;", []>;
1589  def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
1590                           "mov.u32 \t$dst, $sss;", []>;
1591  def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
1592                           "mov.u64 \t$dst, $sss;", []>;
1593
1594  def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src),
1595                           // We have to use .b16 here as there's no mov.f16.
1596                           "mov.b16 \t$dst, $src;", []>;
1597  def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1598                           "mov.f32 \t$dst, $src;", []>;
1599  def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
1600                           "mov.f64 \t$dst, $src;", []>;
1601}
1602
1603def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
1604                        "mov.pred \t$dst, $src;",
1605                        [(set Int1Regs:$dst, imm:$src)]>;
1606def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
1607                         "mov.u16 \t$dst, $src;",
1608                         [(set Int16Regs:$dst, imm:$src)]>;
1609def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
1610                         "mov.u32 \t$dst, $src;",
1611                         [(set Int32Regs:$dst, imm:$src)]>;
1612def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
1613                        "mov.u64 \t$dst, $src;",
1614                        [(set Int64Regs:$dst, imm:$src)]>;
1615
1616def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
1617                         "mov.f32 \t$dst, $src;",
1618                         [(set Float32Regs:$dst, fpimm:$src)]>;
1619def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
1620                         "mov.f64 \t$dst, $src;",
1621                         [(set Float64Regs:$dst, fpimm:$src)]>;
1622
// An external symbol wrapped in NVPTXISD::Wrapper is materialized by moving
// the symbol into a register.  Provide a pattern for each pointer width so
// that the wrapper is selectable with both 32-bit and 64-bit addresses.
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64i texternalsym:$dst)>;
1624
1625//---- Copy Frame Index ----
1626def LEA_ADDRi :   NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
1627                            "add.u32 \t$dst, ${addr:add};",
1628                            [(set Int32Regs:$dst, ADDRri:$addr)]>;
1629def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
1630                            "add.u64 \t$dst, ${addr:add};",
1631                            [(set Int64Regs:$dst, ADDRri64:$addr)]>;
1632
1633//-----------------------------------
1634// Comparison and Selection
1635//-----------------------------------
1636
1637multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
1638                       Instruction setp_16rr,
1639                       Instruction setp_16ri,
1640                       Instruction setp_16ir,
1641                       Instruction setp_32rr,
1642                       Instruction setp_32ri,
1643                       Instruction setp_32ir,
1644                       Instruction setp_64rr,
1645                       Instruction setp_64ri,
1646                       Instruction setp_64ir,
1647                       Instruction set_16rr,
1648                       Instruction set_16ri,
1649                       Instruction set_16ir,
1650                       Instruction set_32rr,
1651                       Instruction set_32ri,
1652                       Instruction set_32ir,
1653                       Instruction set_64rr,
1654                       Instruction set_64ri,
1655                       Instruction set_64ir> {
1656  // i16 -> pred
1657  def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
1658            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
1659  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
1660            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
1661  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
1662            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
1663  // i32 -> pred
1664  def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
1665            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
1666  def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
1667            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
1668  def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
1669            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
1670  // i64 -> pred
1671  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
1672            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
1673  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
1674            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
1675  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
1676            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
1677
1678  // i16 -> i32
1679  def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
1680            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
1681  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
1682            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
1683  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
1684            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
1685  // i32 -> i32
1686  def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
1687            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
1688  def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
1689            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
1690  def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
1691            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
1692  // i64 -> i32
1693  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
1694            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
1695  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
1696            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
1697  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
1698            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
1699}
1700
1701multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
1702  : ISET_FORMAT<OpNode, Mode,
1703                SETP_s16rr, SETP_s16ri, SETP_s16ir,
1704                SETP_s32rr, SETP_s32ri, SETP_s32ir,
1705                SETP_s64rr, SETP_s64ri, SETP_s64ir,
1706                SET_s16rr, SET_s16ri, SET_s16ir,
1707                SET_s32rr, SET_s32ri, SET_s32ir,
1708                SET_s64rr, SET_s64ri, SET_s64ir> {
1709  // TableGen doesn't like empty multiclasses.
1710  def : PatLeaf<(i32 0)>;
1711}
1712
1713multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
1714  : ISET_FORMAT<OpNode, Mode,
1715                SETP_u16rr, SETP_u16ri, SETP_u16ir,
1716                SETP_u32rr, SETP_u32ri, SETP_u32ir,
1717                SETP_u64rr, SETP_u64ri, SETP_u64ir,
1718                SET_u16rr, SET_u16ri, SET_u16ir,
1719                SET_u32rr, SET_u32ri, SET_u32ir,
1720                SET_u64rr, SET_u64ri, SET_u64ir> {
1721  // TableGen doesn't like empty multiclasses.
1722  def : PatLeaf<(i32 0)>;
1723}
1724
1725defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
1726defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
1727defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
1728defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
1729defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
1730defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
1731defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
1732defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
1733defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
1734defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
1735defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
1736defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
1737
// i1 compares
// Two predicates are unequal exactly when their XOR is true, so both the
// signed and the unsigned "not equal" condition codes map to XORb1rr.
def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;

// Equality is the negation of the XOR.
def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
1748
// i1 compare -> i32
// The XOR of the two predicates is true exactly when they differ.  It is
// used as the selp predicate: setne yields -1 when the XOR is true and 0
// otherwise, seteq yields 0 when the XOR is true and -1 otherwise.
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
1754
1755
1756
1757multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
1758  // f16 -> pred
1759  def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
1760            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
1761        Requires<[useFP16Math,doF32FTZ]>;
1762  def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
1763            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
1764        Requires<[useFP16Math]>;
1765  def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
1766            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
1767        Requires<[useFP16Math,doF32FTZ]>;
1768  def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
1769            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
1770        Requires<[useFP16Math]>;
1771  def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
1772            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
1773        Requires<[useFP16Math,doF32FTZ]>;
1774  def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
1775            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
1776        Requires<[useFP16Math]>;
1777
1778  // f32 -> pred
1779  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
1780            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
1781        Requires<[doF32FTZ]>;
1782  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
1783            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
1784  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
1785            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
1786        Requires<[doF32FTZ]>;
1787  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
1788            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
1789  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
1790            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
1791        Requires<[doF32FTZ]>;
1792  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
1793            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
1794
1795  // f64 -> pred
1796  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
1797            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
1798  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
1799            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
1800  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
1801            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
1802
1803  // f16 -> i32
1804  def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
1805            (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
1806        Requires<[useFP16Math, doF32FTZ]>;
1807  def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
1808            (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
1809        Requires<[useFP16Math]>;
1810  def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
1811            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
1812        Requires<[useFP16Math, doF32FTZ]>;
1813  def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
1814            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
1815        Requires<[useFP16Math]>;
1816  def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
1817            (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
1818        Requires<[useFP16Math, doF32FTZ]>;
1819  def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
1820            (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
1821        Requires<[useFP16Math]>;
1822
1823  // f32 -> i32
1824  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
1825            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
1826        Requires<[doF32FTZ]>;
1827  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
1828            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
1829  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
1830            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
1831        Requires<[doF32FTZ]>;
1832  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
1833            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
1834  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
1835            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
1836        Requires<[doF32FTZ]>;
1837  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
1838            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
1839
1840  // f64 -> i32
1841  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
1842            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
1843  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
1844            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
1845  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
1846            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
1847}
1848
1849defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
1850defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
1851defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
1852defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
1853defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
1854defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
1855
1856defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
1857defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
1858defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
1859defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
1860defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
1861defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
1862
1863defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
1864defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
1865defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
1866defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
1867defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
1868defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
1869
1870defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
1871defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
1872
1873// FIXME: What is this doing here?  Can it be deleted?
1874// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
1875//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
1876
1877def SDTDeclareParamProfile :
1878  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
1879def SDTDeclareScalarParamProfile :
1880  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
1881def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
1882def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
1883def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
1884def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
1885def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
1886def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
1887def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
1888def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
1889def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
1890def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
1891def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
1892def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
1893def SDTCallValProfile : SDTypeProfile<1, 0, []>;
1894def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
1895def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
1896def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
1897def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
1898def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
1899def SDTProxyRegProfile : SDTypeProfile<1, 1, []>;
1900
1901def DeclareParam :
1902  SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
1903         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1904def DeclareScalarParam :
1905  SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
1906         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1907def DeclareRetParam :
1908  SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
1909         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1910def DeclareRet :
1911  SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
1912         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Param load nodes (scalar, 2-element, 4-element).  They are chained, glued
// on both sides and flagged SDNPMayLoad; unlike the store/declare nodes they
// do not carry SDNPSideEffect.
def LoadParam :
  SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV2 :
  SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV4 :
  SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
// Nodes that print the head of a call instruction.  There are four flavours:
// plain / convergent, each with a "Uni" counterpart.  The convergent nodes
// reuse the type profile of their non-convergent sibling.
def PrintCall :
  SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCall :
  SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintCallUni :
  SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCallUni :
  SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Param store nodes (scalar, 2-element, 4-element) plus the U32 / S32
// variants, which share SDTStoreParam32Profile.  All are chained, glued on
// both sides and marked as having side effects.
def StoreParam :
  SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV2 :
  SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV4 :
  SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamU32 :
  SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamS32 :
  SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Nodes for the textual pieces of a call: argument-list open, individual
// arguments, argument-list close, the callee, the prototype reference and
// the call-with-value form.  All are chained, glued on both sides and marked
// as having side effects.
// Note that CallArgEnd and Prototype use SDTCallVoidProfile (one operand),
// the same profile as CallVoid.
def CallArgBegin :
  SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArg :
  SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LastCallArg :
  SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgEnd :
  SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVoid :
  SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def Prototype :
  SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVal :
  SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// MoveParam is the only node in this group with an empty property list (no
// chain, no glue, no side effects).
def MoveParam :
  SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
// Return-value stores: chained and side-effecting, but not glued.
def StoreRetval :
  SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
         [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV2 :
  SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
         [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV4 :
  SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
         [SDNPHasChain, SDNPSideEffect]>;
def PseudoUseParam :
  SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// RETURN takes no operands and produces no results (SDTCallArgMarkProfile).
def RETURNNode :
  SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
         [SDNPHasChain, SDNPSideEffect]>;
def ProxyReg :
  SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1990
// Instruction classes printing `ld.param` from the `retval0` symbol at byte
// offset $b.  `opstr` is the PTX type suffix (e.g. ".b32"); `regclass` is the
// register class of every destination.  No selection patterns are attached.
let mayLoad = 1 in {
  // Scalar form: one destination register.
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst),
                  (ins i32imm:$b),
                  "ld.param" # opstr # " \t$dst, [retval0+$b];",
                  []>;

  // Two-element vector form.
  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst, regclass:$dst2),
                  (ins i32imm:$b),
                  "ld.param.v2" # opstr # " \t{{$dst, $dst2}}, [retval0+$b];",
                  []>;

  // Four-element vector form.
  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst, regclass:$dst2,
                        regclass:$dst3, regclass:$dst4),
                  (ins i32imm:$b),
                  "ld.param.v4" # opstr
                    # " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];",
                  []>;
}
2010
// Register-form param load: prints `mov<opstr> $dst, retval$b;` and is
// selected for a LoadParam node whose first operand is the constant 0.
class LoadParamRegInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins i32imm:$b),
                "mov" # opstr # " \t$dst, retval$b;",
                [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
2015
// Instruction classes printing `st.param`.  `opstr` is the PTX type suffix
// and `regclass` the register class of every stored value.  None of these
// classes carries a selection pattern.
let mayStore = 1 in {
  // --- Stores into outgoing call parameter `param$a` at byte offset $b ---

  class StoreParamInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, i32imm:$a, i32imm:$b),
                  "st.param" # opstr # " \t[param$a+$b], $val;",
                  []>;

  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b),
                  "st.param.v2" # opstr
                    # " \t[param$a+$b], {{$val, $val2}};",
                  []>;

  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, regclass:$val3,
                       regclass:$val4, i32imm:$a, i32imm:$b),
                  "st.param.v4" # opstr
                    # " \t[param$a+$b], {{$val, $val2, $val3, $val4}};",
                  []>;

  // --- Stores into `func_retval0` at byte offset $a ---

  class StoreRetvalInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, i32imm:$a),
                  "st.param" # opstr # " \t[func_retval0+$a], $val;",
                  []>;

  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, i32imm:$a),
                  "st.param.v2" # opstr
                    # " \t[func_retval0+$a], {{$val, $val2}};",
                  []>;

  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, regclass:$val3,
                       regclass:$val4, i32imm:$a),
                  "st.param.v4" # opstr
                    # " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};",
                  []>;
}
2056
// CALL<OpcStr, OpNode> produces nine instructions that print the head of a
// call: the mnemonic OpcStr followed by a parenthesised list of 0..8 return
// symbols (retval0 .. retval7).  Each one matches OpNode applied to the
// constant number of return symbols it prints.
let isCall = 1 in {
  multiclass CALL<string OpcStr, SDNode OpNode> {
    def PrintCallNoRetInst
        : NVPTXInst<(outs), (ins),
                    OpcStr # " ",
                    [(OpNode (i32 0))]>;
    def PrintCallRetInst1
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0), ",
                    [(OpNode (i32 1))]>;
    def PrintCallRetInst2
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1), ",
                    [(OpNode (i32 2))]>;
    def PrintCallRetInst3
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2), ",
                    [(OpNode (i32 3))]>;
    def PrintCallRetInst4
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3), ",
                    [(OpNode (i32 4))]>;
    def PrintCallRetInst5
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, "
                           # "retval4), ",
                    [(OpNode (i32 5))]>;
    def PrintCallRetInst6
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, "
                           # "retval4, retval5), ",
                    [(OpNode (i32 6))]>;
    def PrintCallRetInst7
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, "
                           # "retval4, retval5, retval6), ",
                    [(OpNode (i32 7))]>;
    def PrintCallRetInst8
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, "
                           # "retval4, retval5, retval6, retval7), ",
                    [(OpNode (i32 8))]>;
  }
}
2087
// Regular call heads: "call" and "call.uni".
defm Call : CALL<"call", PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;

// Convergent call instructions.  These are identical to regular calls, except
// they have the isConvergent bit set.  They print the same mnemonics but are
// matched from the PrintConvergentCall* nodes.
let isConvergent=1 in {
  defm ConvergentCall : CALL<"call", PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}
2097
// Concrete `ld.param` instructions.
// - 8-bit integers are loaded with ".b8" into 16-bit registers (Int16Regs).
// - f16 and f16x2 use the untyped ".b16" / ".b32" suffixes.
// - There are no 4-element forms for 64-bit types (no V4I64 / V4F64).
def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;
def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
def LoadParamMemF16    : LoadParamMemInst<Float16Regs, ".b16">;
def LoadParamMemF16x2  : LoadParamMemInst<Float16x2Regs, ".b32">;
def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
def LoadParamMemV2F16  : LoadParamV2MemInst<Float16Regs, ".b16">;
def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
def LoadParamMemV4F16  : LoadParamV4MemInst<Float16Regs, ".b16">;
def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
2120
// Concrete `st.param` instructions for outgoing call parameters.  The set of
// types mirrors the LoadParamMem* list: 8-bit values live in Int16Regs,
// f16 / f16x2 use ".b16" / ".b32", and there is no 4-element 64-bit form.
def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;

def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;

def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;
def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;

def StoreParamF16      : StoreParamInst<Float16Regs, ".b16">;
def StoreParamF16x2    : StoreParamInst<Float16x2Regs, ".b32">;
def StoreParamF32      : StoreParamInst<Float32Regs, ".f32">;
def StoreParamF64      : StoreParamInst<Float64Regs, ".f64">;
def StoreParamV2F16    : StoreParamV2Inst<Float16Regs, ".b16">;
def StoreParamV2F16x2  : StoreParamV2Inst<Float16x2Regs, ".b32">;
def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
def StoreParamV4F16    : StoreParamV4Inst<Float16Regs, ".b16">;
def StoreParamV4F16x2  : StoreParamV4Inst<Float16x2Regs, ".b32">;
def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
2146
// Concrete `st.param` instructions that write the function's own return
// value (func_retval0).  Integer forms first, then floating-point forms; the
// available element types match the StoreParam* list.
def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;

def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
def StoreRetvalF16    : StoreRetvalInst<Float16Regs, ".b16">;
def StoreRetvalF16x2  : StoreRetvalInst<Float16x2Regs, ".b32">;
def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
def StoreRetvalV2F16  : StoreRetvalV2Inst<Float16Regs, ".b16">;
def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">;
def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
def StoreRetvalV4F16  : StoreRetvalV4Inst<Float16Regs, ".b16">;
def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">;
2170
// Punctuation of a call's argument list.  CallArgEnd's constant operand
// chooses whether the closing parenthesis is followed by ';' (1) or not (0).
def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
// Function return: prints "ret;".
def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
2175
// Prints a register call argument followed by ", ".  Matches CallArg whose
// first operand is the constant 0.
class CallArgInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$a), "$a, ",
            [(CallArg (i32 0), regclass:$a)]>;

// Same as CallArgInst but for the final argument: no trailing separator.
class LastCallArgInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$a), "$a",
            [(LastCallArg (i32 0), regclass:$a)]>;

// Register-argument instances.  Only these five register classes are
// provided here (no Float16Regs / Float16x2Regs variants).
def CallArgI64     : CallArgInst<Int64Regs>;
def CallArgI32     : CallArgInst<Int32Regs>;
def CallArgI16     : CallArgInst<Int16Regs>;
def CallArgF64     : CallArgInst<Float64Regs>;
def CallArgF32     : CallArgInst<Float32Regs>;

def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInst<Int32Regs>;
def LastCallArgI16 : LastCallArgInst<Int16Regs>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;
2195
// Immediate call arguments: first CallArg operand is 0, the i32 immediate is
// printed as-is.
def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
                              [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
                                  [(LastCallArg (i32 0), (i32 imm:$a))]>;

// Param-symbol call arguments: first CallArg operand is 1, and the immediate
// is printed as the index of a `param<N>` symbol.
def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
                             [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
                                 [(LastCallArg (i32 1), (i32 imm:$a))]>;
2205
// Callee operand of a call, printed as "$addr, ".  Three forms: a wrapped
// global address, a 32-bit register and a 64-bit register.
def CallVoidInst :      NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
                                  [(CallVoid (Wrapper tglobaladdr:$addr))]>;
def CallVoidInstReg :   NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
                                  [(CallVoid Int32Regs:$addr)]>;
def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
                                  [(CallVoid Int64Regs:$addr)]>;
// Prints ", prototype_<N>;" where N is the immediate operand.
def PrototypeInst :     NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
                                  [(Prototype (i32 imm:$val))]>;
2214
// Declarations of the `retval<N>` symbols used around a call.
// Aligned byte-array form: ".param .align A .b8 retvalN[SIZE];".
def DeclareRetMemInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
            ".param .align $align .b8 retval$num[$size];",
            [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
// Scalar forms: the first DeclareRet operand selects the state space that is
// printed -- 1 gives ".param", 2 gives ".reg".
def DeclareRetScalarInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".param .b$size retval$num;",
            [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetRegInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".reg .b$size retval$num;",
            [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
2227
// Declarations of the `param<N>` symbols used for outgoing call arguments.
// Aligned byte-array form; node operand order is (align, index, size).
def DeclareParamInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
            ".param .align $align .b8 param$a[$size];",
            [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
// Scalar forms; node operand order is (index, size, kind).  The trailing
// constant selects the printed state space: 0 gives ".param", 1 gives ".reg".
def DeclareScalarParamInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".param .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
def DeclareScalarRegInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".reg .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
2240
// Register-to-register copy selected for a MoveParam node.  `asmstr` is the
// PTX type suffix appended to "mov"; source and destination share `regclass`.
class MoveParamInst<NVPTXRegClass regclass, string asmstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins regclass:$src),
                "mov" # asmstr # " \t$dst, $src;",
                [(set regclass:$dst, (MoveParam regclass:$src))]>;
2245
// Concrete MoveParam instructions, one per register class.
def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
// NOTE(review): the 16-bit form prints a u32->u16 convert instead of a mov
// even though $src is declared as Int16Regs; the reason is not visible in
// this file -- confirm before changing.
def MoveParamI16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "cvt.u16.u32 \t$dst, $src;",
            [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
// PTX `mov` has no .f16 type, so f16 values are moved as untyped 16-bit data.
// This matches ProxyRegF16 and the f16 param load/store instructions, which
// all use the b16 suffix.
def MoveParamF16 : MoveParamInst<Float16Regs, ".b16">;
2255
// Matches a PseudoUseParam node on a register of `regclass`.  The assembly
// text is only a comment line ("// Pseudo use of <reg>"), so nothing
// executable is printed for it.
class PseudoUseParamInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$src),
            "// Pseudo use of $src",
            [(PseudoUseParam regclass:$src)]>;

// Instances for the integer and f32/f64 register classes (no f16 variants).
def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
2266
// Register copy selected for a ProxyReg node.  `SzStr` is the PTX type
// printed after "mov." (without a leading dot); source and destination are
// both in `regclass`.
class ProxyRegInst<string SzStr, NVPTXRegClass regclass>
    : NVPTXInst<(outs regclass:$dst),
                (ins regclass:$src),
                "mov." # SzStr # " \t$dst, $src;",
                [(set regclass:$dst, (ProxyReg regclass:$src))]>;
2271
// ProxyReg instances for every register class, all flagged as code-gen-only
// pseudo instructions.  Predicates use "pred"; f16 and f16x2 use the untyped
// b16 / b32 moves.
let isCodeGenOnly=1, isPseudo=1 in {
  def ProxyRegI1    : ProxyRegInst<"pred", Int1Regs>;
  def ProxyRegI16   : ProxyRegInst<"b16",  Int16Regs>;
  def ProxyRegI32   : ProxyRegInst<"b32",  Int32Regs>;
  def ProxyRegI64   : ProxyRegInst<"b64",  Int64Regs>;
  def ProxyRegF16   : ProxyRegInst<"b16",  Float16Regs>;
  def ProxyRegF32   : ProxyRegInst<"f32",  Float32Regs>;
  def ProxyRegF64   : ProxyRegInst<"f64",  Float64Regs>;
  def ProxyRegF16x2 : ProxyRegInst<"b32",  Float16x2Regs>;
}
2282
//
// Load / Store Handling
//
// LD<regclass> expands to six scalar load instructions that differ only in
// how the address is supplied:
//   _avar     imem operand
//   _areg     32-bit register
//   _areg_64  64-bit register
//   _ari      32-bit register + immediate offset
//   _ari_64   64-bit register + immediate offset
//   _asi      imem operand + immediate offset
// The mnemonic is assembled from four LdStCode operands printed through the
// volatile / addsp / vec / sign modifiers, followed by $fromWidth.  Every
// pattern list is empty, so these instructions are not matched by TableGen
// patterns (presumably they are selected from C++ -- confirm in the ISel
// code).
multiclass LD<NVPTXRegClass regclass> {
  def _avar : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr];", []>;
  def _areg : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr];", []>;
  def _areg_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr];", []>;
  def _ari : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr+$offset];", []>;
  def _ari_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr+$offset];", []>;
  def _asi : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr+$offset];", []>;
}
2324
// Scalar load instances, one family per value type.  i8 and i16 both use
// Int16Regs as the destination register class.
let mayLoad=1, hasSideEffects=0 in {
  defm LD_i8  : LD<Int16Regs>;
  defm LD_i16 : LD<Int16Regs>;
  defm LD_i32 : LD<Int32Regs>;
  defm LD_i64 : LD<Int64Regs>;
  defm LD_f16 : LD<Float16Regs>;
  defm LD_f16x2 : LD<Float16x2Regs>;
  defm LD_f32 : LD<Float32Regs>;
  defm LD_f64 : LD<Float64Regs>;
}
2335
// ST<regclass> expands to six scalar store instructions, one per addressing
// form (same suffixes as the scalar loads: _avar, _areg, _areg_64, _ari,
// _ari_64, _asi).  The stored register $src is the first input operand; the
// mnemonic is assembled from the LdStCode operands and $toWidth.  Pattern
// lists are empty.
multiclass ST<NVPTXRegClass regclass> {
  def _avar : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr], $src;", []>;
  def _areg : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr], $src;", []>;
  def _areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr], $src;", []>;
  def _ari : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr+$offset], $src;", []>;
  def _ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr+$offset], $src;", []>;
  def _asi : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr+$offset], $src;", []>;
}
2374
// Scalar store instances, one family per value type.  i8 and i16 both use
// Int16Regs as the source register class.
let mayStore=1, hasSideEffects=0 in {
  defm ST_i8  : ST<Int16Regs>;
  defm ST_i16 : ST<Int16Regs>;
  defm ST_i32 : ST<Int32Regs>;
  defm ST_i64 : ST<Int64Regs>;
  defm ST_f16 : ST<Float16Regs>;
  defm ST_f16x2 : ST<Float16x2Regs>;
  defm ST_f32 : ST<Float32Regs>;
  defm ST_f64 : ST<Float64Regs>;
}
2385
// The following is used only in and after vector elementizations.  Vector
// elementization happens at the machine instruction level, so the following
// instructions never appear in the DAG.
//
// LD_VEC<regclass> expands to twelve vector loads: two-element (_v2_*) and
// four-element (_v4_*) versions of each of the six addressing forms (_avar,
// _areg, _areg_64, _ari, _ari_64, _asi).  The destinations are printed as a
// brace-enclosed register list; all pattern lists are empty.
multiclass LD_VEC<NVPTXRegClass regclass> {
  // ---- two-element forms ----
  def _v2_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];", []>;
  def _v2_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];", []>;
  def _v2_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];", []>;
  def _v2_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
  def _v2_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
  def _v2_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
  // ---- four-element forms ----
  def _v4_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  def _v4_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  def _v4_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  def _v4_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
  def _v4_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
  def _v4_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
// Vector load instances, one family per element type.  i8 and i16 elements
// both use Int16Regs.
let mayLoad=1, hasSideEffects=0 in {
  defm LDV_i8  : LD_VEC<Int16Regs>;
  defm LDV_i16 : LD_VEC<Int16Regs>;
  defm LDV_i32 : LD_VEC<Int32Regs>;
  defm LDV_i64 : LD_VEC<Int64Regs>;
  defm LDV_f16 : LD_VEC<Float16Regs>;
  defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
  defm LDV_f32 : LD_VEC<Float32Regs>;
  defm LDV_f64 : LD_VEC<Float64Regs>;
}
2473
// ST_VEC<regclass> expands to twelve vector stores: two-element (_v2_*) and
// four-element (_v4_*) versions of each of the six addressing forms (_avar,
// _areg, _areg_64, _ari, _ari_64, _asi).  The source registers come first in
// the input list and are printed as a brace-enclosed list.  Pattern lists are
// empty.
// Note: the width operand is named $fromWidth here, whereas the scalar ST
// multiclass names it $toWidth.
multiclass ST_VEC<NVPTXRegClass regclass> {
  // ---- two-element forms ----
  def _v2_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};", []>;
  def _v2_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};", []>;
  def _v2_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};", []>;
  def _v2_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
         i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2}};", []>;
  def _v2_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
         i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2}};", []>;
  def _v2_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
         i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2}};", []>;
  // ---- four-element forms ----
  def _v4_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
2557
// Vector store instructions: one ST_VEC instantiation per element type.
// Everything defined in this scope is flagged as a store without any other
// side effects.  The i8 flavour is instantiated with Int16Regs, i.e. the same
// register class that the i16 flavour uses.
let mayStore = 1, hasSideEffects = 0 in {
  defm STV_i8    : ST_VEC<Int16Regs>;
  defm STV_i16   : ST_VEC<Int16Regs>;
  defm STV_i32   : ST_VEC<Int32Regs>;
  defm STV_i64   : ST_VEC<Int64Regs>;
  defm STV_f16   : ST_VEC<Float16Regs>;
  defm STV_f16x2 : ST_VEC<Float16x2Regs>;
  defm STV_f32   : ST_VEC<Float32Regs>;
  defm STV_f64   : ST_VEC<Float64Regs>;
}
2568
2569//---- Conversion ----
2570
// Bit-preserving move between two register classes.
//   SzStr       - width suffix appended to "mov.b" (e.g. "32" -> mov.b32).
//   regclassIn  - register class of the source operand $a.
//   regclassOut - register class of the destination operand $d.
// The instruction is selected for a `bitconvert` from regclassIn to
// regclassOut.
class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
                   NVPTXRegClass regclassOut>
  : NVPTXInst<(outs regclassOut:$d),
              (ins regclassIn:$a),
              "mov.b" # SzStr # " \t$d, $a;",
              [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
2576
// Same-width reinterpretations between integer and floating-point register
// classes.  Each name carries the bit width followed by the direction
// (I2F: integer source, F2I: floating-point source).
def BITCONVERT_16_I2F     : F_BITCONVERT<"16", Int16Regs,     Float16Regs>;
def BITCONVERT_16_F2I     : F_BITCONVERT<"16", Float16Regs,   Int16Regs>;
def BITCONVERT_32_I2F     : F_BITCONVERT<"32", Int32Regs,     Float32Regs>;
def BITCONVERT_32_F2I     : F_BITCONVERT<"32", Float32Regs,   Int32Regs>;
def BITCONVERT_64_I2F     : F_BITCONVERT<"64", Int64Regs,     Float64Regs>;
def BITCONVERT_64_F2I     : F_BITCONVERT<"64", Float64Regs,   Int64Regs>;
// The packed f16x2 value is reinterpreted to/from a 32-bit integer register.
def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs,     Float16x2Regs>;
def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
2585
// NOTE: pred->fp conversions are currently sub-optimal due to an issue in
// TableGen where we cannot specify floating-point literals in isel patterns.
// Therefore, we use an integer selp to materialize the predicate as a 32-bit
// integer and then cvt that integer to floating-point.
//
// The integer materialized for a true predicate depends on signedness: a
// signed i1 holding 1 represents -1, so the sint_to_fp patterns select -1/0
// (the same selection the "sext i1" patterns use), while the uint_to_fp
// patterns select 1/0.

// sint -> f16
def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
          (CVT_f16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
          (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
          (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
          (CVT_f16_s64 Int64Regs:$a, CvtRN)>;

// uint -> f16
def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
          (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
          (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
          (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
          (CVT_f16_u64 Int64Regs:$a, CvtRN)>;

// sint -> f32
def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
          (CVT_f32_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;

// uint -> f32
def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;

// sint -> f64
def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
          (CVT_f64_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;

// uint -> f64
def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
2649
2650
// f16 -> sint
// The i1 result is produced by comparing the raw 16-bit pattern of the input
// against 0; wider results use cvt with the CvtRZI mode.
// NOTE(review): the i1 forms use CmpEQ against 0 -- confirm this is the
// intended truth value for a nonzero input.
def : Pat<
  (i1 (fp_to_sint Float16Regs:$a)),
  (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_sint Float16Regs:$a)),
  (CVT_s16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_sint Float16Regs:$a)),
  (CVT_s32_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_sint Float16Regs:$a)),
  (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;

// f16 -> uint
def : Pat<
  (i1 (fp_to_uint Float16Regs:$a)),
  (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_uint Float16Regs:$a)),
  (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_uint Float16Regs:$a)),
  (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_uint Float16Regs:$a)),
  (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;
2670
// f32 -> sint
// Each i16/i32/i64 conversion comes as a pair: the first pattern is guarded by
// doF32FTZ and uses CvtRZI_FTZ, the unguarded one that follows uses CvtRZI.
// The i1 result compares the raw 32-bit pattern of the input against 0.
def : Pat<
  (i1 (fp_to_sint Float32Regs:$a)),
  (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_sint Float32Regs:$a)),
  (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i16 (fp_to_sint Float32Regs:$a)),
  (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_sint Float32Regs:$a)),
  (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i32 (fp_to_sint Float32Regs:$a)),
  (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_sint Float32Regs:$a)),
  (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i64 (fp_to_sint Float32Regs:$a)),
  (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;

// f32 -> uint
def : Pat<
  (i1 (fp_to_uint Float32Regs:$a)),
  (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_uint Float32Regs:$a)),
  (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i16 (fp_to_uint Float32Regs:$a)),
  (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_uint Float32Regs:$a)),
  (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i32 (fp_to_uint Float32Regs:$a)),
  (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_uint Float32Regs:$a)),
  (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (i64 (fp_to_uint Float32Regs:$a)),
  (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
2702
// f64 -> sint
// The i1 result compares the raw 64-bit pattern of the input against 0; wider
// results use cvt with the CvtRZI mode.
def : Pat<
  (i1 (fp_to_sint Float64Regs:$a)),
  (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_sint Float64Regs:$a)),
  (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_sint Float64Regs:$a)),
  (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_sint Float64Regs:$a)),
  (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;

// f64 -> uint
def : Pat<
  (i1 (fp_to_uint Float64Regs:$a)),
  (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_uint Float64Regs:$a)),
  (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_uint Float64Regs:$a)),
  (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_uint Float64Regs:$a)),
  (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
2722
// Extensions of a predicate (i1) are implemented with selp on two immediates.

// sext i1: true becomes -1, false becomes 0.
def : Pat<(i16 (sext Int1Regs:$a)), (SELP_s16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (sext Int1Regs:$a)), (SELP_s32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (sext Int1Regs:$a)), (SELP_s64ii -1, 0, Int1Regs:$a)>;

// zext i1: true becomes 1, false becomes 0.
def : Pat<(i16 (zext Int1Regs:$a)), (SELP_u16ii 1, 0, Int1Regs:$a)>;
def : Pat<(i32 (zext Int1Regs:$a)), (SELP_u32ii 1, 0, Int1Regs:$a)>;
def : Pat<(i64 (zext Int1Regs:$a)), (SELP_u64ii 1, 0, Int1Regs:$a)>;

// anyext i1: selects between -1 and 0, using the unsigned selp variants.
def : Pat<(i16 (anyext Int1Regs:$a)), (SELP_u16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (anyext Int1Regs:$a)), (SELP_u32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (anyext Int1Regs:$a)), (SELP_u64ii -1, 0, Int1Regs:$a)>;
2746
// Extensions of i16 map onto integer cvt with no conversion mode (CvtNONE).

// sext i16: signed cvt.
def : Pat<(i32 (sext Int16Regs:$a)), (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (sext Int16Regs:$a)), (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;

// zext i16: unsigned cvt.
def : Pat<(i32 (zext Int16Regs:$a)), (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (zext Int16Regs:$a)), (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;

// anyext i16: uses the same unsigned cvt as zext.
def : Pat<(i32 (anyext Int16Regs:$a)), (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (anyext Int16Regs:$a)), (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
2764
// Extensions of i32 to i64 map onto integer cvt with no conversion mode.

// sext i32: signed cvt.
def : Pat<(i64 (sext Int32Regs:$a)), (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;

// zext i32: unsigned cvt.
def : Pat<(i64 (zext Int32Regs:$a)), (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;

// anyext i32: uses the same unsigned cvt as zext.
def : Pat<(i64 (anyext Int32Regs:$a)), (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
2776
2777
// Integer truncation.  Truncating to i16/i32 is an unsigned cvt with no
// conversion mode; truncating to i1 masks the source with 1 and tests whether
// the result equals 1.

// truncate i64
def : Pat<(i32 (trunc Int64Regs:$a)), (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i16 (trunc Int64Regs:$a)), (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<
  (i1 (trunc Int64Regs:$a)),
  (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;

// truncate i32
def : Pat<(i16 (trunc Int32Regs:$a)), (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
def : Pat<
  (i1 (trunc Int32Regs:$a)),
  (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;

// truncate i16
def : Pat<
  (i1 (trunc Int16Regs:$a)),
  (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
2795
// sext_inreg: each (register width, source width) pair is selected to the
// CVT_INREG instruction of the same shape.
def : Pat<
  (sext_inreg Int16Regs:$a, i8),
  (CVT_INREG_s16_s8 Int16Regs:$a)>;
def : Pat<
  (sext_inreg Int32Regs:$a, i8),
  (CVT_INREG_s32_s8 Int32Regs:$a)>;
def : Pat<
  (sext_inreg Int32Regs:$a, i16),
  (CVT_INREG_s32_s16 Int32Regs:$a)>;
def : Pat<
  (sext_inreg Int64Regs:$a, i8),
  (CVT_INREG_s64_s8 Int64Regs:$a)>;
def : Pat<
  (sext_inreg Int64Regs:$a, i16),
  (CVT_INREG_s64_s16 Int64Regs:$a)>;
def : Pat<
  (sext_inreg Int64Regs:$a, i32),
  (CVT_INREG_s64_s32 Int64Regs:$a)>;
2803
2804
// Select instructions with 32-bit predicates.
// The 32-bit condition is first turned into a real predicate by masking it
// with 1 and comparing the result against 1; that predicate then drives the
// register-register selp of the matching value type.
def : Pat<
  (select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
  (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
              (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<
  (select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
  (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
              (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<
  (select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
  (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
              (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<
  (select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),
  (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
              (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<
  (select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
  (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
              (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<
  (select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
  (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
              (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2824
2825
// Vector-operand moves that pack or unpack registers.  None of them carries a
// selection pattern, and all are free of side effects.
let hasSideEffects = 0 in {
  // Pack: several narrower registers are combined into one wider register.
  def V4I16toI64 : NVPTXInst<
    (outs Int64Regs:$d),
    (ins Int16Regs:$s1, Int16Regs:$s2, Int16Regs:$s3, Int16Regs:$s4),
    "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
  def V2I16toI32 : NVPTXInst<
    (outs Int32Regs:$d),
    (ins Int16Regs:$s1, Int16Regs:$s2),
    "mov.b32 \t$d, {{$s1, $s2}};", []>;
  def V2I32toI64 : NVPTXInst<
    (outs Int64Regs:$d),
    (ins Int32Regs:$s1, Int32Regs:$s2),
    "mov.b64 \t$d, {{$s1, $s2}};", []>;
  def V2F32toF64 : NVPTXInst<
    (outs Float64Regs:$d),
    (ins Float32Regs:$s1, Float32Regs:$s2),
    "mov.b64 \t$d, {{$s1, $s2}};", []>;

  // Unpack: one wider register is split into several narrower registers.
  def I64toV4I16 : NVPTXInst<
    (outs Int16Regs:$d1, Int16Regs:$d2, Int16Regs:$d3, Int16Regs:$d4),
    (ins Int64Regs:$s),
    "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
  def I32toV2I16 : NVPTXInst<
    (outs Int16Regs:$d1, Int16Regs:$d2),
    (ins Int32Regs:$s),
    "mov.b32 \t{{$d1, $d2}}, $s;", []>;
  def I64toV2I32 : NVPTXInst<
    (outs Int32Regs:$d1, Int32Regs:$d2),
    (ins Int64Regs:$s),
    "mov.b64 \t{{$d1, $d2}}, $s;", []>;
  def F64toV2F32 : NVPTXInst<
    (outs Float32Regs:$d1, Float32Regs:$d2),
    (ins Float64Regs:$s),
    "mov.b64 \t{{$d1, $d2}}, $s;", []>;
}
2858
// Helpers for building and taking apart f16x2 values.
let hasSideEffects = 0 in {
  // Extract one element of an f16x2 register.  PTX offers no direct access
  // to the elements of an f16x2 vector, so the register is unpacked with a
  // vector mov: the wanted half lands in $dst and the other half in a
  // scratch .b16 register declared inside the emitted block.
  def F16x2toF16_0 : NVPTXInst<
    (outs Float16Regs:$dst),
    (ins Float16x2Regs:$src),
    "{{ .reg .b16 \t%tmp_hi;\n\t"
    "  mov.b32 \t{$dst, %tmp_hi}, $src; }}",
    [(set Float16Regs:$dst, (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
  def F16x2toF16_1 : NVPTXInst<
    (outs Float16Regs:$dst),
    (ins Float16x2Regs:$src),
    "{{ .reg .b16 \t%tmp_lo;\n\t"
    "  mov.b32 \t{%tmp_lo, $dst}, $src; }}",
    [(set Float16Regs:$dst, (extractelt (v2f16 Float16x2Regs:$src), 1))]>;

  // Combine two f16 registers into a single f16x2 register.
  def BuildF16x2 : NVPTXInst<
    (outs Float16x2Regs:$dst),
    (ins Float16Regs:$a, Float16Regs:$b),
    "mov.b32 \t$dst, {{$a, $b}};",
    [(set Float16x2Regs:$dst,
          (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;

  // Initializing the underlying b32 register directly from an immediate is
  // one SASS instruction fewer than a vector-packing move.
  def BuildF16x2i : NVPTXInst<
    (outs Float16x2Regs:$dst),
    (ins i32imm:$src),
    "mov.b32 \t$dst, $src;", []>;

  // Split an f16x2 register into two f16 registers.
  def SplitF16x2 : NVPTXInst<
    (outs Float16Regs:$lo, Float16Regs:$hi),
    (ins Float16x2Regs:$src),
    "mov.b32 \t{{$lo, $hi}}, $src;", []>;

  // Split an i32 register into two f16 registers.
  def SplitI32toF16x2 : NVPTXInst<
    (outs Float16Regs:$lo, Float16Regs:$hi),
    (ins Int32Regs:$src),
    "mov.b32 \t{{$lo, $hi}}, $src;", []>;
}
2900
// Count leading zeros.  Both variants write their result to a 32-bit
// register, regardless of the width of the source operand.
let hasSideEffects = 0 in {
  def CLZr32 : NVPTXInst<
    (outs Int32Regs:$d), (ins Int32Regs:$a),
    "clz.b32 \t$d, $a;", []>;
  def CLZr64 : NVPTXInst<
    (outs Int32Regs:$d), (ins Int64Regs:$a),
    "clz.b64 \t$d, $a;", []>;
}
2908
// ctlz on i32 maps directly onto the 32-bit clz instruction.
def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;

// The ctlz ISD node returns the same type as its input, but the PTX clz
// instruction always produces a 32-bit value.  For ctlz.i64 the PTX result is
// therefore widened to 64 bits to match the node's semantics -- unless the
// result is truncated straight back to 32 bits, in which case the clz result
// is used as-is.
def : Pat<
  (i64 (ctlz Int64Regs:$a)),
  (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
def : Pat<
  (i32 (trunc (ctlz Int64Regs:$a))),
  (CLZr64 Int64Regs:$a)>;
2918
// 16-bit ctlz: zero-extend the input to 32 bits, count, and truncate the
// result back to 16 bits when a 16-bit result is required.  16 is subtracted
// from the count because the 16 high-order zeros introduced by the extension
// were counted as well.
//
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
// use to save one SASS instruction (on sm_35 anyway):
//
//   mov.b32 $tmp, {0xffff, $a}
//   clz.b32 $result, $tmp
//
// That is, rather than zero-extending the input to 32 bits, we would
// "one-extend" it and count the leading zeros of that value, which removes the
// need to subtract 16 from the result.  Unfortunately we currently have no way
// to generate "mov b32reg, {b16imm, b16reg}", so this optimization is not
// done.
def : Pat<
  (i16 (ctlz Int16Regs:$a)),
  (SUBi16ri
    (CVT_u16_u32 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE),
    16)>;
def : Pat<
  (i32 (zext (i16 (ctlz Int16Regs:$a)))),
  (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
2938
// Population count.  Both variants write their result to a 32-bit register,
// regardless of the width of the source operand.
let hasSideEffects = 0 in {
  def POPCr32 : NVPTXInst<
    (outs Int32Regs:$d), (ins Int32Regs:$a),
    "popc.b32 \t$d, $a;", []>;
  def POPCr64 : NVPTXInst<
    (outs Int32Regs:$d), (ins Int64Regs:$a),
    "popc.b64 \t$d, $a;", []>;
}
2946
// ctpop on i32 maps directly onto the 32-bit popc instruction.
def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;

// The 64-bit popc yields a 32-bit result in PTX, so it is zero-extended to
// 64 bits to match the LLVM semantics.  As with ctlz.i64, a second pattern
// skips the type conversion when the result is truncated to i32 anyway.
def : Pat<
  (ctpop Int64Regs:$a),
  (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
def : Pat<
  (i32 (trunc (ctpop Int64Regs:$a))),
  (POPCr64 Int64Regs:$a)>;

// 16-bit ctpop: zero-extend to 32 bits, count, then truncate the result back
// to 16 bits.  When the result is known to be zero-extended into an i32, the
// final truncation is skipped.
def : Pat<
  (ctpop Int16Regs:$a),
  (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
def : Pat<
  (i32 (zext (i16 (ctpop Int16Regs:$a)))),
  (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
2963
// Floating-point narrowing (fpround) uses cvt with the CvtRN mode; widening
// (fpextend) uses cvt with CvtNONE.  Where an _FTZ variant exists, the pattern
// guarded by doF32FTZ comes first and the unguarded one follows it.

// fpround f32 -> f16
def : Pat<
  (f16 (fpround Float32Regs:$a)),
  (CVT_f16_f32 Float32Regs:$a, CvtRN)>;

// fpround f64 -> f16
def : Pat<
  (f16 (fpround Float64Regs:$a)),
  (CVT_f16_f64 Float64Regs:$a, CvtRN)>;

// fpround f64 -> f32
def : Pat<
  (f32 (fpround Float64Regs:$a)),
  (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (f32 (fpround Float64Regs:$a)),
  (CVT_f32_f64 Float64Regs:$a, CvtRN)>;

// fpextend f16 -> f32
def : Pat<
  (f32 (fpextend Float16Regs:$a)),
  (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (f32 (fpextend Float16Regs:$a)),
  (CVT_f32_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f16 -> f64
def : Pat<
  (f64 (fpextend Float16Regs:$a)),
  (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f32 -> f64
def : Pat<
  (f64 (fpextend Float32Regs:$a)),
  (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (f64 (fpextend Float32Regs:$a)),
  (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
2993
// DAG node for NVPTXISD::RET_FLAG: no operands or results (SDTNone), chained,
// with an optional incoming glue.
def retflag
  : SDNode<"NVPTXISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue]>;
2996
// fceil, ffloor and ftrunc are selected to a same-type cvt using the CvtRPI,
// CvtRMI and CvtRZI conversion modes respectively.  The f32 forms come in two
// mutually guarded flavours: the _FTZ mode under doF32FTZ and the plain mode
// under doNoF32FTZ.

// fceil
def : Pat<
  (fceil Float16Regs:$a),
  (CVT_f16_f16 Float16Regs:$a, CvtRPI)>;
def : Pat<
  (fceil Float32Regs:$a),
  (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (fceil Float32Regs:$a),
  (CVT_f32_f32 Float32Regs:$a, CvtRPI)>,
  Requires<[doNoF32FTZ]>;
def : Pat<
  (fceil Float64Regs:$a),
  (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;

// ffloor
def : Pat<
  (ffloor Float16Regs:$a),
  (CVT_f16_f16 Float16Regs:$a, CvtRMI)>;
def : Pat<
  (ffloor Float32Regs:$a),
  (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (ffloor Float32Regs:$a),
  (CVT_f32_f32 Float32Regs:$a, CvtRMI)>,
  Requires<[doNoF32FTZ]>;
def : Pat<
  (ffloor Float64Regs:$a),
  (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;

// ftrunc
def : Pat<
  (ftrunc Float16Regs:$a),
  (CVT_f16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<
  (ftrunc Float32Regs:$a),
  (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (ftrunc Float32Regs:$a),
  (CVT_f32_f32 Float32Regs:$a, CvtRZI)>,
  Requires<[doNoF32FTZ]>;
def : Pat<
  (ftrunc Float64Regs:$a),
  (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
3025
// nearbyint and rint are both implemented as round-to-nearest-even.  This is
// not strictly correct, because it means the current rounding mode is ignored,
// but it matches what CUDA's "libm" does.

// fnearbyint
def : Pat<
  (fnearbyint Float16Regs:$a),
  (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
def : Pat<
  (fnearbyint Float32Regs:$a),
  (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (fnearbyint Float32Regs:$a),
  (CVT_f32_f32 Float32Regs:$a, CvtRNI)>,
  Requires<[doNoF32FTZ]>;
def : Pat<
  (fnearbyint Float64Regs:$a),
  (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;

// frint
def : Pat<
  (frint Float16Regs:$a),
  (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
def : Pat<
  (frint Float32Regs:$a),
  (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>,
  Requires<[doF32FTZ]>;
def : Pat<
  (frint Float32Regs:$a),
  (CVT_f32_f32 Float32Regs:$a, CvtRNI)>,
  Requires<[doNoF32FTZ]>;
def : Pat<
  (frint Float64Regs:$a),
  (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
3047
3048
3049//-----------------------------------
3050// Control-flow
3051//-----------------------------------
3052
// Terminator instructions.
let isTerminator = 1 in {
  // Function return; selected for the retflag node.
  let isReturn = 1, isBarrier = 1 in
  def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;

  // Conditional branches on a predicate register.
  let isBranch = 1 in {
    // Taken when $a is true; selected for brcond.
    def CBranch : NVPTXInst<
      (outs), (ins Int1Regs:$a, brtarget:$target),
      "@$a bra \t$target;",
      [(brcond Int1Regs:$a, bb:$target)]>;

    // Printed with a negated guard ("@!$a"); carries no pattern of its own.
    def CBranchOther : NVPTXInst<
      (outs), (ins Int1Regs:$a, brtarget:$target),
      "@!$a bra \t$target;", []>;
  }

  // Unconditional branch; selected for br.
  let isBranch = 1, isBarrier = 1 in
  def GOTO : NVPTXInst<
    (outs), (ins brtarget:$target),
    "bra.uni \t$target;", [(br bb:$target)]>;
}
3069
// A brcond on a 32-bit value branches when that value is not equal to 0.
def : Pat<
  (brcond Int32Regs:$a, bb:$target),
  (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;

// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
// conditional branch if the target block is the next block, so that the code
// can fall through to the target block.  The inversion is done by 'xor
// condition, 1', which will be translated to (setne condition, -1).  Since PTX
// supports '@!pred bra target', we should use it.
def : Pat<
  (brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
  (CBranchOther Int1Regs:$a, bb:$target)>;
3080
// Call support: type profiles, DAG nodes, the CALL instruction and the
// patterns that select it.

// CALLSEQ_START and CALLSEQ_END each constrain operands 0 and 1 to i32.
def SDT_NVPTXCallSeqStart
  : SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd
  : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;

def callseq_start
  : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def callseq_end
  : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPSideEffect]>;

// NVPTXISD::CALL: no results and a single i32 operand.
def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call
  : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

def calltarget : Operand<i32>;

let isCall = 1 in
def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;

// CALL is selected for both global-address and external-symbol targets.
def : Pat<(call tglobaladdr:$dst),  (CALL tglobaladdr:$dst)>;
def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
3102
// Pseudo instructions.
// Pseudo forwards all four of its arguments to NVPTXInst unchanged and adds
// no fields of its own.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
  : NVPTXInst<outs, ins, asmstr, pattern>;
3106
// Call-sequence markers.  Callseq_Start prints an opening brace (with both
// amounts in a trailing comment) followed by the declaration of
// temp_param_reg; Callseq_End prints the matching closing brace.
def Callseq_Start : NVPTXInst<
  (outs), (ins i32imm:$amt1, i32imm:$amt2),
  "\\{ // callseq $amt1, $amt2\n"
  "\t.reg .b32 temp_param_reg;",
  [(callseq_start timm:$amt1, timm:$amt2)]>;
def Callseq_End : NVPTXInst<
  (outs), (ins i32imm:$amt1, i32imm:$amt2),
  "\\} // callseq $amt1",
  [(callseq_end timm:$amt1, timm:$amt2)]>;
3116
// Trap: takes no operands and is selected for the `trap` node.
def trapinst : NVPTXInst<
  (outs), (ins),
  "trap;", [(trap)]>;
3119
// Call prototype wrapper.
// The CallPrototype node has no results and one integer operand.
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def CallPrototype
  : SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Operand printed through the printProtoIdent print method.
def ProtoIdent : Operand<i32> {
  let PrintMethod = "printProtoIdent";
}

// The assembly text of CALL_PROTOTYPE is just its printed $ident operand.
def CALL_PROTOTYPE : NVPTXInst<
  (outs), (ins ProtoIdent:$ident),
  "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
3131
3132
3133include "NVPTXIntrinsics.td"
3134
3135
3136//-----------------------------------
3137// Notes
3138//-----------------------------------
// BSWAP is currently expanded. The following would be more efficient:
// - for < sm_20, use vector scalar mov, as tesla supports native 16-bit
//   registers
// - for sm_20, use prmt (use vector scalar mov to get the pack and
//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
//   registers.
3144