R600ISelLowering.cpp revision 263508
1//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Custom DAG lowering for R600
12//
13//===----------------------------------------------------------------------===//
14
15#include "R600ISelLowering.h"
16#include "R600Defines.h"
17#include "R600InstrInfo.h"
18#include "R600MachineFunctionInfo.h"
19#include "llvm/CodeGen/CallingConvLower.h"
20#include "llvm/CodeGen/MachineFrameInfo.h"
21#include "llvm/CodeGen/MachineInstrBuilder.h"
22#include "llvm/CodeGen/MachineRegisterInfo.h"
23#include "llvm/CodeGen/SelectionDAG.h"
24#include "llvm/IR/Argument.h"
25#include "llvm/IR/Function.h"
26
27using namespace llvm;
28
29R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
30    AMDGPUTargetLowering(TM),
31    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
32  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
33  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
34  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
35  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
36  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
37  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
38
39  computeRegisterProperties();
40
41  // Set condition code actions
42  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
43  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
44  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
45  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
46  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
47  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
48  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
49  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
50  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
51  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
52  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
53  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
54
55  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
56  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
57  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
58  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
59
60  setOperationAction(ISD::FCOS, MVT::f32, Custom);
61  setOperationAction(ISD::FSIN, MVT::f32, Custom);
62
63  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
64  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
65
66  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
67  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
68
69  setOperationAction(ISD::FSUB, MVT::f32, Expand);
70
71  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
72  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
73  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
74
75  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
76  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
77
78  setOperationAction(ISD::SETCC, MVT::i32, Expand);
79  setOperationAction(ISD::SETCC, MVT::f32, Expand);
80  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
81
82  setOperationAction(ISD::SELECT, MVT::i32, Expand);
83  setOperationAction(ISD::SELECT, MVT::f32, Expand);
84  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
85  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
86  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
87  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
88
89  // Legalize loads and stores to the private address space.
90  setOperationAction(ISD::LOAD, MVT::i32, Custom);
91  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
92  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
93
94  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
95  // spaces, so it is custom lowered to handle those where it isn't.
96  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
97  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
98  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
99  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
100  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
101  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
102
103  setOperationAction(ISD::STORE, MVT::i8, Custom);
104  setOperationAction(ISD::STORE, MVT::i32, Custom);
105  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
106  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
107  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
108  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
109
110  setOperationAction(ISD::LOAD, MVT::i32, Custom);
111  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
112  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
113
114  setTargetDAGCombine(ISD::FP_ROUND);
115  setTargetDAGCombine(ISD::FP_TO_SINT);
116  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
117  setTargetDAGCombine(ISD::SELECT_CC);
118  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
119
120  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
121
122  setBooleanContents(ZeroOrNegativeOneBooleanContent);
123  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
124  setSchedulingPreference(Sched::Source);
125}
126
127MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
128    MachineInstr * MI, MachineBasicBlock * BB) const {
129  MachineFunction * MF = BB->getParent();
130  MachineRegisterInfo &MRI = MF->getRegInfo();
131  MachineBasicBlock::iterator I = *MI;
132  const R600InstrInfo *TII =
133    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
134
135  switch (MI->getOpcode()) {
136  default:
137    // Replace LDS_*_RET instructions that don't have any uses with the
138    // equivalent LDS_*_NORET instructions.
139    if (TII->isLDSRetInstr(MI->getOpcode())) {
140      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
141      assert(DstIdx != -1);
142      MachineInstrBuilder NewMI;
143      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
144        return BB;
145
146      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
147                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
148      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
149        NewMI.addOperand(MI->getOperand(i));
150      }
151    } else {
152      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
153    }
154    break;
155  case AMDGPU::CLAMP_R600: {
156    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
157                                                   AMDGPU::MOV,
158                                                   MI->getOperand(0).getReg(),
159                                                   MI->getOperand(1).getReg());
160    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
161    break;
162  }
163
164  case AMDGPU::FABS_R600: {
165    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
166                                                    AMDGPU::MOV,
167                                                    MI->getOperand(0).getReg(),
168                                                    MI->getOperand(1).getReg());
169    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
170    break;
171  }
172
173  case AMDGPU::FNEG_R600: {
174    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
175                                                    AMDGPU::MOV,
176                                                    MI->getOperand(0).getReg(),
177                                                    MI->getOperand(1).getReg());
178    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
179    break;
180  }
181
182  case AMDGPU::MASK_WRITE: {
183    unsigned maskedRegister = MI->getOperand(0).getReg();
184    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
185    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
186    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
187    break;
188  }
189
190  case AMDGPU::MOV_IMM_F32:
191    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
192                     MI->getOperand(1).getFPImm()->getValueAPF()
193                         .bitcastToAPInt().getZExtValue());
194    break;
195  case AMDGPU::MOV_IMM_I32:
196    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
197                     MI->getOperand(1).getImm());
198    break;
199  case AMDGPU::CONST_COPY: {
200    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
201        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
202    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
203        MI->getOperand(1).getImm());
204    break;
205  }
206
207  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
208  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
209  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
210    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
211
212    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
213            .addOperand(MI->getOperand(0))
214            .addOperand(MI->getOperand(1))
215            .addImm(EOP); // Set End of program bit
216    break;
217  }
218
219  case AMDGPU::TXD: {
220    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
221    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
222    MachineOperand &RID = MI->getOperand(4);
223    MachineOperand &SID = MI->getOperand(5);
224    unsigned TextureId = MI->getOperand(6).getImm();
225    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
226    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
227
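    // CTX/CTY/CTZ/CTW are the per-component coordinate-type operands of the
    // TEX instructions built below; the switch clears them for Rect and the
    // array targets.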
228    switch (TextureId) {
229    case 5: // Rect
230      CTX = CTY = 0;
231      break;
232    case 6: // Shadow1D
233      SrcW = SrcZ;
234      break;
235    case 7: // Shadow2D
236      SrcW = SrcZ;
237      break;
238    case 8: // ShadowRect
239      CTX = CTY = 0;
240      SrcW = SrcZ;
241      break;
242    case 9: // 1DArray
243      SrcZ = SrcY;
244      CTZ = 0;
245      break;
246    case 10: // 2DArray
247      CTZ = 0;
248      break;
249    case 11: // Shadow1DArray
250      SrcZ = SrcY;
251      CTZ = 0;
252      break;
253    case 12: // Shadow2DArray
254      CTZ = 0;
255      break;
256    }
257    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
258            .addOperand(MI->getOperand(3))
259            .addImm(SrcX)
260            .addImm(SrcY)
261            .addImm(SrcZ)
262            .addImm(SrcW)
263            .addImm(0)
264            .addImm(0)
265            .addImm(0)
266            .addImm(0)
267            .addImm(1)
268            .addImm(2)
269            .addImm(3)
270            .addOperand(RID)
271            .addOperand(SID)
272            .addImm(CTX)
273            .addImm(CTY)
274            .addImm(CTZ)
275            .addImm(CTW);
276    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
277            .addOperand(MI->getOperand(2))
278            .addImm(SrcX)
279            .addImm(SrcY)
280            .addImm(SrcZ)
281            .addImm(SrcW)
282            .addImm(0)
283            .addImm(0)
284            .addImm(0)
285            .addImm(0)
286            .addImm(1)
287            .addImm(2)
288            .addImm(3)
289            .addOperand(RID)
290            .addOperand(SID)
291            .addImm(CTX)
292            .addImm(CTY)
293            .addImm(CTZ)
294            .addImm(CTW);
295    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
296            .addOperand(MI->getOperand(0))
297            .addOperand(MI->getOperand(1))
298            .addImm(SrcX)
299            .addImm(SrcY)
300            .addImm(SrcZ)
301            .addImm(SrcW)
302            .addImm(0)
303            .addImm(0)
304            .addImm(0)
305            .addImm(0)
306            .addImm(1)
307            .addImm(2)
308            .addImm(3)
309            .addOperand(RID)
310            .addOperand(SID)
311            .addImm(CTX)
312            .addImm(CTY)
313            .addImm(CTZ)
314            .addImm(CTW)
315            .addReg(T0, RegState::Implicit)
316            .addReg(T1, RegState::Implicit);
317    break;
318  }
319
320  case AMDGPU::TXD_SHADOW: {
321    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
322    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
323    MachineOperand &RID = MI->getOperand(4);
324    MachineOperand &SID = MI->getOperand(5);
325    unsigned TextureId = MI->getOperand(6).getImm();
326    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
327    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
328
329    switch (TextureId) {
330    case 5: // Rect
331      CTX = CTY = 0;
332      break;
333    case 6: // Shadow1D
334      SrcW = SrcZ;
335      break;
336    case 7: // Shadow2D
337      SrcW = SrcZ;
338      break;
339    case 8: // ShadowRect
340      CTX = CTY = 0;
341      SrcW = SrcZ;
342      break;
343    case 9: // 1DArray
344      SrcZ = SrcY;
345      CTZ = 0;
346      break;
347    case 10: // 2DArray
348      CTZ = 0;
349      break;
350    case 11: // Shadow1DArray
351      SrcZ = SrcY;
352      CTZ = 0;
353      break;
354    case 12: // Shadow2DArray
355      CTZ = 0;
356      break;
357    }
358
359    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
360            .addOperand(MI->getOperand(3))
361            .addImm(SrcX)
362            .addImm(SrcY)
363            .addImm(SrcZ)
364            .addImm(SrcW)
365            .addImm(0)
366            .addImm(0)
367            .addImm(0)
368            .addImm(0)
369            .addImm(1)
370            .addImm(2)
371            .addImm(3)
372            .addOperand(RID)
373            .addOperand(SID)
374            .addImm(CTX)
375            .addImm(CTY)
376            .addImm(CTZ)
377            .addImm(CTW);
378    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
379            .addOperand(MI->getOperand(2))
380            .addImm(SrcX)
381            .addImm(SrcY)
382            .addImm(SrcZ)
383            .addImm(SrcW)
384            .addImm(0)
385            .addImm(0)
386            .addImm(0)
387            .addImm(0)
388            .addImm(1)
389            .addImm(2)
390            .addImm(3)
391            .addOperand(RID)
392            .addOperand(SID)
393            .addImm(CTX)
394            .addImm(CTY)
395            .addImm(CTZ)
396            .addImm(CTW);
397    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
398            .addOperand(MI->getOperand(0))
399            .addOperand(MI->getOperand(1))
400            .addImm(SrcX)
401            .addImm(SrcY)
402            .addImm(SrcZ)
403            .addImm(SrcW)
404            .addImm(0)
405            .addImm(0)
406            .addImm(0)
407            .addImm(0)
408            .addImm(1)
409            .addImm(2)
410            .addImm(3)
411            .addOperand(RID)
412            .addOperand(SID)
413            .addImm(CTX)
414            .addImm(CTY)
415            .addImm(CTZ)
416            .addImm(CTW)
417            .addReg(T0, RegState::Implicit)
418            .addReg(T1, RegState::Implicit);
419    break;
420  }
421
422  case AMDGPU::BRANCH:
423      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
424              .addOperand(MI->getOperand(0));
425      break;
426
427  case AMDGPU::BRANCH_COND_f32: {
428    MachineInstr *NewMI =
429      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
430              AMDGPU::PREDICATE_BIT)
431              .addOperand(MI->getOperand(1))
432              .addImm(OPCODE_IS_NOT_ZERO)
433              .addImm(0); // Flags
434    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
435    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
436            .addOperand(MI->getOperand(0))
437            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
438    break;
439  }
440
441  case AMDGPU::BRANCH_COND_i32: {
442    MachineInstr *NewMI =
443      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
444            AMDGPU::PREDICATE_BIT)
445            .addOperand(MI->getOperand(1))
446            .addImm(OPCODE_IS_NOT_ZERO_INT)
447            .addImm(0); // Flags
448    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
449    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
450           .addOperand(MI->getOperand(0))
451            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
452    break;
453  }
454
455  case AMDGPU::EG_ExportSwz:
456  case AMDGPU::R600_ExportSwz: {
457    // The instruction is left unmodified if it's not the last one of its type.
458    bool isLastInstructionOfItsType = true;
459    unsigned InstExportType = MI->getOperand(1).getImm();
460    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
461         EndBlock = BB->end(); NextExportInst != EndBlock;
462         NextExportInst = llvm::next(NextExportInst)) {
463      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
464          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
465        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
466            .getImm();
467        if (CurrentInstExportType == InstExportType) {
468          isLastInstructionOfItsType = false;
469          break;
470        }
471      }
472    }
473    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
474    if (!EOP && !isLastInstructionOfItsType)
475      return BB;
476    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
477    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
478            .addOperand(MI->getOperand(0))
479            .addOperand(MI->getOperand(1))
480            .addOperand(MI->getOperand(2))
481            .addOperand(MI->getOperand(3))
482            .addOperand(MI->getOperand(4))
483            .addOperand(MI->getOperand(5))
484            .addOperand(MI->getOperand(6))
485            .addImm(CfInst)
486            .addImm(EOP);
487    break;
488  }
489  case AMDGPU::RETURN: {
490    // RETURN instructions must have the live-out registers as implicit uses,
491    // otherwise they appear dead.
492    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
493    MachineInstrBuilder MIB(*MF, MI);
494    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
495      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
496    return BB;
497  }
498  }
499
500  MI->eraseFromParent();
501  return BB;
502}
503
504//===----------------------------------------------------------------------===//
505// Custom DAG Lowering Operations
506//===----------------------------------------------------------------------===//
507
508SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
509  MachineFunction &MF = DAG.getMachineFunction();
510  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
511  switch (Op.getOpcode()) {
512  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
513  case ISD::FCOS:
514  case ISD::FSIN: return LowerTrig(Op, DAG);
515  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
516  case ISD::STORE: return LowerSTORE(Op, DAG);
517  case ISD::LOAD: return LowerLOAD(Op, DAG);
518  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
519  case ISD::INTRINSIC_VOID: {
520    SDValue Chain = Op.getOperand(0);
521    unsigned IntrinsicID =
522                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
523    switch (IntrinsicID) {
524    case AMDGPUIntrinsic::AMDGPU_store_output: {
525      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
526      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
527      MFI->LiveOuts.push_back(Reg);
528      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
529    }
530    case AMDGPUIntrinsic::R600_store_swizzle: {
531      const SDValue Args[8] = {
532        Chain,
533        Op.getOperand(2), // Export Value
534        Op.getOperand(3), // ArrayBase
535        Op.getOperand(4), // Type
536        DAG.getConstant(0, MVT::i32), // SWZ_X
537        DAG.getConstant(1, MVT::i32), // SWZ_Y
538        DAG.getConstant(2, MVT::i32), // SWZ_Z
539        DAG.getConstant(3, MVT::i32) // SWZ_W
540      };
541      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
542          Args, 8);
543    }
544
545    // default for switch(IntrinsicID)
546    default: break;
547    }
548    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
549    break;
550  }
551  case ISD::INTRINSIC_WO_CHAIN: {
552    unsigned IntrinsicID =
553                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
554    EVT VT = Op.getValueType();
555    SDLoc DL(Op);
556    switch(IntrinsicID) {
557    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
558    case AMDGPUIntrinsic::R600_load_input: {
559      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
560      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
561      MachineFunction &MF = DAG.getMachineFunction();
562      MachineRegisterInfo &MRI = MF.getRegInfo();
563      MRI.addLiveIn(Reg);
564      return DAG.getCopyFromReg(DAG.getEntryNode(),
565          SDLoc(DAG.getEntryNode()), Reg, VT);
566    }
567
568    case AMDGPUIntrinsic::R600_interp_input: {
569      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
570      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
571      MachineSDNode *interp;
572      if (ijb < 0) {
573        const MachineFunction &MF = DAG.getMachineFunction();
574        const R600InstrInfo *TII =
575          static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
576        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
577            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
578        return DAG.getTargetExtractSubreg(
579            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
580            DL, MVT::f32, SDValue(interp, 0));
581      }
582      MachineFunction &MF = DAG.getMachineFunction();
583      MachineRegisterInfo &MRI = MF.getRegInfo();
584      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
585      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
586      MRI.addLiveIn(RegisterI);
587      MRI.addLiveIn(RegisterJ);
588      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
589          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
590      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
591          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
592
593      if (slot % 4 < 2)
594        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
595            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
596            RegisterJNode, RegisterINode);
597      else
598        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
599            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
600            RegisterJNode, RegisterINode);
601      return SDValue(interp, slot % 2);
602    }
603    case AMDGPUIntrinsic::R600_interp_xy:
604    case AMDGPUIntrinsic::R600_interp_zw: {
605      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
606      MachineSDNode *interp;
607      SDValue RegisterINode = Op.getOperand(2);
608      SDValue RegisterJNode = Op.getOperand(3);
609
610      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
611        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
612            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
613            RegisterJNode, RegisterINode);
614      else
615        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
616            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
617            RegisterJNode, RegisterINode);
618      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
619          SDValue(interp, 0), SDValue(interp, 1));
620    }
621    case AMDGPUIntrinsic::R600_tex:
622    case AMDGPUIntrinsic::R600_texc:
623    case AMDGPUIntrinsic::R600_txl:
624    case AMDGPUIntrinsic::R600_txlc:
625    case AMDGPUIntrinsic::R600_txb:
626    case AMDGPUIntrinsic::R600_txbc:
627    case AMDGPUIntrinsic::R600_txf:
628    case AMDGPUIntrinsic::R600_txq:
629    case AMDGPUIntrinsic::R600_ddx:
630    case AMDGPUIntrinsic::R600_ddy:
631    case AMDGPUIntrinsic::R600_ldptr: {
632      unsigned TextureOp;
633      switch (IntrinsicID) {
634      case AMDGPUIntrinsic::R600_tex:
635        TextureOp = 0;
636        break;
637      case AMDGPUIntrinsic::R600_texc:
638        TextureOp = 1;
639        break;
640      case AMDGPUIntrinsic::R600_txl:
641        TextureOp = 2;
642        break;
643      case AMDGPUIntrinsic::R600_txlc:
644        TextureOp = 3;
645        break;
646      case AMDGPUIntrinsic::R600_txb:
647        TextureOp = 4;
648        break;
649      case AMDGPUIntrinsic::R600_txbc:
650        TextureOp = 5;
651        break;
652      case AMDGPUIntrinsic::R600_txf:
653        TextureOp = 6;
654        break;
655      case AMDGPUIntrinsic::R600_txq:
656        TextureOp = 7;
657        break;
658      case AMDGPUIntrinsic::R600_ddx:
659        TextureOp = 8;
660        break;
661      case AMDGPUIntrinsic::R600_ddy:
662        TextureOp = 9;
663        break;
664      case AMDGPUIntrinsic::R600_ldptr:
665        TextureOp = 10;
666        break;
667      default:
668        llvm_unreachable("Unknown Texture Operation");
669      }
670
671      SDValue TexArgs[19] = {
672        DAG.getConstant(TextureOp, MVT::i32),
673        Op.getOperand(1),
674        DAG.getConstant(0, MVT::i32),
675        DAG.getConstant(1, MVT::i32),
676        DAG.getConstant(2, MVT::i32),
677        DAG.getConstant(3, MVT::i32),
678        Op.getOperand(2),
679        Op.getOperand(3),
680        Op.getOperand(4),
681        DAG.getConstant(0, MVT::i32),
682        DAG.getConstant(1, MVT::i32),
683        DAG.getConstant(2, MVT::i32),
684        DAG.getConstant(3, MVT::i32),
685        Op.getOperand(5),
686        Op.getOperand(6),
687        Op.getOperand(7),
688        Op.getOperand(8),
689        Op.getOperand(9),
690        Op.getOperand(10)
691      };
692      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
693    }
694    case AMDGPUIntrinsic::AMDGPU_dp4: {
695      SDValue Args[8] = {
696      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
697          DAG.getConstant(0, MVT::i32)),
698      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
699          DAG.getConstant(0, MVT::i32)),
700      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
701          DAG.getConstant(1, MVT::i32)),
702      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
703          DAG.getConstant(1, MVT::i32)),
704      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
705          DAG.getConstant(2, MVT::i32)),
706      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
707          DAG.getConstant(2, MVT::i32)),
708      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
709          DAG.getConstant(3, MVT::i32)),
710      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
711          DAG.getConstant(3, MVT::i32))
712      };
713      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
714    }
715
716    case Intrinsic::r600_read_ngroups_x:
717      return LowerImplicitParameter(DAG, VT, DL, 0);
718    case Intrinsic::r600_read_ngroups_y:
719      return LowerImplicitParameter(DAG, VT, DL, 1);
720    case Intrinsic::r600_read_ngroups_z:
721      return LowerImplicitParameter(DAG, VT, DL, 2);
722    case Intrinsic::r600_read_global_size_x:
723      return LowerImplicitParameter(DAG, VT, DL, 3);
724    case Intrinsic::r600_read_global_size_y:
725      return LowerImplicitParameter(DAG, VT, DL, 4);
726    case Intrinsic::r600_read_global_size_z:
727      return LowerImplicitParameter(DAG, VT, DL, 5);
728    case Intrinsic::r600_read_local_size_x:
729      return LowerImplicitParameter(DAG, VT, DL, 6);
730    case Intrinsic::r600_read_local_size_y:
731      return LowerImplicitParameter(DAG, VT, DL, 7);
732    case Intrinsic::r600_read_local_size_z:
733      return LowerImplicitParameter(DAG, VT, DL, 8);
734
735    case Intrinsic::r600_read_tgid_x:
736      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
737                                  AMDGPU::T1_X, VT);
738    case Intrinsic::r600_read_tgid_y:
739      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
740                                  AMDGPU::T1_Y, VT);
741    case Intrinsic::r600_read_tgid_z:
742      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
743                                  AMDGPU::T1_Z, VT);
744    case Intrinsic::r600_read_tidig_x:
745      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
746                                  AMDGPU::T0_X, VT);
747    case Intrinsic::r600_read_tidig_y:
748      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
749                                  AMDGPU::T0_Y, VT);
750    case Intrinsic::r600_read_tidig_z:
751      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
752                                  AMDGPU::T0_Z, VT);
753    }
754    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
755    break;
756  }
757  } // end switch(Op.getOpcode())
758  return SDValue();
759}
760
761void R600TargetLowering::ReplaceNodeResults(SDNode *N,
762                                            SmallVectorImpl<SDValue> &Results,
763                                            SelectionDAG &DAG) const {
764  switch (N->getOpcode()) {
765  default: return;
766  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
767    return;
768  case ISD::LOAD: {
769    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
770    Results.push_back(SDValue(Node, 0));
771    Results.push_back(SDValue(Node, 1));
772    // XXX: LLVM does not seem to replace the Chain value inside the
773    // CustomWidenLowerNode function.
774    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
775    return;
776  }
777  case ISD::STORE:
778    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
779    Results.push_back(SDValue(Node, 0));
780    return;
781  }
782}
783
784SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
785  // On hw >= R700, the COS/SIN input must be between -1.0 and 1.0.
786  // Thus we lower them to TRIG(FRACT(x / (2 * Pi) + 0.5) - 0.5).
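  // (The constant 0.15915494309 used below is 1/(2 * Pi).)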
787  EVT VT = Op.getValueType();
788  SDValue Arg = Op.getOperand(0);
789  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
790      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
791        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
792          DAG.getConstantFP(0.15915494309, MVT::f32)),
793        DAG.getConstantFP(0.5, MVT::f32)));
794  unsigned TrigNode;
795  switch (Op.getOpcode()) {
796  case ISD::FCOS:
797    TrigNode = AMDGPUISD::COS_HW;
798    break;
799  case ISD::FSIN:
800    TrigNode = AMDGPUISD::SIN_HW;
801    break;
802  default:
803    llvm_unreachable("Wrong trig opcode");
804  }
805  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
806      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
807        DAG.getConstantFP(-0.5, MVT::f32)));
808  if (Gen >= AMDGPUSubtarget::R700)
809    return TrigVal;
810  // On R600 hw, COS/SIN input must be between -Pi and Pi.
811  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
812      DAG.getConstantFP(3.14159265359, MVT::f32));
813}
814
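// FP_TO_UINT is custom lowered only for i1 results (see the constructor);
// the conversion is simply a compare-not-equal against 0.0.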
815SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
816  return DAG.getNode(
817      ISD::SETCC,
818      SDLoc(Op),
819      MVT::i1,
820      Op, DAG.getConstantFP(0.0f, MVT::f32),
821      DAG.getCondCode(ISD::SETNE)
822      );
823}
824
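// Implicit kernel parameters (work-group counts and sizes) are passed at the
// start of CONSTANT_BUFFER_0, one 32-bit dword each; DwordOffset selects which
// one to load.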
825SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
826                                                   SDLoc DL,
827                                                   unsigned DwordOffset) const {
828  unsigned ByteOffset = DwordOffset * 4;
829  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
830                                      AMDGPUAS::CONSTANT_BUFFER_0);
831
832  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
833  assert(isInt<16>(ByteOffset));
834
835  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
836                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
837                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
838                     false, false, false, 0);
839}
840
841bool R600TargetLowering::isZero(SDValue Op) const {
842  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
843    return Cst->isNullValue();
844  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
845    return CstFP->isZero();
846  } else {
847    return false;
848  }
849}
850
851SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
852  SDLoc DL(Op);
853  EVT VT = Op.getValueType();
854
855  SDValue LHS = Op.getOperand(0);
856  SDValue RHS = Op.getOperand(1);
857  SDValue True = Op.getOperand(2);
858  SDValue False = Op.getOperand(3);
859  SDValue CC = Op.getOperand(4);
860  SDValue Temp;
861
862  // LHS and RHS are guaranteed to be the same value type
863  EVT CompareVT = LHS.getValueType();
864
865  // Check if we can lower this to a native operation.
866
867  // Try to lower to a SET* instruction:
868  //
869  // SET* can match the following patterns:
870  //
871  // select_cc f32, f32, -1,  0, cc_supported
872  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
873  // select_cc i32, i32, -1,  0, cc_supported
874  //
875
876  // Move hardware True/False values to the correct operand.
877  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
878  ISD::CondCode InverseCC =
879     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
880  if (isHWTrueValue(False) && isHWFalseValue(True)) {
881    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
882      std::swap(False, True);
883      CC = DAG.getCondCode(InverseCC);
884    } else {
885      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
886      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
887        std::swap(False, True);
888        std::swap(LHS, RHS);
889        CC = DAG.getCondCode(SwapInvCC);
890      }
891    }
892  }
893
894  if (isHWTrueValue(True) && isHWFalseValue(False) &&
895      (CompareVT == VT || VT == MVT::i32)) {
896    // This can be matched by a SET* instruction.
897    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
898  }
899
900  // Try to lower to a CND* instruction:
901  //
902  // CND* can match the following patterns:
903  //
904  // select_cc f32, 0.0, f32, f32, cc_supported
905  // select_cc f32, 0.0, i32, i32, cc_supported
906  // select_cc i32, 0,   f32, f32, cc_supported
907  // select_cc i32, 0,   i32, i32, cc_supported
908  //
909
910  // Try to move the zero value to the RHS
911  if (isZero(LHS)) {
912    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
913    // Try swapping the operands
914    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
915    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
916      std::swap(LHS, RHS);
917      CC = DAG.getCondCode(CCSwapped);
918    } else {
919      // Try inverting the condition and then swapping the operands
920      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
921      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
922      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
923        std::swap(True, False);
924        std::swap(LHS, RHS);
925        CC = DAG.getCondCode(CCSwapped);
926      }
927    }
928  }
929  if (isZero(RHS)) {
930    SDValue Cond = LHS;
931    SDValue Zero = RHS;
932    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
933    if (CompareVT != VT) {
934      // Bitcast True / False to the correct types.  This will end up being
935      // a nop, but it allows us to define only a single pattern in the
936      // .TD files for each CND* instruction rather than having one pattern
937      // for integer True/False and one for fp True/False.
938      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
939      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
940    }
941
942    switch (CCOpcode) {
943    case ISD::SETONE:
944    case ISD::SETUNE:
945    case ISD::SETNE:
946      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
947      Temp = True;
948      True = False;
949      False = Temp;
950      break;
951    default:
952      break;
953    }
954    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
955        Cond, Zero,
956        True, False,
957        DAG.getCondCode(CCOpcode));
958    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
959  }
960
961
962  // Possible Min/Max pattern
963  SDValue MinMax = LowerMinMax(Op, DAG);
964  if (MinMax.getNode()) {
965    return MinMax;
966  }
967
968  // If we make it this far, it means we have no native instructions to handle
969  // this SELECT_CC, so we must lower it.
970  SDValue HWTrue, HWFalse;
971
972  if (CompareVT == MVT::f32) {
973    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
974    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
975  } else if (CompareVT == MVT::i32) {
976    HWTrue = DAG.getConstant(-1, CompareVT);
977    HWFalse = DAG.getConstant(0, CompareVT);
978  }
979  else {
980    assert(!"Unhandled value type in LowerSELECT_CC");
981  }
982
983  // Lower this unsupported SELECT_CC into a combination of two supported
984  // SELECT_CC operations.
985  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
986
987  return DAG.getNode(ISD::SELECT_CC, DL, VT,
988      Cond, HWFalse,
989      True, False,
990      DAG.getCondCode(ISD::SETNE));
991}
992
993/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
994/// convert these pointers to a register index.  Each register holds
995/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
996/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
997/// for indirect addressing.
998SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
999                                               unsigned StackWidth,
1000                                               SelectionDAG &DAG) const {
1001  unsigned SRLPad;
1002  switch(StackWidth) {
1003  case 1:
1004    SRLPad = 2;
1005    break;
1006  case 2:
1007    SRLPad = 3;
1008    break;
1009  case 4:
1010    SRLPad = 4;
1011    break;
1012  default: llvm_unreachable("Invalid stack width");
1013  }
1014
1015  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1016                     DAG.getConstant(SRLPad, MVT::i32));
1017}
1018
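// Compute the register channel and relative pointer increment for element
// ElemIdx of a private-memory access, given how many of a register's four
// 32-bit channels the stack uses (StackWidth). PtrIncr is the increment to
// apply on top of the previous element's pointer.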
1019void R600TargetLowering::getStackAddress(unsigned StackWidth,
1020                                         unsigned ElemIdx,
1021                                         unsigned &Channel,
1022                                         unsigned &PtrIncr) const {
1023  switch (StackWidth) {
1024  default:
1025  case 1:
1026    Channel = 0;
1027    if (ElemIdx > 0) {
1028      PtrIncr = 1;
1029    } else {
1030      PtrIncr = 0;
1031    }
1032    break;
1033  case 2:
1034    Channel = ElemIdx % 2;
1035    if (ElemIdx == 2) {
1036      PtrIncr = 1;
1037    } else {
1038      PtrIncr = 0;
1039    }
1040    break;
1041  case 4:
1042    Channel = ElemIdx;
1043    PtrIncr = 0;
1044    break;
1045  }
1046}
1047
1048SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1049  SDLoc DL(Op);
1050  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1051  SDValue Chain = Op.getOperand(0);
1052  SDValue Value = Op.getOperand(1);
1053  SDValue Ptr = Op.getOperand(2);
1054
1055  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1056  if (Result.getNode()) {
1057    return Result;
1058  }
1059
1060  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1061    if (StoreNode->isTruncatingStore()) {
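      // Truncating (i8/i16) global stores are emulated with STORE_MSKOR: the
      // value is shifted into position within its containing dword and paired
      // with a mask of the bits being written.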
1062      EVT VT = Value.getValueType();
1063      assert(VT.bitsLE(MVT::i32));
1064      EVT MemVT = StoreNode->getMemoryVT();
1065      SDValue MaskConstant;
1066      if (MemVT == MVT::i8) {
1067        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1068      } else {
1069        assert(MemVT == MVT::i16);
1070        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1071      }
1072      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1073                                      DAG.getConstant(2, MVT::i32));
1074      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1075                                      DAG.getConstant(0x00000003, VT));
1076      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1077      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1078                                   DAG.getConstant(3, VT));
1079      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1080      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1081      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1082      // vector instead.
1083      SDValue Src[4] = {
1084        ShiftedValue,
1085        DAG.getConstant(0, MVT::i32),
1086        DAG.getConstant(0, MVT::i32),
1087        Mask
1088      };
1089      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
1090      SDValue Args[3] = { Chain, Input, DWordAddr };
1091      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1092                                     Op->getVTList(), Args, 3, MemVT,
1093                                     StoreNode->getMemOperand());
1094    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1095               Value.getValueType().bitsGE(MVT::i32)) {
1096      // Convert pointer from byte address to dword address.
1097      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1098                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1099                                    Ptr, DAG.getConstant(2, MVT::i32)));
1100
1101      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1102        assert(!"Truncated and indexed stores not supported yet");
1103      } else {
1104        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1105      }
1106      return Chain;
1107    }
1108  }
1109
1110  EVT ValueVT = Value.getValueType();
1111
1112  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1113    return SDValue();
1114  }
1115
1116  // Lowering for indirect addressing
1117
1118  const MachineFunction &MF = DAG.getMachineFunction();
1119  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1120                                         getTargetMachine().getFrameLowering());
1121  unsigned StackWidth = TFL->getStackWidth(MF);
1122
1123  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1124
1125  if (ValueVT.isVector()) {
1126    unsigned NumElemVT = ValueVT.getVectorNumElements();
1127    EVT ElemVT = ValueVT.getVectorElementType();
1128    SDValue Stores[4];
1129
1130    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1131                                      "vector width in store");
1132
1133    for (unsigned i = 0; i < NumElemVT; ++i) {
1134      unsigned Channel, PtrIncr;
1135      getStackAddress(StackWidth, i, Channel, PtrIncr);
1136      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1137                        DAG.getConstant(PtrIncr, MVT::i32));
1138      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1139                                 Value, DAG.getConstant(i, MVT::i32));
1140
1141      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1142                              Chain, Elem, Ptr,
1143                              DAG.getTargetConstant(Channel, MVT::i32));
1144    }
1145    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1146  } else {
1147    if (ValueVT == MVT::i8) {
1148      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1149    }
1150    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1151                        DAG.getTargetConstant(0, MVT::i32)); // Channel
1152  }
1153
1154  return Chain;
1155}
1156
1157// Returns 512 + (kc_bank << 12) for constant buffers, or -1 otherwise.
1158static int
1159ConstantAddressBlock(unsigned AddressSpace) {
1160  switch (AddressSpace) {
1161  case AMDGPUAS::CONSTANT_BUFFER_0:
1162    return 512;
1163  case AMDGPUAS::CONSTANT_BUFFER_1:
1164    return 512 + 4096;
1165  case AMDGPUAS::CONSTANT_BUFFER_2:
1166    return 512 + 4096 * 2;
1167  case AMDGPUAS::CONSTANT_BUFFER_3:
1168    return 512 + 4096 * 3;
1169  case AMDGPUAS::CONSTANT_BUFFER_4:
1170    return 512 + 4096 * 4;
1171  case AMDGPUAS::CONSTANT_BUFFER_5:
1172    return 512 + 4096 * 5;
1173  case AMDGPUAS::CONSTANT_BUFFER_6:
1174    return 512 + 4096 * 6;
1175  case AMDGPUAS::CONSTANT_BUFFER_7:
1176    return 512 + 4096 * 7;
1177  case AMDGPUAS::CONSTANT_BUFFER_8:
1178    return 512 + 4096 * 8;
1179  case AMDGPUAS::CONSTANT_BUFFER_9:
1180    return 512 + 4096 * 9;
1181  case AMDGPUAS::CONSTANT_BUFFER_10:
1182    return 512 + 4096 * 10;
1183  case AMDGPUAS::CONSTANT_BUFFER_11:
1184    return 512 + 4096 * 11;
1185  case AMDGPUAS::CONSTANT_BUFFER_12:
1186    return 512 + 4096 * 12;
1187  case AMDGPUAS::CONSTANT_BUFFER_13:
1188    return 512 + 4096 * 13;
1189  case AMDGPUAS::CONSTANT_BUFFER_14:
1190    return 512 + 4096 * 14;
1191  case AMDGPUAS::CONSTANT_BUFFER_15:
1192    return 512 + 4096 * 15;
1193  default:
1194    return -1;
1195  }
1196}
1197
1198SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1199{
1200  EVT VT = Op.getValueType();
1201  SDLoc DL(Op);
1202  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1203  SDValue Chain = Op.getOperand(0);
1204  SDValue Ptr = Op.getOperand(1);
1205  SDValue LoweredLoad;
1206
1207  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1208    SDValue MergedValues[2] = {
1209      SplitVectorLoad(Op, DAG),
1210      Chain
1211    };
1212    return DAG.getMergeValues(MergedValues, 2, DL);
1213  }
1214
1215  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1216  if (ConstantBlock > -1 &&
1217      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1218       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1219    SDValue Result;
1220    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
1221        isa<Constant>(LoadNode->getSrcValue()) ||
1222        isa<ConstantSDNode>(Ptr)) {
1223      SDValue Slots[4];
1224      for (unsigned i = 0; i < 4; i++) {
1225        // We want the Const position encoded with the following formula:
1226        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1227        // const_index is Ptr, which llvm computes with an alignment of 16.
1228        // Thus we add ((512 + (kc_bank << 12)) * 16 + 4 * chan) here and
1229        // then divide by 4 at the ISel step.
1230        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1231            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1232        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1233      }
1234      EVT NewVT = MVT::v4i32;
1235      unsigned NumElements = 4;
1236      if (VT.isVector()) {
1237        NewVT = VT;
1238        NumElements = VT.getVectorNumElements();
1239      }
1240      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
1241    } else {
1242      // A non-constant Ptr can't be folded; keep it as a v4f32 load.
1243      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1244          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1245          DAG.getConstant(LoadNode->getAddressSpace() -
1246                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1247          );
1248    }
1249
1250    if (!VT.isVector()) {
1251      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1252          DAG.getConstant(0, MVT::i32));
1253    }
1254
1255    SDValue MergedValues[2] = {
1256        Result,
1257        Chain
1258    };
1259    return DAG.getMergeValues(MergedValues, 2, DL);
1260  }
1261
1262  // For most operations returning SDValue() will result in the node being
1263  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1264  // need to manually expand loads that may be legal in some address spaces and
1265  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1266  // compute shaders, since the data is sign extended when it is uploaded to the
1267  // buffer. However SEXT loads from other address spaces are not supported, so
1268  // we need to expand them here.
1269  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1270    EVT MemVT = LoadNode->getMemoryVT();
1271    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1272    SDValue ShiftAmount =
1273          DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1274    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1275                                  LoadNode->getPointerInfo(), MemVT,
1276                                  LoadNode->isVolatile(),
1277                                  LoadNode->isNonTemporal(),
1278                                  LoadNode->getAlignment());
1279    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1280    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1281
1282    SDValue MergedValues[2] = { Sra, Chain };
1283    return DAG.getMergeValues(MergedValues, 2, DL);
1284  }
1285
1286  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1287    return SDValue();
1288  }
1289
1290  // Lowering for indirect addressing
1291  const MachineFunction &MF = DAG.getMachineFunction();
1292  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1293                                         getTargetMachine().getFrameLowering());
1294  unsigned StackWidth = TFL->getStackWidth(MF);
1295
1296  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1297
1298  if (VT.isVector()) {
1299    unsigned NumElemVT = VT.getVectorNumElements();
1300    EVT ElemVT = VT.getVectorElementType();
1301    SDValue Loads[4];
1302
1303    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1304                                      "vector width in load");
1305
1306    for (unsigned i = 0; i < NumElemVT; ++i) {
1307      unsigned Channel, PtrIncr;
1308      getStackAddress(StackWidth, i, Channel, PtrIncr);
1309      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1310                        DAG.getConstant(PtrIncr, MVT::i32));
1311      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1312                             Chain, Ptr,
1313                             DAG.getTargetConstant(Channel, MVT::i32),
1314                             Op.getOperand(2));
1315    }
1316    for (unsigned i = NumElemVT; i < 4; ++i) {
1317      Loads[i] = DAG.getUNDEF(ElemVT);
1318    }
1319    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1320    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1321  } else {
1322    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1323                              Chain, Ptr,
1324                              DAG.getTargetConstant(0, MVT::i32), // Channel
1325                              Op.getOperand(2));
1326  }
1327
1328  SDValue Ops[2];
1329  Ops[0] = LoweredLoad;
1330  Ops[1] = Chain;
1331
1332  return DAG.getMergeValues(Ops, 2, DL);
1333}
1334
1335/// XXX Only kernel functions are supported, so we can assume for now that
1336/// every function is a kernel function, but in the future we should use
1337/// separate calling conventions for kernel and non-kernel functions.
1338SDValue R600TargetLowering::LowerFormalArguments(
1339                                      SDValue Chain,
1340                                      CallingConv::ID CallConv,
1341                                      bool isVarArg,
1342                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1343                                      SDLoc DL, SelectionDAG &DAG,
1344                                      SmallVectorImpl<SDValue> &InVals) const {
1345  SmallVector<CCValAssign, 16> ArgLocs;
1346  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1347                 getTargetMachine(), ArgLocs, *DAG.getContext());
1348  MachineFunction &MF = DAG.getMachineFunction();
1349  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
1350
1351  SmallVector<ISD::InputArg, 8> LocalIns;
1352
1353  getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
1354                          LocalIns);
1355
1356  AnalyzeFormalArguments(CCInfo, LocalIns);
1357
1358  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1359    CCValAssign &VA = ArgLocs[i];
1360    EVT VT = Ins[i].VT;
1361    EVT MemVT = LocalIns[i].VT;
1362
1363    if (ShaderType != ShaderType::COMPUTE) {
1364      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1365      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1366      InVals.push_back(Register);
1367      continue;
1368    }
1369
1370    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1371                                                   AMDGPUAS::CONSTANT_BUFFER_0);
1372
1373    // The first 36 bytes of the input buffer contain information about
1374    // thread group and global sizes.
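    // (That is 9 dwords: the ngroups, global size and local size values in
    // x/y/z that LowerImplicitParameter reads.)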
1375    SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
1376                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1377                                 MachinePointerInfo(UndefValue::get(PtrTy)),
1378                                 MemVT, false, false, 4);
1379                                 // 4 is the preferred alignment for
1380                                 // the CONSTANT memory space.
1381    InVals.push_back(Arg);
1382  }
1383  return Chain;
1384}
1385
1386EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1387   if (!VT.isVector()) return MVT::i32;
1388   return VT.changeVectorElementTypeToInteger();
1389}
1390
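// Fold BUILD_VECTOR operands that are undef, constant 0.0/1.0, or duplicates
// of an earlier element into swizzle selects (SEL_MASK_WRITE, SEL_0, SEL_1, or
// the earlier channel), recording the old->new mapping in RemapSwizzle.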
1391static SDValue
1392CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
1393                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
1394  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1395  assert(RemapSwizzle.empty());
1396  SDValue NewBldVec[4] = {
1397      VectorEntry.getOperand(0),
1398      VectorEntry.getOperand(1),
1399      VectorEntry.getOperand(2),
1400      VectorEntry.getOperand(3)
1401  };
1402
1403  for (unsigned i = 0; i < 4; i++) {
1404    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1405      // We mask the write here to teach later passes that the ith element of this
1406      // vector is undef. Thus we can use it to reduce 128-bit register usage,
1407      // break false dependencies and additionally make the assembly easier to read.
1408      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1409    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1410      if (C->isZero()) {
1411        RemapSwizzle[i] = 4; // SEL_0
1412        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1413      } else if (C->isExactlyValue(1.0)) {
1414        RemapSwizzle[i] = 5; // SEL_1
1415        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1416      }
1417    }
1418
1419    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1420      continue;
1421    for (unsigned j = 0; j < i; j++) {
1422      if (NewBldVec[i] == NewBldVec[j]) {
1423        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1424        RemapSwizzle[i] = j;
1425        break;
1426      }
1427    }
1428  }
1429
1430  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1431      VectorEntry.getValueType(), NewBldVec, 4);
1432}
1433
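// Try to move each EXTRACT_VECTOR_ELT operand into the channel matching the
// element it extracts, so the resulting swizzle stays as close to the
// identity as possible.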
1434static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1435                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
1436  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1437  assert(RemapSwizzle.empty());
1438  SDValue NewBldVec[4] = {
1439      VectorEntry.getOperand(0),
1440      VectorEntry.getOperand(1),
1441      VectorEntry.getOperand(2),
1442      VectorEntry.getOperand(3)
1443  };
1444  bool isUnmovable[4] = { false, false, false, false };
1445  for (unsigned i = 0; i < 4; i++)
1446    RemapSwizzle[i] = i;
1447
1448  for (unsigned i = 0; i < 4; i++) {
1449    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1450      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1451          ->getZExtValue();
1452      if (i == Idx) {
1453        isUnmovable[Idx] = true;
1454        continue;
1455      }
1456      if (isUnmovable[Idx])
1457        continue;
1458      // Swap i and Idx
1459      std::swap(NewBldVec[Idx], NewBldVec[i]);
1460      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1461      break;
1462    }
1463  }
1464
1465  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1466      VectorEntry.getValueType(), NewBldVec, 4);
1467}
1468
1469
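/// \brief Compact and reorder the lanes of \p BuildVector and rewrite the four
/// swizzle operands in \p Swz through the lane remappings produced by
/// CompactSwizzlableVector and ReorganizeVector. For example, a
/// build_vector (x, 0.0, x, 1.0) compacts to a single live channel with the
/// swizzle rewritten to (X, SEL_0, X, SEL_1).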
1470SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1471                                            SDValue Swz[4], SelectionDAG &DAG) const {
1472  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1473  // Old -> New swizzle values
1474  DenseMap<unsigned, unsigned> SwizzleRemap;
1475
1476  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1477  for (unsigned i = 0; i < 4; i++) {
1478    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1479    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1480      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1481  }
1482
1483  SwizzleRemap.clear();
1484  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1485  for (unsigned i = 0; i < 4; i++) {
1486    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1487    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1488      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1489  }
1490
1491  return BuildVector;
1492}
1493
1494
1495//===----------------------------------------------------------------------===//
1496// Custom DAG Optimizations
1497//===----------------------------------------------------------------------===//
1498
1499SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1500                                              DAGCombinerInfo &DCI) const {
1501  SelectionDAG &DAG = DCI.DAG;
1502
1503  switch (N->getOpcode()) {
1504  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1505  case ISD::FP_ROUND: {
1506      SDValue Arg = N->getOperand(0);
1507      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1508        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1509                           Arg.getOperand(0));
1510      }
1511      break;
1512    }
1513
1514  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1515  // (i32 select_cc f32, f32, -1, 0 cc)
1516  //
1517  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1518  // this to one of the SET*_DX10 instructions.
1519  case ISD::FP_TO_SINT: {
1520    SDValue FNeg = N->getOperand(0);
1521    if (FNeg.getOpcode() != ISD::FNEG) {
1522      return SDValue();
1523    }
1524    SDValue SelectCC = FNeg.getOperand(0);
1525    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1526        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1527        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1528        !isHWTrueValue(SelectCC.getOperand(2)) ||
1529        !isHWFalseValue(SelectCC.getOperand(3))) {
1530      return SDValue();
1531    }
1532
1533    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1534                           SelectCC.getOperand(0), // LHS
1535                           SelectCC.getOperand(1), // RHS
1536                           DAG.getConstant(-1, MVT::i32), // True
1537                           DAG.getConstant(0, MVT::i32),  // False
1538                           SelectCC.getOperand(4)); // CC
1539
1540    break;
1541  }
1542
1543  // insert_vector_elt (build_vector elt0, ... , eltN), InVal, idx
1544  // => build_vector elt0, ... , InVal, ... , eltN
1545  case ISD::INSERT_VECTOR_ELT: {
1546    SDValue InVec = N->getOperand(0);
1547    SDValue InVal = N->getOperand(1);
1548    SDValue EltNo = N->getOperand(2);
1549    SDLoc dl(N);
1550
1551    // If the inserted element is an UNDEF, just use the input vector.
1552    if (InVal.getOpcode() == ISD::UNDEF)
1553      return InVec;
1554
1555    EVT VT = InVec.getValueType();
1556
1557    // If we can't generate a legal BUILD_VECTOR, exit
1558    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1559      return SDValue();
1560
1561    // Check that we know which element is being inserted
1562    if (!isa<ConstantSDNode>(EltNo))
1563      return SDValue();
1564    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1565
1566    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1567    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1568    // vector elements.
1569    SmallVector<SDValue, 8> Ops;
1570    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1571      Ops.append(InVec.getNode()->op_begin(),
1572                 InVec.getNode()->op_end());
1573    } else if (InVec.getOpcode() == ISD::UNDEF) {
1574      unsigned NElts = VT.getVectorNumElements();
1575      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1576    } else {
1577      return SDValue();
1578    }
1579
1580    // Insert the element
1581    if (Elt < Ops.size()) {
1582      // All the operands of BUILD_VECTOR must have the same type;
1583      // we enforce that here.
1584      EVT OpVT = Ops[0].getValueType();
1585      if (InVal.getValueType() != OpVT)
1586        InVal = OpVT.bitsGT(InVal.getValueType()) ?
1587          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1588          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1589      Ops[Elt] = InVal;
1590    }
1591
1592    // Return the new vector
1593    return DAG.getNode(ISD::BUILD_VECTOR, dl,
1594                       VT, &Ops[0], Ops.size());
1595  }
1596
1597  // extract_vector_elt (build_vector) nodes generated by custom lowering
1598  // also need to be custom combined.
1599  case ISD::EXTRACT_VECTOR_ELT: {
1600    SDValue Arg = N->getOperand(0);
1601    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1602      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1603        unsigned Element = Const->getZExtValue();
1604        return Arg->getOperand(Element);
1605      }
1606    }
1607    if (Arg.getOpcode() == ISD::BITCAST &&
1608        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1609      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1610        unsigned Element = Const->getZExtValue();
1611        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1612            Arg->getOperand(0).getOperand(Element));
1613      }
1614    }
1615  }
1616
1617  case ISD::SELECT_CC: {
1618    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1619    //      selectcc x, y, a, b, inv(cc)
1620    //
1621    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1622    //      selectcc x, y, a, b, cc
1623    SDValue LHS = N->getOperand(0);
1624    if (LHS.getOpcode() != ISD::SELECT_CC) {
1625      return SDValue();
1626    }
1627
1628    SDValue RHS = N->getOperand(1);
1629    SDValue True = N->getOperand(2);
1630    SDValue False = N->getOperand(3);
1631    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1632
1633    if (LHS.getOperand(2).getNode() != True.getNode() ||
1634        LHS.getOperand(3).getNode() != False.getNode() ||
1635        RHS.getNode() != False.getNode()) {
1636      return SDValue();
1637    }
1638
1639    switch (NCC) {
1640    default: return SDValue();
1641    case ISD::SETNE: return LHS;
1642    case ISD::SETEQ: {
1643      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1644      LHSCC = ISD::getSetCCInverse(LHSCC,
1645                                  LHS.getOperand(0).getValueType().isInteger());
1646      if (DCI.isBeforeLegalizeOps() ||
1647          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1648        return DAG.getSelectCC(SDLoc(N),
1649                               LHS.getOperand(0),
1650                               LHS.getOperand(1),
1651                               LHS.getOperand(2),
1652                               LHS.getOperand(3),
1653                               LHSCC);
1654      break;
1655    }
1656    }
1657    return SDValue();
1658  }
1659
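  // When an export writes a build_vector, compact and reorder its lanes so
  // that the export swizzle can use constant and mask selects directly.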
1660  case AMDGPUISD::EXPORT: {
1661    SDValue Arg = N->getOperand(1);
1662    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1663      break;
1664
1665    SDValue NewArgs[8] = {
1666      N->getOperand(0), // Chain
1667      SDValue(),
1668      N->getOperand(2), // ArrayBase
1669      N->getOperand(3), // Type
1670      N->getOperand(4), // SWZ_X
1671      N->getOperand(5), // SWZ_Y
1672      N->getOperand(6), // SWZ_Z
1673      N->getOperand(7) // SWZ_W
1674    };
1675    SDLoc DL(N);
1676    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1677    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1678  }
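  // The coordinate vector of a texture fetch gets the same swizzle
  // optimization as exports.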
1679  case AMDGPUISD::TEXTURE_FETCH: {
1680    SDValue Arg = N->getOperand(1);
1681    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1682      break;
1683
1684    SDValue NewArgs[19] = {
1685      N->getOperand(0),
1686      N->getOperand(1),
1687      N->getOperand(2),
1688      N->getOperand(3),
1689      N->getOperand(4),
1690      N->getOperand(5),
1691      N->getOperand(6),
1692      N->getOperand(7),
1693      N->getOperand(8),
1694      N->getOperand(9),
1695      N->getOperand(10),
1696      N->getOperand(11),
1697      N->getOperand(12),
1698      N->getOperand(13),
1699      N->getOperand(14),
1700      N->getOperand(15),
1701      N->getOperand(16),
1702      N->getOperand(17),
1703      N->getOperand(18),
1704    };
1705    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1706    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1707        NewArgs, 19);
1708  }
1709  }
1710  return SDValue();
1711}
1712
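/// \brief Try to fold the machine node that defines \p Src (FNEG, FABS,
/// CONST_COPY or MOV_IMM_*) directly into the parent instruction by updating
/// the corresponding neg/abs modifier, constant select or literal operand.
/// Returns true and rewrites \p Src when the fold succeeds.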
1713static bool
1714FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
1715            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
1716  const R600InstrInfo *TII =
1717      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1718  if (!Src.isMachineOpcode())
1719    return false;
1720  switch (Src.getMachineOpcode()) {
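  // A standalone FNEG/FABS becomes the neg/abs source modifier of the parent
  // instruction, provided the caller supplied a slot for that modifier.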
1721  case AMDGPU::FNEG_R600:
1722    if (!Neg.getNode())
1723      return false;
1724    Src = Src.getOperand(0);
1725    Neg = DAG.getTargetConstant(1, MVT::i32);
1726    return true;
1727  case AMDGPU::FABS_R600:
1728    if (!Abs.getNode())
1729      return false;
1730    Src = Src.getOperand(0);
1731    Abs = DAG.getTargetConstant(1, MVT::i32);
1732    return true;
1733  case AMDGPU::CONST_COPY: {
1734    unsigned Opcode = ParentNode->getMachineOpcode();
1735    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1736
1737    if (!Sel.getNode())
1738      return false;
1739
1740    SDValue CstOffset = Src.getOperand(0);
1741    if (ParentNode->getValueType(0).isVector())
1742      return false;
1743
1744    // Gather the constant selects already used by this instruction's sources.
1745    int SrcIndices[] = {
1746      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1747      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1748      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
1749      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1750      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1751      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1752      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1753      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1754      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1755      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1756      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1757    };
1758    std::vector<unsigned> Consts;
1759    for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
1760      int OtherSrcIdx = SrcIndices[i];
1761      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
1762      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
1763        continue;
1764      if (HasDst) {
1765        OtherSrcIdx--;
1766        OtherSelIdx--;
1767      }
1768      if (RegisterSDNode *Reg =
1769          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
1770        if (Reg->getReg() == AMDGPU::ALU_CONST) {
1771          ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
1772              ParentNode->getOperand(OtherSelIdx));
1773          Consts.push_back(Cst->getZExtValue());
1774        }
1775      }
1776    }
1777
1778    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
1779    Consts.push_back(Cst->getZExtValue());
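    // The hardware can only read a limited number of distinct constants per
    // instruction; give up on the fold if adding this one would exceed that.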
1780    if (!TII->fitsConstReadLimitations(Consts)) {
1781      return false;
1782    }
1783
1784    Sel = CstOffset;
1785    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
1786    return true;
1787  }
1788  case AMDGPU::MOV_IMM_I32:
1789  case AMDGPU::MOV_IMM_F32: {
1790    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
1791    uint64_t ImmValue = 0;
1792
1793
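    // 0.0, 0.5, 1.0 and the integers 0 and 1 have dedicated inline-constant
    // registers; any other value must go through the single ALU_LITERAL_X slot.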
1794    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
1795      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
1796      float FloatValue = FPC->getValueAPF().convertToFloat();
1797      if (FloatValue == 0.0) {
1798        ImmReg = AMDGPU::ZERO;
1799      } else if (FloatValue == 0.5) {
1800        ImmReg = AMDGPU::HALF;
1801      } else if (FloatValue == 1.0) {
1802        ImmReg = AMDGPU::ONE;
1803      } else {
1804        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
1805      }
1806    } else {
1807      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
1808      uint64_t Value = C->getZExtValue();
1809      if (Value == 0) {
1810        ImmReg = AMDGPU::ZERO;
1811      } else if (Value == 1) {
1812        ImmReg = AMDGPU::ONE_INT;
1813      } else {
1814        ImmValue = Value;
1815      }
1816    }
1817
1818    // Check that we aren't already using an immediate.
1819    // XXX: It's possible for an instruction to have more than one
1820    // immediate operand, but this is not supported yet.
1821    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
1822      if (!Imm.getNode())
1823        return false;
1824      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
1825      assert(C);
1826      if (C->getZExtValue())
1827        return false;
1828      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
1829    }
1830    Src = DAG.getRegister(ImmReg, MVT::i32);
1831    return true;
1832  }
1833  default:
1834    return false;
1835  }
1836}
1837
1838
1839/// \brief Fold the instructions after selecting them
1840SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
1841                                            SelectionDAG &DAG) const {
1842  const R600InstrInfo *TII =
1843      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1844  if (!Node->isMachineOpcode())
1845    return Node;
1846  unsigned Opcode = Node->getMachineOpcode();
1847  SDValue FakeOp;
1848
1849  std::vector<SDValue> Ops;
1850  for (SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
1851       I != E; ++I)
1852    Ops.push_back(*I);
1853
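  // DOT_4 reads two four-element vectors channel by channel, so each of its
  // eight sources carries its own neg/abs/sel operands that can be folded.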
1854  if (Opcode == AMDGPU::DOT_4) {
1855    int OperandIdx[] = {
1856      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1857      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1858      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1859      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1860      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1861      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1862      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1863      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1864    };
1865    int NegIdx[] = {
1866      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
1867      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
1868      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
1869      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
1870      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
1871      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
1872      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
1873      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
1874    };
1875    int AbsIdx[] = {
1876      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
1877      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
1878      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
1879      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
1880      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
1881      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
1882      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
1883      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
1884    };
1885    for (unsigned i = 0; i < 8; i++) {
1886      if (OperandIdx[i] < 0)
1887        return Node;
1888      SDValue &Src = Ops[OperandIdx[i] - 1];
1889      SDValue &Neg = Ops[NegIdx[i] - 1];
1890      SDValue &Abs = Ops[AbsIdx[i] - 1];
1891      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1892      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1893      if (HasDst)
1894        SelIdx--;
1895      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1896      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
1897        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1898    }
1899  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
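    // REG_SEQUENCE operands come in (value, subregister index) pairs after the
    // register class id; only folds that need no modifier or literal slot
    // (in practice, immediates mapping to inline-constant registers) apply.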
1900    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
1901      SDValue &Src = Ops[i];
1902      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
1903        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1904    }
1905  } else if (Opcode == AMDGPU::CLAMP_R600) {
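    // CLAMP_R600 is folded by setting the clamp modifier bit on the
    // instruction that defines its operand, if that instruction supports
    // instruction modifiers.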
1906    SDValue Src = Node->getOperand(0);
1907    if (!Src.isMachineOpcode() ||
1908        !TII->hasInstrModifiers(Src.getMachineOpcode()))
1909      return Node;
1910    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
1911        AMDGPU::OpName::clamp);
1912    if (ClampIdx < 0)
1913      return Node;
1914    std::vector<SDValue> Ops;
1915    unsigned NumOp = Src.getNumOperands();
1916    for (unsigned i = 0; i < NumOp; ++i)
1917      Ops.push_back(Src.getOperand(i));
1918    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
1919    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
1920        Node->getVTList(), Ops);
1921  } else {
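    // Generic ALU instruction: try to fold neg/abs wrappers, constants and
    // immediates into this instruction's own source modifier fields.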
1922    if (!TII->hasInstrModifiers(Opcode))
1923      return Node;
1924    int OperandIdx[] = {
1925      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1926      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1927      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
1928    };
1929    int NegIdx[] = {
1930      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
1931      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
1932      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
1933    };
1934    int AbsIdx[] = {
1935      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
1936      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
1937      -1
1938    };
1939    for (unsigned i = 0; i < 3; i++) {
1940      if (OperandIdx[i] < 0)
1941        return Node;
1942      SDValue &Src = Ops[OperandIdx[i] - 1];
1943      SDValue &Neg = Ops[NegIdx[i] - 1];
1944      SDValue FakeAbs;
1945      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
1946      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1947      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
1948      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
1949      if (HasDst) {
1950        SelIdx--;
1951        ImmIdx--;
1952      }
1953      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
1954      SDValue &Imm = Ops[ImmIdx];
1955      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
1956        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
1957    }
1958  }
1959
1960  return Node;
1961}
1962