X86ISelLowering.cpp revision 266715
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86.h"
#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <bitset>
#include <cctype>
using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
                                SelectionDAG &DAG, SDLoc dl,
                                unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getUNDEF(ResultVT);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
                               * ElemsPerChunk);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
                       Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                               VecIdx);
  return Result;
}
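// For example, extracting 128 bits at IdxVal == 6 from a v8f32 source gives
// ElemsPerChunk = 128/32 = 4 and NormalizedIdxVal = ((6*32)/128)*4 = 4, so
// the extract starts at element 4 (the upper half, which holds element 6).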
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
}

static SDValue InsertSubVector(SDValue Result, SDValue Vec,
                               unsigned IdxVal, SelectionDAG &DAG,
                               SDLoc dl, unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF is Result.
  if (Vec.getOpcode() == ISD::UNDEF)
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
                               * ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                     VecIdx);
}

/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
                                  unsigned IdxVal, SelectionDAG &DAG,
                                  SDLoc dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
                                  unsigned IdxVal, SelectionDAG &DAG,
                                  SDLoc dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}

/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTORS nodes of
/// BUILD_VECTORs returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORs.
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}

static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
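// For example, Concat128BitVectors(V1, V2, MVT::v8i32, 8, DAG, dl) inserts V1
// at element 0 and V2 at element 4 of an undef v8i32, producing a 256-bit
// value whose low half is V1 and whose high half is V2.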
static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X86_64MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetLinux())
    return new X86LinuxTargetObjectFile();
  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  TD = getDataLayout();

  resetOperationActions();
}

void X86TargetLowering::resetOperationActions() {
  const TargetMachine &TM = getTargetMachine();
  static bool FirstTimeThrough = true;

  // If none of the target options have changed, then we don't need to reset
  // the operation actions.
  if (!FirstTimeThrough && TO == TM.Options) return;

  if (!FirstTimeThrough) {
    // Reinitialize the actions.
    initActions();
  }
  FirstTimeThrough = false;

  TO = TM.Options;

  // Set up the TargetLowering object.
  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird: it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, use the ILP scheduler since we have so many registers; for
  // 32-bit code, use register-pressure-specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget->isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides on Atom when compiling with O2.
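  // (addBypassSlowDiv(32, 8) wraps each 32-bit divide in a run-time check:
  // when both operands fit in 8 bits, the much cheaper 8-bit DIV is used
  // instead. Likewise 64 -> 16 below on 64-bit targets.)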
  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
    addBypassSlowDiv(32, 8);
    if (Subtarget->is64Bit())
      addBypassSlowDiv(64, 16);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

    // The _ftol2 runtime function has an unusual calling conv, which
    // is modeled by a special pseudo-instruction.
    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
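    // (FILD reads the 64-bit pattern as a signed integer, so when the sign
    // bit was set the result comes out 2^64 too small; the conditional FADD
    // of 2^64 corrects it.)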
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  if (isTargetFTOL()) {
    // Use the _ftol2 runtime function, which has a pseudo-instruction
    // to handle its weird calling convention.
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
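  // For example, x86's DIV computes both results at once: the dividend goes
  // in EDX:EAX, the quotient comes back in EAX, and the remainder in EDX.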
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f80, Expand);
  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them up to i32, which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ, MVT::i8, Promote);
  AddPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Promote);
  AddPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
  }
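  // (Without BMI's TZCNT, the custom CTTZ lowering emits BSF, whose
  // destination is undefined when the source is zero, plus a CMOV that
  // substitutes the operand bit width for that case.)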
  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ, MVT::i8, Promote);
    AddPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Promote);
    AddPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
  // handling; they are a light-weight setjmp/longjmp replacement used to
  // support continuations, user-level threading, and the like. As a result,
  // no other SjLj exception interfaces are implemented, and please don't
  // build your own exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  }

  // FIXME: use subtarget debug flags.
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
    // TargetInfo::X86_64ABIBuiltinVaList
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    // TargetInfo::CharPtrBuiltinVaList
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else if (TM.Options.EnableSegmentedStacks)
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Expand);

  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FSIN, MVT::f32, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f32, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
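  // Only +/-0.0 and +/-1.0 can be legal x87 immediates: FLDZ and FLD1 load
  // +0.0 and +1.0 directly, FCHS flips the sign, and every other constant
  // has to come from the constant pool.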
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
       i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT VT = (MVT::SimpleValueType)i;
    setOperationAction(ISD::ADD, VT, Expand);
    setOperationAction(ISD::SUB, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::MUL, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction(VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND, MVT::v8i8, Expand);
  setOperationAction(ISD::AND, MVT::v4i16, Expand);
  setOperationAction(ISD::AND, MVT::v2i32, Expand);
  setOperationAction(ISD::AND, MVT::v1i64, Expand);
  setOperationAction(ISD::OR, MVT::v8i8, Expand);
  setOperationAction(ISD::OR, MVT::v4i16, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::OR, MVT::v1i64, Expand);
  setOperationAction(ISD::XOR, MVT::v8i8, Expand);
  setOperationAction(ISD::XOR, MVT::v4i16, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  setOperationAction(ISD::XOR, MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
  setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
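    // Bitwise ops and full-width loads don't care about element type, so a
    // single v2i64 pattern (e.g. PAND for AND) covers all of these via
    // bitcasts.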
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget->is64Bit())
      setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
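    // (SSE4.1's variable blends, BLENDVPS/BLENDVPD/PBLENDVB, implement these
    // VSELECTs directly, with the mask taken from XMM0.)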
    // i8 and i16 vectors are custom because the source register and source
    // memory operand types are not the same width. f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    // FIXME: these should be Legal, but that's only for the case where
    // the index is constant. For now custom expand to deal with that.
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
  }

  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL, MVT::v8i16, Custom);
    setOperationAction(ISD::SRL, MVT::v16i8, Custom);

    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
    setOperationAction(ISD::SRA, MVT::v16i8, Custom);

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    setOperationAction(ISD::SRL, MVT::v2i64, Custom);
    setOperationAction(ISD::SRL, MVT::v4i32, Custom);

    setOperationAction(ISD::SHL, MVT::v2i64, Custom);
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);

    setOperationAction(ISD::SRA, MVT::v4i32, Custom);

    setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
    addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, &X86::VR256RegClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);

    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::FABS, MVT::v8f32, Custom);

    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
    setOperationAction(ISD::FABS, MVT::v4f64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);

    setOperationAction(ISD::SRL, MVT::v16i16, Custom);
    setOperationAction(ISD::SRL, MVT::v32i8, Custom);

    setOperationAction(ISD::SHL, MVT::v16i16, Custom);
    setOperationAction(ISD::SHL, MVT::v32i8, Custom);

    setOperationAction(ISD::SRA, MVT::v16i16, Custom);
    setOperationAction(ISD::SRA, MVT::v32i8, Custom);

    setOperationAction(ISD::SDIV, MVT::v16i16, Custom);

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
    setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);

    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
      setOperationAction(ISD::FMA, MVT::v8f32, Legal);
      setOperationAction(ISD::FMA, MVT::v4f64, Legal);
      setOperationAction(ISD::FMA, MVT::v4f32, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::f32, Legal);
      setOperationAction(ISD::FMA, MVT::f64, Legal);
    }
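    // (With FMA3 or FMA4 available, @llvm.fma.* and fmuladd on these types
    // select the VFMADD/VFMSUB family rather than a separate multiply and
    // add.)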
    if (Subtarget->hasInt256()) {
      setOperationAction(ISD::ADD, MVT::v4i64, Legal);
      setOperationAction(ISD::ADD, MVT::v8i32, Legal);
      setOperationAction(ISD::ADD, MVT::v16i16, Legal);
      setOperationAction(ISD::ADD, MVT::v32i8, Legal);

      setOperationAction(ISD::SUB, MVT::v4i64, Legal);
      setOperationAction(ISD::SUB, MVT::v8i32, Legal);
      setOperationAction(ISD::SUB, MVT::v16i16, Legal);
      setOperationAction(ISD::SUB, MVT::v32i8, Legal);

      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
      setOperationAction(ISD::MUL, MVT::v8i32, Legal);
      setOperationAction(ISD::MUL, MVT::v16i16, Legal);
      // Don't lower v32i8 because there is no 128-bit byte mul.

      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

      setOperationAction(ISD::SDIV, MVT::v8i32, Custom);
    } else {
      setOperationAction(ISD::ADD, MVT::v4i64, Custom);
      setOperationAction(ISD::ADD, MVT::v8i32, Custom);
      setOperationAction(ISD::ADD, MVT::v16i16, Custom);
      setOperationAction(ISD::ADD, MVT::v32i8, Custom);

      setOperationAction(ISD::SUB, MVT::v4i64, Custom);
      setOperationAction(ISD::SUB, MVT::v8i32, Custom);
      setOperationAction(ISD::SUB, MVT::v16i16, Custom);
      setOperationAction(ISD::SUB, MVT::v32i8, Custom);

      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
      setOperationAction(ISD::MUL, MVT::v8i32, Custom);
      setOperationAction(ISD::MUL, MVT::v16i16, Custom);
      // Don't lower v32i8 because there is no 128-bit byte mul.
    }

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    setOperationAction(ISD::SRL, MVT::v4i64, Custom);
    setOperationAction(ISD::SRL, MVT::v8i32, Custom);

    setOperationAction(ISD::SHL, MVT::v4i64, Custom);
    setOperationAction(ISD::SHL, MVT::v8i32, Custom);

    setOperationAction(ISD::SRA, MVT::v8i32, Custom);

    // Custom lower several nodes for 256-bit types.
    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
         i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Extract subvector is special because the value type
      // (result) is 128-bit but the source is 256-bit wide.
      if (VT.is128BitVector())
        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

      // Do not attempt to custom lower other non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
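    // Same trick as the 128-bit case: bitcast to v4i64 so one VPAND, VPOR, or
    // VPXOR pattern covers every 256-bit integer element type.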
    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
    setOperationAction(ISD::LOAD, MVT::v16i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v16i1, Legal);

    setOperationAction(ISD::FADD, MVT::v16f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v16f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v16f32, Custom);

    setOperationAction(ISD::FADD, MVT::v8f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
    setOperationAction(ISD::FMA, MVT::v8f64, Legal);
    setOperationAction(ISD::FMA, MVT::v16f32, Legal);
    setOperationAction(ISD::SDIV, MVT::v16i32, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
    }
    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);

    setOperationAction(ISD::TRUNCATE, MVT::i1, Legal);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
1359 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); 1360 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); 1361 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); 1362 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); 1363 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); 1364 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); 1365 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); 1366 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); 1367 1368 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); 1369 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); 1370 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); 1371 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); 1372 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); 1373 1374 setOperationAction(ISD::SETCC, MVT::v16i1, Custom); 1375 setOperationAction(ISD::SETCC, MVT::v8i1, Custom); 1376 1377 setOperationAction(ISD::MUL, MVT::v8i64, Custom); 1378 1379 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); 1380 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); 1381 setOperationAction(ISD::SELECT, MVT::v8f64, Custom); 1382 setOperationAction(ISD::SELECT, MVT::v8i64, Custom); 1383 setOperationAction(ISD::SELECT, MVT::v16f32, Custom); 1384 1385 setOperationAction(ISD::ADD, MVT::v8i64, Legal); 1386 setOperationAction(ISD::ADD, MVT::v16i32, Legal); 1387 1388 setOperationAction(ISD::SUB, MVT::v8i64, Legal); 1389 setOperationAction(ISD::SUB, MVT::v16i32, Legal); 1390 1391 setOperationAction(ISD::MUL, MVT::v16i32, Legal); 1392 1393 setOperationAction(ISD::SRL, MVT::v8i64, Custom); 1394 setOperationAction(ISD::SRL, MVT::v16i32, Custom); 1395 1396 setOperationAction(ISD::SHL, MVT::v8i64, Custom); 1397 setOperationAction(ISD::SHL, MVT::v16i32, Custom); 1398 1399 setOperationAction(ISD::SRA, MVT::v8i64, Custom); 1400 setOperationAction(ISD::SRA, MVT::v16i32, Custom); 1401 1402 setOperationAction(ISD::AND, MVT::v8i64, Legal); 1403 setOperationAction(ISD::OR, MVT::v8i64, Legal); 1404 setOperationAction(ISD::XOR, MVT::v8i64, Legal); 1405 setOperationAction(ISD::AND, MVT::v16i32, Legal); 1406 setOperationAction(ISD::OR, MVT::v16i32, Legal); 1407 setOperationAction(ISD::XOR, MVT::v16i32, Legal); 1408 1409 // Custom lower several nodes. 1410 for (int i = MVT::FIRST_VECTOR_VALUETYPE; 1411 i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { 1412 MVT VT = (MVT::SimpleValueType)i; 1413 1414 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 1415 // Extract subvector is special because the value type 1416 // (result) is 256/128-bit but the source is 512-bit wide. 
      if (VT.is128BitVector() || VT.is256BitVector())
        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

      if (VT.getVectorElementType() == MVT::i1)
        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

      // Do not attempt to custom lower other non-512-bit vectors
      if (!VT.is512BitVector())
        continue;

      if (EltSize >= 32) {
        setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
        setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
        setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
        setOperationAction(ISD::VSELECT, VT, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
        setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
        setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      }
    }
    for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-512-bit vectors
      if (!VT.is512BitVector())
        continue;

      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
    }
  }// has AVX-512

  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
  // of this type with custom code.
  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
       VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
                       Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
    // Add/Sub/Mul with overflow operations are custom lowered.
    MVT VT = IntVTs[i];
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);
  }

  // There are no 8-bit 3-address imul/mul instructions
  setOperationAction(ISD::SMULO, MVT::i8, Expand);
  setOperationAction(ISD::UMULO, MVT::i8, Expand);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // Combine sin / cos into one node or libcall if possible.
  if (Subtarget->hasSinCos()) {
    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
    setLibcallName(RTLIB::SINCOS_F64, "sincos");
    if (Subtarget->isTargetDarwin()) {
      // For MacOSX, we don't want the normal expansion of a libcall to
      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
      // traffic.
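      // (As an illustrative note, not encoded in this file: __sincos_stret
      // returns the sin/cos pair in registers, e.g. XMM0/XMM1 for the f64
      // variant on x86-64, instead of through the pointer out-parameters
      // that plain sincos(3) uses.)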
      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
    }
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::VSELECT);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::SETCC);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::XOR);

  computeRegisterProperties();

  // On Darwin, -Os means optimize for size without hurting performance, so
  // do not reduce the limit.
  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(4); // 2^4 bytes.

  // Predictable cmovs don't hurt on Atom because it's in-order.
  PredictableSelectIsExpensive = !Subtarget->isAtom();

  setPrefFunctionAlignment(4); // 2^4 bytes.
}

EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  const TargetMachine &TM = getTargetMachine();
  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512())
    switch (VT.getVectorNumElements()) {
    case 8:  return MVT::v8i1;
    case 16: return MVT::v16i1;
    }

  return VT.changeVectorElementTypeToInteger();
}

/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
/// For X86, aggregates that contain SSE vectors are placed at 16-byte
/// boundaries while the rest are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, that means it is safe because any
/// destination alignment can satisfy the constraint. Similarly, if SrcAlign
/// is zero it means there isn't a need to check it against the alignment
/// requirement, probably because the source does not need to be loaded. If
/// 'IsMemset' is true, that means it's expanding a memset. If 'ZeroMemset'
/// is true, that means it's a memset of zero. 'MemcpyStrSrc' indicates
/// whether the memcpy source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  const Function *F = MF.getFunction();
  if ((!IsMemset || ZeroMemset) &&
      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                       Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      if (Size >= 32) {
        if (Subtarget->hasInt256())
          return MVT::v8i32;
        if (Subtarget->hasFp256())
          return MVT::v8f32;
      }
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

bool
X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
  if (Fast)
    *Fast = Subtarget->isUnalignedMemAccessFast();
  return true;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}

// FIXME: Why is this routine here? Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(MVT VT) const {
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget->is64Bit() ?
1731 (const TargetRegisterClass*)&X86::GR64RegClass : 1732 (const TargetRegisterClass*)&X86::GR32RegClass; 1733 break; 1734 case MVT::x86mmx: 1735 RRC = &X86::VR64RegClass; 1736 break; 1737 case MVT::f32: case MVT::f64: 1738 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1739 case MVT::v4f32: case MVT::v2f64: 1740 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1741 case MVT::v4f64: 1742 RRC = &X86::VR128RegClass; 1743 break; 1744 } 1745 return std::make_pair(RRC, Cost); 1746} 1747 1748bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1749 unsigned &Offset) const { 1750 if (!Subtarget->isTargetLinux()) 1751 return false; 1752 1753 if (Subtarget->is64Bit()) { 1754 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1755 Offset = 0x28; 1756 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1757 AddressSpace = 256; 1758 else 1759 AddressSpace = 257; 1760 } else { 1761 // %gs:0x14 on i386 1762 Offset = 0x14; 1763 AddressSpace = 256; 1764 } 1765 return true; 1766} 1767 1768bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, 1769 unsigned DestAS) const { 1770 assert(SrcAS != DestAS && "Expected different address spaces!"); 1771 1772 return SrcAS < 256 && DestAS < 256; 1773} 1774 1775//===----------------------------------------------------------------------===// 1776// Return Value Calling Convention Implementation 1777//===----------------------------------------------------------------------===// 1778 1779#include "X86GenCallingConv.inc" 1780 1781bool 1782X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1783 MachineFunction &MF, bool isVarArg, 1784 const SmallVectorImpl<ISD::OutputArg> &Outs, 1785 LLVMContext &Context) const { 1786 SmallVector<CCValAssign, 16> RVLocs; 1787 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1788 RVLocs, Context); 1789 return CCInfo.CheckReturn(Outs, RetCC_X86); 1790} 1791 1792const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { 1793 static const uint16_t ScratchRegs[] = { X86::R11, 0 }; 1794 return ScratchRegs; 1795} 1796 1797SDValue 1798X86TargetLowering::LowerReturn(SDValue Chain, 1799 CallingConv::ID CallConv, bool isVarArg, 1800 const SmallVectorImpl<ISD::OutputArg> &Outs, 1801 const SmallVectorImpl<SDValue> &OutVals, 1802 SDLoc dl, SelectionDAG &DAG) const { 1803 MachineFunction &MF = DAG.getMachineFunction(); 1804 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1805 1806 SmallVector<CCValAssign, 16> RVLocs; 1807 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1808 RVLocs, *DAG.getContext()); 1809 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1810 1811 SDValue Flag; 1812 SmallVector<SDValue, 6> RetOps; 1813 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1814 // Operand #1 = Bytes To Pop 1815 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1816 MVT::i16)); 1817 1818 // Copy the result values into the output registers. 
1819 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1820 CCValAssign &VA = RVLocs[i]; 1821 assert(VA.isRegLoc() && "Can only return in registers!"); 1822 SDValue ValToCopy = OutVals[i]; 1823 EVT ValVT = ValToCopy.getValueType(); 1824 1825 // Promote values to the appropriate types 1826 if (VA.getLocInfo() == CCValAssign::SExt) 1827 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 1828 else if (VA.getLocInfo() == CCValAssign::ZExt) 1829 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); 1830 else if (VA.getLocInfo() == CCValAssign::AExt) 1831 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); 1832 else if (VA.getLocInfo() == CCValAssign::BCvt) 1833 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); 1834 1835 // If this is x86-64, and we disabled SSE, we can't return FP values, 1836 // or SSE or MMX vectors. 1837 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1838 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1839 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1840 report_fatal_error("SSE register return with SSE disabled"); 1841 } 1842 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1843 // llvm-gcc has never done it right and no one has noticed, so this 1844 // should be OK for now. 1845 if (ValVT == MVT::f64 && 1846 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) 1847 report_fatal_error("SSE2 register return with SSE2 disabled"); 1848 1849 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1850 // the RET instruction and handled by the FP Stackifier. 1851 if (VA.getLocReg() == X86::ST0 || 1852 VA.getLocReg() == X86::ST1) { 1853 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1854 // change the value to the FP stack register class. 1855 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1856 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1857 RetOps.push_back(ValToCopy); 1858 // Don't emit a copytoreg. 1859 continue; 1860 } 1861 1862 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1863 // which is returned in RAX / RDX. 1864 if (Subtarget->is64Bit()) { 1865 if (ValVT == MVT::x86mmx) { 1866 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1867 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1868 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1869 ValToCopy); 1870 // If we don't have SSE2 available, convert to v4f32 so the generated 1871 // register is legal. 1872 if (!Subtarget->hasSSE2()) 1873 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1874 } 1875 } 1876 } 1877 1878 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1879 Flag = Chain.getValue(1); 1880 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 1881 } 1882 1883 // The x86-64 ABIs require that for returning structs by value we copy 1884 // the sret argument into %rax/%eax (depending on ABI) for the return. 1885 // Win32 requires us to put the sret argument to %eax as well. 1886 // We saved the argument into a virtual register in the entry block, 1887 // so now we copy the value out and into %rax/%eax. 
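  // For instance (illustrative C, not from this file):
  //   struct S { int v[4]; };
  //   struct S f(void);
  // is lowered with a hidden sret pointer argument, and on these targets
  // that pointer must also be handed back in %rax (or %eax).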
1888 if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() && 1889 (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { 1890 MachineFunction &MF = DAG.getMachineFunction(); 1891 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1892 unsigned Reg = FuncInfo->getSRetReturnReg(); 1893 assert(Reg && 1894 "SRetReturnReg should have been set in LowerFormalArguments()."); 1895 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1896 1897 unsigned RetValReg 1898 = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? 1899 X86::RAX : X86::EAX; 1900 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); 1901 Flag = Chain.getValue(1); 1902 1903 // RAX/EAX now acts like a return value. 1904 RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy())); 1905 } 1906 1907 RetOps[0] = Chain; // Update chain. 1908 1909 // Add the flag if we have it. 1910 if (Flag.getNode()) 1911 RetOps.push_back(Flag); 1912 1913 return DAG.getNode(X86ISD::RET_FLAG, dl, 1914 MVT::Other, &RetOps[0], RetOps.size()); 1915} 1916 1917bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 1918 if (N->getNumValues() != 1) 1919 return false; 1920 if (!N->hasNUsesOfValue(1, 0)) 1921 return false; 1922 1923 SDValue TCChain = Chain; 1924 SDNode *Copy = *N->use_begin(); 1925 if (Copy->getOpcode() == ISD::CopyToReg) { 1926 // If the copy has a glue operand, we conservatively assume it isn't safe to 1927 // perform a tail call. 1928 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 1929 return false; 1930 TCChain = Copy->getOperand(0); 1931 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 1932 return false; 1933 1934 bool HasRet = false; 1935 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1936 UI != UE; ++UI) { 1937 if (UI->getOpcode() != X86ISD::RET_FLAG) 1938 return false; 1939 HasRet = true; 1940 } 1941 1942 if (!HasRet) 1943 return false; 1944 1945 Chain = TCChain; 1946 return true; 1947} 1948 1949MVT 1950X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, 1951 ISD::NodeType ExtendKind) const { 1952 MVT ReturnMVT; 1953 // TODO: Is this also valid on 32-bit? 1954 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1955 ReturnMVT = MVT::i8; 1956 else 1957 ReturnMVT = MVT::i32; 1958 1959 MVT MinVT = getRegisterType(ReturnMVT); 1960 return VT.bitsLT(MinVT) ? MinVT : VT; 1961} 1962 1963/// LowerCallResult - Lower the result values of a call into the 1964/// appropriate copies out of appropriate physical registers. 1965/// 1966SDValue 1967X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1968 CallingConv::ID CallConv, bool isVarArg, 1969 const SmallVectorImpl<ISD::InputArg> &Ins, 1970 SDLoc dl, SelectionDAG &DAG, 1971 SmallVectorImpl<SDValue> &InVals) const { 1972 1973 // Assign locations to each value returned by this call. 1974 SmallVector<CCValAssign, 16> RVLocs; 1975 bool Is64Bit = Subtarget->is64Bit(); 1976 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1977 getTargetMachine(), RVLocs, *DAG.getContext()); 1978 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1979 1980 // Copy all of the result registers out of their specified physreg. 
1981 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 1982 CCValAssign &VA = RVLocs[i]; 1983 EVT CopyVT = VA.getValVT(); 1984 1985 // If this is x86-64, and we disabled SSE, we can't return FP values 1986 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1987 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1988 report_fatal_error("SSE register return with SSE disabled"); 1989 } 1990 1991 SDValue Val; 1992 1993 // If this is a call to a function that returns an fp value on the floating 1994 // point stack, we must guarantee the value is popped from the stack, so 1995 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1996 // if the return value is not used. We use the FpPOP_RETVAL instruction 1997 // instead. 1998 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1999 // If we prefer to use the value in xmm registers, copy it out as f80 and 2000 // use a truncate to move it from fp stack reg to xmm reg. 2001 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 2002 SDValue Ops[] = { Chain, InFlag }; 2003 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, 2004 MVT::Other, MVT::Glue, Ops), 1); 2005 Val = Chain.getValue(0); 2006 2007 // Round the f80 to the right size, which also moves it to the appropriate 2008 // xmm register. 2009 if (CopyVT != VA.getValVT()) 2010 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 2011 // This truncation won't change the value. 2012 DAG.getIntPtrConstant(1)); 2013 } else { 2014 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 2015 CopyVT, InFlag).getValue(1); 2016 Val = Chain.getValue(0); 2017 } 2018 InFlag = Chain.getValue(2); 2019 InVals.push_back(Val); 2020 } 2021 2022 return Chain; 2023} 2024 2025//===----------------------------------------------------------------------===// 2026// C & StdCall & Fast Calling Convention implementation 2027//===----------------------------------------------------------------------===// 2028// StdCall calling convention seems to be standard for many Windows' API 2029// routines and around. It differs from C calling convention just a little: 2030// callee should clean up the stack, not caller. Symbols should be also 2031// decorated in some fancy way :) It doesn't support any vector arguments. 2032// For info on fast calling convention see Fast Calling Convention (tail call) 2033// implementation LowerX86_32FastCCCallTo. 2034 2035/// CallIsStructReturn - Determines whether a call uses struct return 2036/// semantics. 2037enum StructReturnType { 2038 NotStructReturn, 2039 RegStructReturn, 2040 StackStructReturn 2041}; 2042static StructReturnType 2043callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 2044 if (Outs.empty()) 2045 return NotStructReturn; 2046 2047 const ISD::ArgFlagsTy &Flags = Outs[0].Flags; 2048 if (!Flags.isSRet()) 2049 return NotStructReturn; 2050 if (Flags.isInReg()) 2051 return RegStructReturn; 2052 return StackStructReturn; 2053} 2054 2055/// ArgsAreStructReturn - Determines whether a function uses struct 2056/// return semantics. 
static StructReturnType
argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
  if (Ins.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg())
    return RegStructReturn;
  return StackStructReturn;
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" with size and alignment information specified by
/// the specific parameter attribute. The copy will be passed as a byval
/// function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          SDLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);

  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*isVolatile*/false, /*AlwaysInline=*/true,
                       MachinePointerInfo(), MachinePointerInfo());
}

/// IsTailCallConvention - Return true if the calling convention is one that
/// supports tail call optimization.
static bool IsTailCallConvention(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
          CC == CallingConv::HiPE);
}

/// \brief Return true if the calling convention is a C calling convention.
static bool IsCCallConvention(CallingConv::ID CC) {
  return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
          CC == CallingConv::X86_64_SysV);
}

bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
    return false;

  CallSite CS(CI);
  CallingConv::ID CalleeCC = CS.getCallingConv();
  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
    return false;

  return true;
}

/// FuncIsMadeTailCallSafe - Return true if the function is being made into
/// a tailcall target by changing its ABI.
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
                                   bool GuaranteedTailCallOpt) {
  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
                                    CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    SDLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo *MFI,
                                    unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
                              getTargetMachine().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If value is passed by pointer we have address passed instead of the value
  // itself.
  if (VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten by the lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
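    // For byval, the argument's bytes already live in the caller-prepared
    // argument area, so the lowered value is simply the address of that fixed
    // stack object; the non-byval path below instead emits a load from the
    // argument's stack slot.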
2146 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 2147 return DAG.getFrameIndex(FI, getPointerTy()); 2148 } else { 2149 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 2150 VA.getLocMemOffset(), isImmutable); 2151 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 2152 return DAG.getLoad(ValVT, dl, Chain, FIN, 2153 MachinePointerInfo::getFixedStack(FI), 2154 false, false, false, 0); 2155 } 2156} 2157 2158SDValue 2159X86TargetLowering::LowerFormalArguments(SDValue Chain, 2160 CallingConv::ID CallConv, 2161 bool isVarArg, 2162 const SmallVectorImpl<ISD::InputArg> &Ins, 2163 SDLoc dl, 2164 SelectionDAG &DAG, 2165 SmallVectorImpl<SDValue> &InVals) 2166 const { 2167 MachineFunction &MF = DAG.getMachineFunction(); 2168 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2169 2170 const Function* Fn = MF.getFunction(); 2171 if (Fn->hasExternalLinkage() && 2172 Subtarget->isTargetCygMing() && 2173 Fn->getName() == "main") 2174 FuncInfo->setForceFramePointer(true); 2175 2176 MachineFrameInfo *MFI = MF.getFrameInfo(); 2177 bool Is64Bit = Subtarget->is64Bit(); 2178 bool IsWindows = Subtarget->isTargetWindows(); 2179 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); 2180 2181 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 2182 "Var args not supported with calling convention fastcc, ghc or hipe"); 2183 2184 // Assign locations to all of the incoming arguments. 2185 SmallVector<CCValAssign, 16> ArgLocs; 2186 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2187 ArgLocs, *DAG.getContext()); 2188 2189 // Allocate shadow area for Win64 2190 if (IsWin64) 2191 CCInfo.AllocateStack(32, 8); 2192 2193 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 2194 2195 unsigned LastVal = ~0U; 2196 SDValue ArgValue; 2197 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2198 CCValAssign &VA = ArgLocs[i]; 2199 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 2200 // places. 2201 assert(VA.getValNo() != LastVal && 2202 "Don't support value assigned to multiple locs yet"); 2203 (void)LastVal; 2204 LastVal = VA.getValNo(); 2205 2206 if (VA.isRegLoc()) { 2207 EVT RegVT = VA.getLocVT(); 2208 const TargetRegisterClass *RC; 2209 if (RegVT == MVT::i32) 2210 RC = &X86::GR32RegClass; 2211 else if (Is64Bit && RegVT == MVT::i64) 2212 RC = &X86::GR64RegClass; 2213 else if (RegVT == MVT::f32) 2214 RC = &X86::FR32RegClass; 2215 else if (RegVT == MVT::f64) 2216 RC = &X86::FR64RegClass; 2217 else if (RegVT.is512BitVector()) 2218 RC = &X86::VR512RegClass; 2219 else if (RegVT.is256BitVector()) 2220 RC = &X86::VR256RegClass; 2221 else if (RegVT.is128BitVector()) 2222 RC = &X86::VR128RegClass; 2223 else if (RegVT == MVT::x86mmx) 2224 RC = &X86::VR64RegClass; 2225 else if (RegVT == MVT::v8i1) 2226 RC = &X86::VK8RegClass; 2227 else if (RegVT == MVT::v16i1) 2228 RC = &X86::VK16RegClass; 2229 else 2230 llvm_unreachable("Unknown argument type!"); 2231 2232 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2233 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 2234 2235 // If this is an 8 or 16-bit value, it is really passed promoted to 32 2236 // bits. Insert an assert[sz]ext to capture this, then truncate to the 2237 // right size. 
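      // E.g. a 'signext i8' argument arriving in a 32-bit register is modeled
      // roughly as (truncate i8 (AssertSext i32 %copy, i8)), letting later
      // combines know the upper 24 bits are already sign bits.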
2238 if (VA.getLocInfo() == CCValAssign::SExt) 2239 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 2240 DAG.getValueType(VA.getValVT())); 2241 else if (VA.getLocInfo() == CCValAssign::ZExt) 2242 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 2243 DAG.getValueType(VA.getValVT())); 2244 else if (VA.getLocInfo() == CCValAssign::BCvt) 2245 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 2246 2247 if (VA.isExtInLoc()) { 2248 // Handle MMX values passed in XMM regs. 2249 if (RegVT.isVector()) 2250 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); 2251 else 2252 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 2253 } 2254 } else { 2255 assert(VA.isMemLoc()); 2256 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 2257 } 2258 2259 // If value is passed via pointer - do a load. 2260 if (VA.getLocInfo() == CCValAssign::Indirect) 2261 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 2262 MachinePointerInfo(), false, false, false, 0); 2263 2264 InVals.push_back(ArgValue); 2265 } 2266 2267 // The x86-64 ABIs require that for returning structs by value we copy 2268 // the sret argument into %rax/%eax (depending on ABI) for the return. 2269 // Win32 requires us to put the sret argument to %eax as well. 2270 // Save the argument into a virtual register so that we can access it 2271 // from the return points. 2272 if (MF.getFunction()->hasStructRetAttr() && 2273 (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { 2274 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2275 unsigned Reg = FuncInfo->getSRetReturnReg(); 2276 if (!Reg) { 2277 MVT PtrTy = getPointerTy(); 2278 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 2279 FuncInfo->setSRetReturnReg(Reg); 2280 } 2281 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 2282 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 2283 } 2284 2285 unsigned StackSize = CCInfo.getNextStackOffset(); 2286 // Align stack specially for tail calls. 2287 if (FuncIsMadeTailCallSafe(CallConv, 2288 MF.getTarget().Options.GuaranteedTailCallOpt)) 2289 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 2290 2291 // If the function takes variable number of arguments, make a frame index for 2292 // the start of the first vararg value... for expansion of llvm.va_start. 2293 if (isVarArg) { 2294 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 2295 CallConv != CallingConv::X86_ThisCall)) { 2296 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 2297 } 2298 if (Is64Bit) { 2299 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 2300 2301 // FIXME: We should really autogenerate these arrays 2302 static const uint16_t GPR64ArgRegsWin64[] = { 2303 X86::RCX, X86::RDX, X86::R8, X86::R9 2304 }; 2305 static const uint16_t GPR64ArgRegs64Bit[] = { 2306 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 2307 }; 2308 static const uint16_t XMMArgRegs64Bit[] = { 2309 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2310 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2311 }; 2312 const uint16_t *GPR64ArgRegs; 2313 unsigned NumXMMRegs = 0; 2314 2315 if (IsWin64) { 2316 // The XMM registers which might contain var arg parameters are shadowed 2317 // in their paired GPR. So we only need to save the GPR to their home 2318 // slots. 
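      // (As a reminder of the Win64 convention: the first four arguments go
      // in RCX, RDX, R8 and R9, with XMM0-XMM3 shadowing those same four
      // slots, so spilling the four GPR home slots is enough to cover any
      // XMM varargs as well.)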
      TotalNumIntRegs = 4;
      GPR64ArgRegs = GPR64ArgRegsWin64;
    } else {
      TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
      GPR64ArgRegs = GPR64ArgRegs64Bit;

      NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
                                              TotalNumXMMRegs);
    }
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                     TotalNumIntRegs);

    bool NoImplicitFloatOps = Fn->getAttributes().
      hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
    assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");
    assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
             NoImplicitFloatOps) &&
           "SSE register cannot be used when SSE is disabled!");
    if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
        !Subtarget->hasSSE1())
      // Kernel mode asks for SSE to be disabled, so don't push them
      // on the stack.
      TotalNumXMMRegs = 0;

    if (IsWin64) {
      const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
      // Get to the caller-allocated home save location. Add 8 to account
      // for the return address.
      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
      FuncInfo->setRegSaveFrameIndex(
          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
      FuncInfo->setRegSaveFrameIndex(
          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
                                 false));
    }

    // Store the integer parameter registers.
    SmallVector<SDValue, 8> MemOps;
    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                      getPointerTy());
    unsigned Offset = FuncInfo->getVarArgsGPOffset();
    for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                DAG.getIntPtrConstant(Offset));
      unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
                                   &X86::GR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
      SDValue Store =
        DAG.getStore(Val.getValue(1), dl, Val, FIN,
                     MachinePointerInfo::getFixedStack(
                       FuncInfo->getRegSaveFrameIndex(), Offset),
                     false, false, 0);
      MemOps.push_back(Store);
      Offset += 8;
    }

    if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
      // Now store the XMM (fp + vector) parameter registers.
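      // These stores are bundled into a single VASTART_SAVE_XMM_REGS node; as
      // a sketch of the intent, its later expansion guards the spills on %al
      // (the SSE register count the caller passes for varargs calls), so they
      // are skipped when no vector arguments were actually passed.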
2387 SmallVector<SDValue, 11> SaveXMMOps; 2388 SaveXMMOps.push_back(Chain); 2389 2390 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 2391 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 2392 SaveXMMOps.push_back(ALVal); 2393 2394 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2395 FuncInfo->getRegSaveFrameIndex())); 2396 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2397 FuncInfo->getVarArgsFPOffset())); 2398 2399 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 2400 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 2401 &X86::VR128RegClass); 2402 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 2403 SaveXMMOps.push_back(Val); 2404 } 2405 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 2406 MVT::Other, 2407 &SaveXMMOps[0], SaveXMMOps.size())); 2408 } 2409 2410 if (!MemOps.empty()) 2411 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2412 &MemOps[0], MemOps.size()); 2413 } 2414 } 2415 2416 // Some CCs need callee pop. 2417 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2418 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2419 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 2420 } else { 2421 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 2422 // If this is an sret function, the return should pop the hidden pointer. 2423 if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2424 argsAreStructReturn(Ins) == StackStructReturn) 2425 FuncInfo->setBytesToPopOnReturn(4); 2426 } 2427 2428 if (!Is64Bit) { 2429 // RegSaveFrameIndex is X86-64 only. 2430 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 2431 if (CallConv == CallingConv::X86_FastCall || 2432 CallConv == CallingConv::X86_ThisCall) 2433 // fastcc functions can't have varargs. 2434 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 2435 } 2436 2437 FuncInfo->setArgumentStackSize(StackSize); 2438 2439 return Chain; 2440} 2441 2442SDValue 2443X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 2444 SDValue StackPtr, SDValue Arg, 2445 SDLoc dl, SelectionDAG &DAG, 2446 const CCValAssign &VA, 2447 ISD::ArgFlagsTy Flags) const { 2448 unsigned LocMemOffset = VA.getLocMemOffset(); 2449 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 2450 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 2451 if (Flags.isByVal()) 2452 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 2453 2454 return DAG.getStore(Chain, dl, Arg, PtrOff, 2455 MachinePointerInfo::getStack(LocMemOffset), 2456 false, false, 0); 2457} 2458 2459/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 2460/// optimization is performed and it is required. 2461SDValue 2462X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 2463 SDValue &OutRetAddr, SDValue Chain, 2464 bool IsTailCall, bool Is64Bit, 2465 int FPDiff, SDLoc dl) const { 2466 // Adjust the Return address stack slot. 2467 EVT VT = getPointerTy(); 2468 OutRetAddr = getReturnAddressFrameIndex(DAG); 2469 2470 // Load the "old" Return address. 2471 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 2472 false, false, false, 0); 2473 return SDValue(OutRetAddr.getNode(), 1); 2474} 2475 2476/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 2477/// optimization is performed and it is required (FPDiff!=0). 
2478static SDValue 2479EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 2480 SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, 2481 unsigned SlotSize, int FPDiff, SDLoc dl) { 2482 // Store the return address to the appropriate stack slot. 2483 if (!FPDiff) return Chain; 2484 // Calculate the new stack slot for the return address. 2485 int NewReturnAddrFI = 2486 MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, 2487 false); 2488 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); 2489 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 2490 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 2491 false, false, 0); 2492 return Chain; 2493} 2494 2495SDValue 2496X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2497 SmallVectorImpl<SDValue> &InVals) const { 2498 SelectionDAG &DAG = CLI.DAG; 2499 SDLoc &dl = CLI.DL; 2500 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2501 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2502 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2503 SDValue Chain = CLI.Chain; 2504 SDValue Callee = CLI.Callee; 2505 CallingConv::ID CallConv = CLI.CallConv; 2506 bool &isTailCall = CLI.IsTailCall; 2507 bool isVarArg = CLI.IsVarArg; 2508 2509 MachineFunction &MF = DAG.getMachineFunction(); 2510 bool Is64Bit = Subtarget->is64Bit(); 2511 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); 2512 bool IsWindows = Subtarget->isTargetWindows(); 2513 StructReturnType SR = callIsStructReturn(Outs); 2514 bool IsSibcall = false; 2515 2516 if (MF.getTarget().Options.DisableTailCalls) 2517 isTailCall = false; 2518 2519 if (isTailCall) { 2520 // Check if it's really possible to do a tail call. 2521 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 2522 isVarArg, SR != NotStructReturn, 2523 MF.getFunction()->hasStructRetAttr(), CLI.RetTy, 2524 Outs, OutVals, Ins, DAG); 2525 2526 // Sibcalls are automatically detected tailcalls which do not require 2527 // ABI changes. 2528 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) 2529 IsSibcall = true; 2530 2531 if (isTailCall) 2532 ++NumTailCalls; 2533 } 2534 2535 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 2536 "Var args not supported with calling convention fastcc, ghc or hipe"); 2537 2538 // Analyze operands of the call, assigning locations to each operand. 2539 SmallVector<CCValAssign, 16> ArgLocs; 2540 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2541 ArgLocs, *DAG.getContext()); 2542 2543 // Allocate shadow area for Win64 2544 if (IsWin64) 2545 CCInfo.AllocateStack(32, 8); 2546 2547 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2548 2549 // Get a count of how many bytes are to be pushed on the stack. 2550 unsigned NumBytes = CCInfo.getNextStackOffset(); 2551 if (IsSibcall) 2552 // This is a sibcall. The memory operands are available in caller's 2553 // own caller's stack. 2554 NumBytes = 0; 2555 else if (getTargetMachine().Options.GuaranteedTailCallOpt && 2556 IsTailCallConvention(CallConv)) 2557 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 2558 2559 int FPDiff = 0; 2560 if (isTailCall && !IsSibcall) { 2561 // Lower arguments at fp - stackoffset + fpdiff. 2562 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); 2563 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); 2564 2565 FPDiff = NumBytesCallerPushed - NumBytes; 2566 2567 // Set the delta of movement of the returnaddr stackslot. 2568 // But only set if delta is greater than previous delta. 
    if (FPDiff < X86Info->getTCReturnAddrDelta())
      X86Info->setTCReturnAddrDelta(FPDiff);
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
                                 dl);

  SDValue RetAddrFrIdx;
  // Load return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (RegVT.is128BitVector()) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
                           MachinePointerInfo::getFixedStack(FI),
                           false, false, 0);
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      if (isVarArg && IsWin64) {
        // Win64 ABI requires argument XMM reg to be copied to the
        // corresponding shadow reg if callee is a varargs function.
        unsigned ShadowReg = 0;
        switch (VA.getLocReg()) {
        case X86::XMM0: ShadowReg = X86::RCX; break;
        case X86::XMM1: ShadowReg = X86::RDX; break;
        case X86::XMM2: ShadowReg = X86::R8; break;
        case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      assert(VA.isMemLoc());
      if (StackPtr.getNode() == 0)
        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                      getPointerTy());
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  if (Subtarget->isPICStyleGOT()) {
    // ELF / PIC requires the GOT pointer in the EBX register before function
    // calls via the PLT.
    if (!isTailCall) {
      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
               DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
    } else {
      // If we are tail calling and generating PIC/GOT style code load the
      // address of the callee into ECX. The value in ecx is used as target of
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
      // for tail calls on PIC/GOT architectures. Normally we would just put
      // the address of GOT into ebx and then call target@PLT. But for tail
      // calls ebx would be restored (since ebx is callee saved) before jumping
      // to the target@PLT.

      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg && !IsWin64) {
    // From AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const uint16_t XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
  }

  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (VA.isRegLoc())
          continue;
        assert(VA.isMemLoc());
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Create frame index.
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
        FIN = DAG.getFrameIndex(FI, getPointerTy());

        if (Flags.isByVal()) {
          // Copy relative to framepointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, dl,
                                          RegInfo->getStackRegister(),
                                          getPointerTy());
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain,
                                                           Flags, DAG, dl));
        } else {
          // Store relative to framepointer.
          MemOpChains2.push_back(
            DAG.getStore(ArgChain, dl, Arg, FIN,
                         MachinePointerInfo::getFixedStack(FI),
                         false, false, 0));
        }
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains2[0], MemOpChains2.size());

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
                                     getPointerTy(), RegInfo->getSlotSize(),
                                     FPDiff, dl);
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't
    // hack it.

    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.
    const GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportLinkage()) {
      unsigned char OpFlags = 0;
      bool ExtraLoad = false;
      unsigned WrapperKind = ISD::DELETED_NODE;

      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
      // external symbols must go through the PLT in PIC mode. If the symbol
      // has hidden or protected visibility, or if it is static or local, then
      // we don't need to use the PLT - we can directly call it.
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
        OpFlags = X86II::MO_PLT;
      } else if (Subtarget->isPICStyleStubAny() &&
                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
                 (!Subtarget->getTargetTriple().isMacOSX() ||
                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
        // PC-relative references to external symbols should go through $stub,
        // unless we're building with the leopard linker or later, which
        // automatically synthesizes these stubs.
        OpFlags = X86II::MO_DARWIN_STUB;
      } else if (Subtarget->isPICStyleRIPRel() &&
                 isa<Function>(GV) &&
                 cast<Function>(GV)->getAttributes().
                   hasAttribute(AttributeSet::FunctionIndex,
                                Attribute::NonLazyBind)) {
        // If the function is marked as non-lazy, generate an indirect call
        // which loads from the GOT directly.
This avoids runtime overhead 2818 // at the cost of eager binding (and one extra byte of encoding). 2819 OpFlags = X86II::MO_GOTPCREL; 2820 WrapperKind = X86ISD::WrapperRIP; 2821 ExtraLoad = true; 2822 } 2823 2824 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2825 G->getOffset(), OpFlags); 2826 2827 // Add a wrapper if needed. 2828 if (WrapperKind != ISD::DELETED_NODE) 2829 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2830 // Add extra indirection if needed. 2831 if (ExtraLoad) 2832 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2833 MachinePointerInfo::getGOT(), 2834 false, false, false, 0); 2835 } 2836 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2837 unsigned char OpFlags = 0; 2838 2839 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2840 // external symbols should go through the PLT. 2841 if (Subtarget->isTargetELF() && 2842 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2843 OpFlags = X86II::MO_PLT; 2844 } else if (Subtarget->isPICStyleStubAny() && 2845 (!Subtarget->getTargetTriple().isMacOSX() || 2846 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2847 // PC-relative references to external symbols should go through $stub, 2848 // unless we're building with the leopard linker or later, which 2849 // automatically synthesizes these stubs. 2850 OpFlags = X86II::MO_DARWIN_STUB; 2851 } 2852 2853 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2854 OpFlags); 2855 } 2856 2857 // Returns a chain & a flag for retval copy to use. 2858 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2859 SmallVector<SDValue, 8> Ops; 2860 2861 if (!IsSibcall && isTailCall) { 2862 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2863 DAG.getIntPtrConstant(0, true), InFlag, dl); 2864 InFlag = Chain.getValue(1); 2865 } 2866 2867 Ops.push_back(Chain); 2868 Ops.push_back(Callee); 2869 2870 if (isTailCall) 2871 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2872 2873 // Add argument registers to the end of the list so that they are known live 2874 // into the call. 2875 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2876 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2877 RegsToPass[i].second.getValueType())); 2878 2879 // Add a register mask operand representing the call-preserved registers. 2880 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 2881 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); 2882 assert(Mask && "Missing call preserved mask for calling convention"); 2883 Ops.push_back(DAG.getRegisterMask(Mask)); 2884 2885 if (InFlag.getNode()) 2886 Ops.push_back(InFlag); 2887 2888 if (isTailCall) { 2889 // We used to do: 2890 //// If this is the first return lowered for this function, add the regs 2891 //// to the liveout set for the function. 2892 // This isn't right, although it's probably harmless on x86; liveouts 2893 // should be computed from returns not tail calls. Consider a void 2894 // function making a tail call to a function returning int. 2895 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); 2896 } 2897 2898 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2899 InFlag = Chain.getValue(1); 2900 2901 // Create the CALLSEQ_END node. 
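// (For example, ignoring the struct-return special case handled below: a
// 32-bit stdcall callee taking 12 bytes of arguments pops all 12 bytes
// itself, so NumBytesForCalleeToPush equals NumBytes; a cdecl callee pops
// nothing, so it is 0.)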
2902 unsigned NumBytesForCalleeToPush; 2903 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2904 getTargetMachine().Options.GuaranteedTailCallOpt)) 2905 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2906 else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2907 SR == StackStructReturn) 2908 // If this is a call to a struct-return function, the callee 2909 // pops the hidden struct pointer, so we have to push it back. 2910 // This is common for Darwin/X86, Linux & Mingw32 targets. 2911 // For MSVC Win32 targets, the caller pops the hidden struct pointer. 2912 NumBytesForCalleeToPush = 4; 2913 else 2914 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2915 2916 // Returns a flag for retval copy to use. 2917 if (!IsSibcall) { 2918 Chain = DAG.getCALLSEQ_END(Chain, 2919 DAG.getIntPtrConstant(NumBytes, true), 2920 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2921 true), 2922 InFlag, dl); 2923 InFlag = Chain.getValue(1); 2924 } 2925 2926 // Handle result values, copying them out of physregs into vregs that we 2927 // return. 2928 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2929 Ins, dl, DAG, InVals); 2930} 2931 2932//===----------------------------------------------------------------------===// 2933// Fast Calling Convention (tail call) implementation 2934//===----------------------------------------------------------------------===// 2935 2936// Like the stdcall convention, the callee cleans up the arguments, except 2937// that ECX is reserved for storing the address of the tail-called function. 2938// Only 2 registers are free for argument passing (inreg). Tail call 2939// optimization is performed provided: 2940// * tailcallopt is enabled 2941// * caller/callee are fastcc 2942// On the X86_64 architecture with GOT-style position independent code, only 2943// local (within module) calls are supported at the moment. 2944// To keep the stack aligned according to the platform ABI, the function 2945// GetAlignedArgumentStackSize ensures that the argument delta is always a 2946// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld 2947// for example.) If a tail-called function (the callee) has more arguments than 2948// the caller, the caller needs to make sure that there is room to move the 2949// RETADDR to. This is achieved by reserving an area the size of the argument 2950// delta right after the original RETADDR, but before the saved framepointer 2951// or the spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4) 2952// stack layout: 2953// arg1 2954// arg2 2955// RETADDR 2956// [ new RETADDR 2957// move area ] 2958// (possible EBP) 2959// ESI 2960// EDI 2961// local1 .. 2962 2963/// GetAlignedArgumentStackSize - Make the stack size aligned, e.g. to the form 2964/// 16n + 12 for a 16 byte alignment requirement. 2965unsigned 2966X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2967 SelectionDAG& DAG) const { 2968 MachineFunction &MF = DAG.getMachineFunction(); 2969 const TargetMachine &TM = MF.getTarget(); 2970 const X86RegisterInfo *RegInfo = 2971 static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); 2972 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 2973 unsigned StackAlignment = TFI.getStackAlignment(); 2974 uint64_t AlignMask = StackAlignment - 1; 2975 int64_t Offset = StackSize; 2976 unsigned SlotSize = RegInfo->getSlotSize(); 2977 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2978 // The remainder is no larger than (StackAlignment - SlotSize), so just add the difference.
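// (Worked example, assuming StackAlignment == 16 and SlotSize == 4:
// StackSize == 20 gives Offset & AlignMask == 4 <= 12, so
// Offset = 20 + (12 - 4) == 28, which has the desired 16n + 12 form.)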
2979 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2980 } else { 2981 // Mask out the lower bits and add the stack alignment once, plus the remaining (StackAlignment - SlotSize) bytes. 2982 Offset = ((~AlignMask) & Offset) + StackAlignment + 2983 (StackAlignment-SlotSize); 2984 } 2985 return Offset; 2986} 2987 2988/// MatchingStackOffset - Return true if the given stack call argument is 2989/// already available at the same relative position in the caller's 2990/// incoming argument stack. 2991static 2992bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2993 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2994 const X86InstrInfo *TII) { 2995 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2996 int FI = INT_MAX; 2997 if (Arg.getOpcode() == ISD::CopyFromReg) { 2998 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2999 if (!TargetRegisterInfo::isVirtualRegister(VR)) 3000 return false; 3001 MachineInstr *Def = MRI->getVRegDef(VR); 3002 if (!Def) 3003 return false; 3004 if (!Flags.isByVal()) { 3005 if (!TII->isLoadFromStackSlot(Def, FI)) 3006 return false; 3007 } else { 3008 unsigned Opcode = Def->getOpcode(); 3009 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 3010 Def->getOperand(1).isFI()) { 3011 FI = Def->getOperand(1).getIndex(); 3012 Bytes = Flags.getByValSize(); 3013 } else 3014 return false; 3015 } 3016 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 3017 if (Flags.isByVal()) 3018 // ByVal argument is passed in as a pointer but it's now being 3019 // dereferenced. e.g. 3020 // define @foo(%struct.X* %A) { 3021 // tail call @bar(%struct.X* byval %A) 3022 // } 3023 return false; 3024 SDValue Ptr = Ld->getBasePtr(); 3025 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 3026 if (!FINode) 3027 return false; 3028 FI = FINode->getIndex(); 3029 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 3030 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 3031 FI = FINode->getIndex(); 3032 Bytes = Flags.getByValSize(); 3033 } else 3034 return false; 3035 3036 assert(FI != INT_MAX); 3037 if (!MFI->isFixedObjectIndex(FI)) 3038 return false; 3039 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 3040} 3041 3042/// IsEligibleForTailCallOptimization - Check whether the call is eligible 3043/// for tail call optimization. Targets which want to do tail call 3044/// optimization should implement this function. 3045bool 3046X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 3047 CallingConv::ID CalleeCC, 3048 bool isVarArg, 3049 bool isCalleeStructRet, 3050 bool isCallerStructRet, 3051 Type *RetTy, 3052 const SmallVectorImpl<ISD::OutputArg> &Outs, 3053 const SmallVectorImpl<SDValue> &OutVals, 3054 const SmallVectorImpl<ISD::InputArg> &Ins, 3055 SelectionDAG &DAG) const { 3056 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) 3057 return false; 3058 3059 // If -tailcallopt is specified, make fastcc functions tail-callable. 3060 const MachineFunction &MF = DAG.getMachineFunction(); 3061 const Function *CallerF = MF.getFunction(); 3062 3063 // If the caller's return type is x86_fp80 and the callee's return type is 3064 // not, then the FP_EXTEND of the call result is not a nop. It's not safe to 3065 // perform a tailcall optimization here.
3066 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) 3067 return false; 3068 3069 CallingConv::ID CallerCC = CallerF->getCallingConv(); 3070 bool CCMatch = CallerCC == CalleeCC; 3071 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); 3072 bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); 3073 3074 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 3075 if (IsTailCallConvention(CalleeCC) && CCMatch) 3076 return true; 3077 return false; 3078 } 3079 3080 // Look for obvious safe cases to perform tail call optimization that do not 3081 // require ABI changes. This is what gcc calls sibcall. 3082 3083 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 3084 // emit a special epilogue. 3085 const X86RegisterInfo *RegInfo = 3086 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 3087 if (RegInfo->needsStackRealignment(MF)) 3088 return false; 3089 3090 // Also avoid sibcall optimization if either caller or callee uses struct 3091 // return semantics. 3092 if (isCalleeStructRet || isCallerStructRet) 3093 return false; 3094 3095 // An stdcall caller is expected to clean up its arguments; the callee 3096 // isn't going to do that. 3097 if (!CCMatch && CallerCC == CallingConv::X86_StdCall) 3098 return false; 3099 3100 // Do not sibcall optimize vararg calls unless all arguments are passed via 3101 // registers. 3102 if (isVarArg && !Outs.empty()) { 3103 3104 // Optimizing for varargs on Win64 is unlikely to be safe without 3105 // additional testing. 3106 if (IsCalleeWin64 || IsCallerWin64) 3107 return false; 3108 3109 SmallVector<CCValAssign, 16> ArgLocs; 3110 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 3111 getTargetMachine(), ArgLocs, *DAG.getContext()); 3112 3113 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3114 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 3115 if (!ArgLocs[i].isRegLoc()) 3116 return false; 3117 } 3118 3119 // If the call result is in ST0 / ST1, it needs to be popped off the x87 3120 // stack. Therefore, if it's not used by the call it is not safe to optimize 3121 // this into a sibcall. 3122 bool Unused = false; 3123 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 3124 if (!Ins[i].Used) { 3125 Unused = true; 3126 break; 3127 } 3128 } 3129 if (Unused) { 3130 SmallVector<CCValAssign, 16> RVLocs; 3131 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 3132 getTargetMachine(), RVLocs, *DAG.getContext()); 3133 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 3134 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 3135 CCValAssign &VA = RVLocs[i]; 3136 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 3137 return false; 3138 } 3139 } 3140 3141 // If the calling conventions do not match, then we'd better make sure the 3142 // results are returned in the same way as what the caller expects. 
3143 if (!CCMatch) { 3144 SmallVector<CCValAssign, 16> RVLocs1; 3145 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 3146 getTargetMachine(), RVLocs1, *DAG.getContext()); 3147 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 3148 3149 SmallVector<CCValAssign, 16> RVLocs2; 3150 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 3151 getTargetMachine(), RVLocs2, *DAG.getContext()); 3152 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 3153 3154 if (RVLocs1.size() != RVLocs2.size()) 3155 return false; 3156 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 3157 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 3158 return false; 3159 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 3160 return false; 3161 if (RVLocs1[i].isRegLoc()) { 3162 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 3163 return false; 3164 } else { 3165 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 3166 return false; 3167 } 3168 } 3169 } 3170 3171 // If the callee takes no arguments then go on to check the results of the 3172 // call. 3173 if (!Outs.empty()) { 3174 // Check if stack adjustment is needed. For now, do not do this if any 3175 // argument is passed on the stack. 3176 SmallVector<CCValAssign, 16> ArgLocs; 3177 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 3178 getTargetMachine(), ArgLocs, *DAG.getContext()); 3179 3180 // Allocate shadow area for Win64 3181 if (IsCalleeWin64) 3182 CCInfo.AllocateStack(32, 8); 3183 3184 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3185 if (CCInfo.getNextStackOffset()) { 3186 MachineFunction &MF = DAG.getMachineFunction(); 3187 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 3188 return false; 3189 3190 // Check if the arguments are already laid out in the right way as 3191 // the caller's fixed stack objects. 3192 MachineFrameInfo *MFI = MF.getFrameInfo(); 3193 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 3194 const X86InstrInfo *TII = 3195 ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); 3196 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3197 CCValAssign &VA = ArgLocs[i]; 3198 SDValue Arg = OutVals[i]; 3199 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3200 if (VA.getLocInfo() == CCValAssign::Indirect) 3201 return false; 3202 if (!VA.isRegLoc()) { 3203 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 3204 MFI, MRI, TII)) 3205 return false; 3206 } 3207 } 3208 } 3209 3210 // If the tailcall address may be in a register, then make sure it's 3211 // possible to register allocate for it. In 32-bit, the call address can 3212 // only target EAX, EDX, or ECX since the tail call must be scheduled after 3213 // callee-saved registers are restored. These happen to be the same 3214 // registers used to pass 'inreg' arguments so watch out for those. 3215 if (!Subtarget->is64Bit() && 3216 ((!isa<GlobalAddressSDNode>(Callee) && 3217 !isa<ExternalSymbolSDNode>(Callee)) || 3218 getTargetMachine().getRelocationModel() == Reloc::PIC_)) { 3219 unsigned NumInRegs = 0; 3220 // In PIC we need an extra register to formulate the address computation 3221 // for the callee. 3222 unsigned MaxInRegs = 3223 (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 
2 : 3; 3224 3225 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3226 CCValAssign &VA = ArgLocs[i]; 3227 if (!VA.isRegLoc()) 3228 continue; 3229 unsigned Reg = VA.getLocReg(); 3230 switch (Reg) { 3231 default: break; 3232 case X86::EAX: case X86::EDX: case X86::ECX: 3233 if (++NumInRegs == MaxInRegs) 3234 return false; 3235 break; 3236 } 3237 } 3238 } 3239 } 3240 3241 return true; 3242} 3243 3244FastISel * 3245X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 3246 const TargetLibraryInfo *libInfo) const { 3247 return X86::createFastISel(funcInfo, libInfo); 3248} 3249 3250//===----------------------------------------------------------------------===// 3251// Other Lowering Hooks 3252//===----------------------------------------------------------------------===// 3253 3254static bool MayFoldLoad(SDValue Op) { 3255 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 3256} 3257 3258static bool MayFoldIntoStore(SDValue Op) { 3259 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 3260} 3261 3262static bool isTargetShuffle(unsigned Opcode) { 3263 switch(Opcode) { 3264 default: return false; 3265 case X86ISD::PSHUFD: 3266 case X86ISD::PSHUFHW: 3267 case X86ISD::PSHUFLW: 3268 case X86ISD::SHUFP: 3269 case X86ISD::PALIGNR: 3270 case X86ISD::MOVLHPS: 3271 case X86ISD::MOVLHPD: 3272 case X86ISD::MOVHLPS: 3273 case X86ISD::MOVLPS: 3274 case X86ISD::MOVLPD: 3275 case X86ISD::MOVSHDUP: 3276 case X86ISD::MOVSLDUP: 3277 case X86ISD::MOVDDUP: 3278 case X86ISD::MOVSS: 3279 case X86ISD::MOVSD: 3280 case X86ISD::UNPCKL: 3281 case X86ISD::UNPCKH: 3282 case X86ISD::VPERMILP: 3283 case X86ISD::VPERM2X128: 3284 case X86ISD::VPERMI: 3285 return true; 3286 } 3287} 3288 3289static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3290 SDValue V1, SelectionDAG &DAG) { 3291 switch(Opc) { 3292 default: llvm_unreachable("Unknown x86 shuffle node"); 3293 case X86ISD::MOVSHDUP: 3294 case X86ISD::MOVSLDUP: 3295 case X86ISD::MOVDDUP: 3296 return DAG.getNode(Opc, dl, VT, V1); 3297 } 3298} 3299 3300static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3301 SDValue V1, unsigned TargetMask, 3302 SelectionDAG &DAG) { 3303 switch(Opc) { 3304 default: llvm_unreachable("Unknown x86 shuffle node"); 3305 case X86ISD::PSHUFD: 3306 case X86ISD::PSHUFHW: 3307 case X86ISD::PSHUFLW: 3308 case X86ISD::VPERMILP: 3309 case X86ISD::VPERMI: 3310 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 3311 } 3312} 3313 3314static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3315 SDValue V1, SDValue V2, unsigned TargetMask, 3316 SelectionDAG &DAG) { 3317 switch(Opc) { 3318 default: llvm_unreachable("Unknown x86 shuffle node"); 3319 case X86ISD::PALIGNR: 3320 case X86ISD::SHUFP: 3321 case X86ISD::VPERM2X128: 3322 return DAG.getNode(Opc, dl, VT, V1, V2, 3323 DAG.getConstant(TargetMask, MVT::i8)); 3324 } 3325} 3326 3327static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3328 SDValue V1, SDValue V2, SelectionDAG &DAG) { 3329 switch(Opc) { 3330 default: llvm_unreachable("Unknown x86 shuffle node"); 3331 case X86ISD::MOVLHPS: 3332 case X86ISD::MOVLHPD: 3333 case X86ISD::MOVHLPS: 3334 case X86ISD::MOVLPS: 3335 case X86ISD::MOVLPD: 3336 case X86ISD::MOVSS: 3337 case X86ISD::MOVSD: 3338 case X86ISD::UNPCKL: 3339 case X86ISD::UNPCKH: 3340 return DAG.getNode(Opc, dl, VT, V1, V2); 3341 } 3342} 3343 3344SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 3345 MachineFunction &MF = 
DAG.getMachineFunction(); 3346 const X86RegisterInfo *RegInfo = 3347 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 3348 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 3349 int ReturnAddrIndex = FuncInfo->getRAIndex(); 3350 3351 if (ReturnAddrIndex == 0) { 3352 // Set up a frame object for the return address. 3353 unsigned SlotSize = RegInfo->getSlotSize(); 3354 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, 3355 -(int64_t)SlotSize, 3356 false); 3357 FuncInfo->setRAIndex(ReturnAddrIndex); 3358 } 3359 3360 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 3361} 3362 3363bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 3364 bool hasSymbolicDisplacement) { 3365 // The offset should fit into a 32-bit immediate field. 3366 if (!isInt<32>(Offset)) 3367 return false; 3368 3369 // If we don't have a symbolic displacement - we don't have any extra 3370 // restrictions. 3371 if (!hasSymbolicDisplacement) 3372 return true; 3373 3374 // FIXME: Some tweaks might be needed for medium code model. 3375 if (M != CodeModel::Small && M != CodeModel::Kernel) 3376 return false; 3377 3378 // For the small code model, we assume that the latest object ends 16MB 3379 // before the end of the 31-bit boundary. We may also accept pretty large 3380 // negative constants knowing that all objects are in the positive half of 3381 // the address space. 3381 if (M == CodeModel::Small && Offset < 16*1024*1024) 3382 return true; 3383 3384 // For the kernel code model, we know that all objects reside in the negative 3385 // half of the 32-bit address space. We must not accept negative offsets, 3386 // since they may land just outside an object, but we may accept pretty large 3386 // positive ones. 3387 if (M == CodeModel::Kernel && Offset > 0) 3388 return true; 3389 3390 return false; 3391} 3392 3393/// isCalleePop - Determines whether the callee is required to pop its 3394/// own arguments. Callee pop is necessary to support tail calls. 3395bool X86::isCalleePop(CallingConv::ID CallingConv, 3396 bool is64Bit, bool IsVarArg, bool TailCallOpt) { 3397 if (IsVarArg) 3398 return false; 3399 3400 switch (CallingConv) { 3401 default: 3402 return false; 3403 case CallingConv::X86_StdCall: 3404 return !is64Bit; 3405 case CallingConv::X86_FastCall: 3406 return !is64Bit; 3407 case CallingConv::X86_ThisCall: 3408 return !is64Bit; 3409 case CallingConv::Fast: 3410 return TailCallOpt; 3411 case CallingConv::GHC: 3412 return TailCallOpt; 3413 case CallingConv::HiPE: 3414 return TailCallOpt; 3415 } 3416} 3417 3418/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86 3419/// specific condition code, returning the condition code and the LHS/RHS of the 3420/// comparison to make. 3421static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 3422 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 3423 if (!isFP) { 3424 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 3425 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 3426 // X > -1 -> X == 0, jump !sign. 3427 RHS = DAG.getConstant(0, RHS.getValueType()); 3428 return X86::COND_NS; 3429 } 3430 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 3431 // X < 0 -> X == 0, jump on sign.
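// (e.g. 'x < 0' becomes a compare of x against the existing 0 constant,
// typically selected as TEST x, x, followed by a jump on the sign flag (JS);
// RHS is already 0, so it is left unchanged here.)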
3432 return X86::COND_S; 3433 } 3434 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 3435 // X < 1 -> X <= 0 3436 RHS = DAG.getConstant(0, RHS.getValueType()); 3437 return X86::COND_LE; 3438 } 3439 } 3440 3441 switch (SetCCOpcode) { 3442 default: llvm_unreachable("Invalid integer condition!"); 3443 case ISD::SETEQ: return X86::COND_E; 3444 case ISD::SETGT: return X86::COND_G; 3445 case ISD::SETGE: return X86::COND_GE; 3446 case ISD::SETLT: return X86::COND_L; 3447 case ISD::SETLE: return X86::COND_LE; 3448 case ISD::SETNE: return X86::COND_NE; 3449 case ISD::SETULT: return X86::COND_B; 3450 case ISD::SETUGT: return X86::COND_A; 3451 case ISD::SETULE: return X86::COND_BE; 3452 case ISD::SETUGE: return X86::COND_AE; 3453 } 3454 } 3455 3456 // First determine if it is required or is profitable to flip the operands. 3457 3458 // If LHS is a foldable load, but RHS is not, flip the condition. 3459 if (ISD::isNON_EXTLoad(LHS.getNode()) && 3460 !ISD::isNON_EXTLoad(RHS.getNode())) { 3461 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 3462 std::swap(LHS, RHS); 3463 } 3464 3465 switch (SetCCOpcode) { 3466 default: break; 3467 case ISD::SETOLT: 3468 case ISD::SETOLE: 3469 case ISD::SETUGT: 3470 case ISD::SETUGE: 3471 std::swap(LHS, RHS); 3472 break; 3473 } 3474 3475 // On a floating point condition, the flags are set as follows: 3476 // ZF PF CF op 3477 // 0 | 0 | 0 | X > Y 3478 // 0 | 0 | 1 | X < Y 3479 // 1 | 0 | 0 | X == Y 3480 // 1 | 1 | 1 | unordered 3481 switch (SetCCOpcode) { 3482 default: llvm_unreachable("Condcode should be pre-legalized away"); 3483 case ISD::SETUEQ: 3484 case ISD::SETEQ: return X86::COND_E; 3485 case ISD::SETOLT: // flipped 3486 case ISD::SETOGT: 3487 case ISD::SETGT: return X86::COND_A; 3488 case ISD::SETOLE: // flipped 3489 case ISD::SETOGE: 3490 case ISD::SETGE: return X86::COND_AE; 3491 case ISD::SETUGT: // flipped 3492 case ISD::SETULT: 3493 case ISD::SETLT: return X86::COND_B; 3494 case ISD::SETUGE: // flipped 3495 case ISD::SETULE: 3496 case ISD::SETLE: return X86::COND_BE; 3497 case ISD::SETONE: 3498 case ISD::SETNE: return X86::COND_NE; 3499 case ISD::SETUO: return X86::COND_P; 3500 case ISD::SETO: return X86::COND_NP; 3501 case ISD::SETOEQ: 3502 case ISD::SETUNE: return X86::COND_INVALID; 3503 } 3504} 3505 3506/// hasFPCMov - Return true if there is a floating point cmov for the specific 3507/// X86 condition code. The current x86 ISA includes the following FP cmov 3508/// instructions: fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 3509static bool hasFPCMov(unsigned X86CC) { 3510 switch (X86CC) { 3511 default: 3512 return false; 3513 case X86::COND_B: 3514 case X86::COND_BE: 3515 case X86::COND_E: 3516 case X86::COND_P: 3517 case X86::COND_A: 3518 case X86::COND_AE: 3519 case X86::COND_NE: 3520 case X86::COND_NP: 3521 return true; 3522 } 3523} 3524 3525/// isFPImmLegal - Returns true if the target can instruction select the 3526/// specified FP immediate natively. If false, the legalizer will 3527/// materialize the FP immediate as a load from a constant pool. 3528bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 3529 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 3530 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 3531 return true; 3532 } 3533 return false; 3534} 3535 3536/// isUndefOrInRange - Return true if Val is undef or if its value falls within 3537/// the specified half-open range [Low, Hi).
3538 static bool isUndefOrInRange(int Val, int Low, int Hi) { 3539 return (Val < 0) || (Val >= Low && Val < Hi); 3540} 3541 3542/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 3543/// specified value. 3544static bool isUndefOrEqual(int Val, int CmpVal) { 3545 return (Val < 0 || Val == CmpVal); 3546} 3547 3548/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning 3549/// at position Pos and ending at Pos+Size, either falls within the specified 3550/// sequential range [Low, Low+Size) or is undef. 3551static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, 3552 unsigned Pos, unsigned Size, int Low) { 3553 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) 3554 if (!isUndefOrEqual(Mask[i], Low)) 3555 return false; 3556 return true; 3557} 3558 3559/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 3560/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 3561/// the second operand. 3562static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) { 3563 if (VT == MVT::v4f32 || VT == MVT::v4i32) 3564 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 3565 if (VT == MVT::v2f64 || VT == MVT::v2i64) 3566 return (Mask[0] < 2 && Mask[1] < 2); 3567 return false; 3568} 3569 3570/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 3571/// is suitable for input to PSHUFHW. 3572static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { 3573 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) 3574 return false; 3575 3576 // Lower quadword copied in order or undef. 3577 if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) 3578 return false; 3579 3580 // Upper quadword shuffled. 3581 for (unsigned i = 4; i != 8; ++i) 3582 if (!isUndefOrInRange(Mask[i], 4, 8)) 3583 return false; 3584 3585 if (VT == MVT::v16i16) { 3586 // Lower quadword copied in order or undef. 3587 if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) 3588 return false; 3589 3590 // Upper quadword shuffled. 3591 for (unsigned i = 12; i != 16; ++i) 3592 if (!isUndefOrInRange(Mask[i], 12, 16)) 3593 return false; 3594 } 3595 3596 return true; 3597} 3598 3599/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 3600/// is suitable for input to PSHUFLW. 3601static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { 3602 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) 3603 return false; 3604 3605 // Upper quadword copied in order. 3606 if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) 3607 return false; 3608 3609 // Lower quadword shuffled. 3610 for (unsigned i = 0; i != 4; ++i) 3611 if (!isUndefOrInRange(Mask[i], 0, 4)) 3612 return false; 3613 3614 if (VT == MVT::v16i16) { 3615 // Upper quadword copied in order. 3616 if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) 3617 return false; 3618 3619 // Lower quadword shuffled. 3620 for (unsigned i = 8; i != 12; ++i) 3621 if (!isUndefOrInRange(Mask[i], 8, 12)) 3622 return false; 3623 } 3624 3625 return true; 3626} 3627 3628/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 3629/// is suitable for input to PALIGNR. 3630static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, 3631 const X86Subtarget *Subtarget) { 3632 if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || 3633 (VT.is256BitVector() && !Subtarget->hasInt256())) 3634 return false; 3635 3636 unsigned NumElts = VT.getVectorNumElements(); 3637 unsigned NumLanes = VT.is512BitVector() ?
1: VT.getSizeInBits()/128; 3638 unsigned NumLaneElts = NumElts/NumLanes; 3639 3640 // Do not handle 64-bit element shuffles with palignr. 3641 if (NumLaneElts == 2) 3642 return false; 3643 3644 for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { 3645 unsigned i; 3646 for (i = 0; i != NumLaneElts; ++i) { 3647 if (Mask[i+l] >= 0) 3648 break; 3649 } 3650 3651 // Lane is all undef, go to next lane 3652 if (i == NumLaneElts) 3653 continue; 3654 3655 int Start = Mask[i+l]; 3656 3657 // Make sure it's in this lane in one of the sources 3658 if (!isUndefOrInRange(Start, l, l+NumLaneElts) && 3659 !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) 3660 return false; 3661 3662 // If not lane 0, then we must match lane 0 3663 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) 3664 return false; 3665 3666 // Correct second source to be contiguous with first source 3667 if (Start >= (int)NumElts) 3668 Start -= NumElts - NumLaneElts; 3669 3670 // Make sure we're shifting in the right direction. 3671 if (Start <= (int)(i+l)) 3672 return false; 3673 3674 Start -= i; 3675 3676 // Check the rest of the elements to see if they are consecutive. 3677 for (++i; i != NumLaneElts; ++i) { 3678 int Idx = Mask[i+l]; 3679 3680 // Make sure it's in this lane 3681 if (!isUndefOrInRange(Idx, l, l+NumLaneElts) && 3682 !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts)) 3683 return false; 3684 3685 // If not lane 0, then we must match lane 0 3686 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l)) 3687 return false; 3688 3689 if (Idx >= (int)NumElts) 3690 Idx -= NumElts - NumLaneElts; 3691 3692 if (!isUndefOrEqual(Idx, Start+i)) 3693 return false; 3694 3695 } 3696 } 3697 3698 return true; 3699} 3700 3701/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3702/// the two vector operands have swapped position. 3703static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, 3704 unsigned NumElems) { 3705 for (unsigned i = 0; i != NumElems; ++i) { 3706 int idx = Mask[i]; 3707 if (idx < 0) 3708 continue; 3709 else if (idx < (int)NumElems) 3710 Mask[i] = idx + NumElems; 3711 else 3712 Mask[i] = idx - NumElems; 3713 } 3714} 3715 3716/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 3717/// specifies a shuffle of elements that is suitable for input to 128/256-bit 3718/// SHUFPS and SHUFPD. If Commuted is true, then it checks for the sources to 3719/// be in the reverse order of what x86 shuffles want. 3720static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) { 3721 3722 unsigned NumElems = VT.getVectorNumElements(); 3723 unsigned NumLanes = VT.getSizeInBits()/128; 3724 unsigned NumLaneElems = NumElems/NumLanes; 3725 3726 if (NumLaneElems != 2 && NumLaneElems != 4) 3727 return false; 3728 3729 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 3730 bool symetricMaskRequired = 3731 (VT.getSizeInBits() >= 256) && (EltSize == 32); 3732 3733 // VSHUFPSY divides the resulting vector into 4 chunks. 3734 // The sources are also split into 4 chunks, and each destination 3735 // chunk must come from a different source chunk. 3736 // 3737 // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 3738 // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 3739 // 3740 // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, 3741 // Y3..Y0, Y3..Y0, X3..X0, X3..X0 3742 // 3743 // VSHUFPDY divides the resulting vector into 4 chunks. 3744 // The sources are also split into 4 chunks, and each destination 3745 // chunk must come from a different source chunk.
3746 // 3747 // SRC1 => X3 X2 X1 X0 3748 // SRC2 => Y3 Y2 Y1 Y0 3749 // 3750 // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 3751 // 3752 SmallVector<int, 4> MaskVal(NumLaneElems, -1); 3753 unsigned HalfLaneElems = NumLaneElems/2; 3754 for (unsigned l = 0; l != NumElems; l += NumLaneElems) { 3755 for (unsigned i = 0; i != NumLaneElems; ++i) { 3756 int Idx = Mask[i+l]; 3757 unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0); 3758 if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems)) 3759 return false; 3760 // For VSHUFPSY, the mask of the second half must be the same as the 3761 // first but with the appropriate offsets. This works in the same way as 3762 // VPERMILPS works with masks. 3763 if (!symetricMaskRequired || Idx < 0) 3764 continue; 3765 if (MaskVal[i] < 0) { 3766 MaskVal[i] = Idx - l; 3767 continue; 3768 } 3769 if ((signed)(Idx - l) != MaskVal[i]) 3770 return false; 3771 } 3772 } 3773 3774 return true; 3775} 3776 3777/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3778/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 3779static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) { 3780 if (!VT.is128BitVector()) 3781 return false; 3782 3783 unsigned NumElems = VT.getVectorNumElements(); 3784 3785 if (NumElems != 4) 3786 return false; 3787 3788 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3789 return isUndefOrEqual(Mask[0], 6) && 3790 isUndefOrEqual(Mask[1], 7) && 3791 isUndefOrEqual(Mask[2], 2) && 3792 isUndefOrEqual(Mask[3], 3); 3793} 3794 3795/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3796/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3797/// <2, 3, 2, 3> 3798static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) { 3799 if (!VT.is128BitVector()) 3800 return false; 3801 3802 unsigned NumElems = VT.getVectorNumElements(); 3803 3804 if (NumElems != 4) 3805 return false; 3806 3807 return isUndefOrEqual(Mask[0], 2) && 3808 isUndefOrEqual(Mask[1], 3) && 3809 isUndefOrEqual(Mask[2], 2) && 3810 isUndefOrEqual(Mask[3], 3); 3811} 3812 3813/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3814/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3815static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) { 3816 if (!VT.is128BitVector()) 3817 return false; 3818 3819 unsigned NumElems = VT.getVectorNumElements(); 3820 3821 if (NumElems != 2 && NumElems != 4) 3822 return false; 3823 3824 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3825 if (!isUndefOrEqual(Mask[i], i + NumElems)) 3826 return false; 3827 3828 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) 3829 if (!isUndefOrEqual(Mask[i], i)) 3830 return false; 3831 3832 return true; 3833} 3834 3835/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3836/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3837static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) { 3838 if (!VT.is128BitVector()) 3839 return false; 3840 3841 unsigned NumElems = VT.getVectorNumElements(); 3842 3843 if (NumElems != 2 && NumElems != 4) 3844 return false; 3845 3846 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3847 if (!isUndefOrEqual(Mask[i], i)) 3848 return false; 3849 3850 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3851 if (!isUndefOrEqual(Mask[i + e], i + NumElems)) 3852 return false; 3853 3854 return true; 3855} 3856 3857// 3858// Some special combinations that can be optimized. 
3859// 3860static 3861SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, 3862 SelectionDAG &DAG) { 3863 MVT VT = SVOp->getSimpleValueType(0); 3864 SDLoc dl(SVOp); 3865 3866 if (VT != MVT::v8i32 && VT != MVT::v8f32) 3867 return SDValue(); 3868 3869 ArrayRef<int> Mask = SVOp->getMask(); 3870 3871 // These are the special masks that may be optimized. 3872 static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; 3873 static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; 3874 bool MatchEvenMask = true; 3875 bool MatchOddMask = true; 3876 for (int i=0; i<8; ++i) { 3877 if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) 3878 MatchEvenMask = false; 3879 if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) 3880 MatchOddMask = false; 3881 } 3882 3883 if (!MatchEvenMask && !MatchOddMask) 3884 return SDValue(); 3885 3886 SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); 3887 3888 SDValue Op0 = SVOp->getOperand(0); 3889 SDValue Op1 = SVOp->getOperand(1); 3890 3891 if (MatchEvenMask) { 3892 // Shift the second operand right by 32 bits. 3893 static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; 3894 Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); 3895 } else { 3896 // Shift the first operand left by 32 bits. 3897 static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; 3898 Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); 3899 } 3900 static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; 3901 return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); 3902} 3903 3904/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3905/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3906static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT, 3907 bool HasInt256, bool V2IsSplat = false) { 3908 3909 assert(VT.getSizeInBits() >= 128 && 3910 "Unsupported vector type for unpckl"); 3911 3912 // AVX defines UNPCK* to operate independently on 128-bit lanes. 3913 unsigned NumLanes; 3914 unsigned NumOf256BitLanes; 3915 unsigned NumElts = VT.getVectorNumElements(); 3916 if (VT.is256BitVector()) { 3917 if (NumElts != 4 && NumElts != 8 && 3918 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 3919 return false; 3920 NumLanes = 2; 3921 NumOf256BitLanes = 1; 3922 } else if (VT.is512BitVector()) { 3923 assert(VT.getScalarType().getSizeInBits() >= 32 && 3924 "Unsupported vector type for unpckl"); 3925 NumLanes = 2; 3926 NumOf256BitLanes = 2; 3927 } else { 3928 NumLanes = 1; 3929 NumOf256BitLanes = 1; 3930 } 3931 3932 unsigned NumEltsInStride = NumElts/NumOf256BitLanes; 3933 unsigned NumLaneElts = NumEltsInStride/NumLanes; 3934 3935 for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { 3936 for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { 3937 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { 3938 int BitI = Mask[l256*NumEltsInStride+l+i]; 3939 int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; 3940 if (!isUndefOrEqual(BitI, j+l256*NumElts)) 3941 return false; 3942 if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) 3943 return false; 3944 if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) 3945 return false; 3946 } 3947 } 3948 } 3949 return true; 3950} 3951 3952/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3953/// specifies a shuffle of elements that is suitable for input to UNPCKH.
3954 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT, 3955 bool HasInt256, bool V2IsSplat = false) { 3956 assert(VT.getSizeInBits() >= 128 && 3957 "Unsupported vector type for unpckh"); 3958 3959 // AVX defines UNPCK* to operate independently on 128-bit lanes. 3960 unsigned NumLanes; 3961 unsigned NumOf256BitLanes; 3962 unsigned NumElts = VT.getVectorNumElements(); 3963 if (VT.is256BitVector()) { 3964 if (NumElts != 4 && NumElts != 8 && 3965 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 3966 return false; 3967 NumLanes = 2; 3968 NumOf256BitLanes = 1; 3969 } else if (VT.is512BitVector()) { 3970 assert(VT.getScalarType().getSizeInBits() >= 32 && 3971 "Unsupported vector type for unpckh"); 3972 NumLanes = 2; 3973 NumOf256BitLanes = 2; 3974 } else { 3975 NumLanes = 1; 3976 NumOf256BitLanes = 1; 3977 } 3978 3979 unsigned NumEltsInStride = NumElts/NumOf256BitLanes; 3980 unsigned NumLaneElts = NumEltsInStride/NumLanes; 3981 3982 for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { 3983 for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { 3984 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { 3985 int BitI = Mask[l256*NumEltsInStride+l+i]; 3986 int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; 3987 if (!isUndefOrEqual(BitI, j+l256*NumElts)) 3988 return false; 3989 if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) 3990 return false; 3991 if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) 3992 return false; 3993 } 3994 } 3995 } 3996 return true; 3997} 3998 3999/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 4000/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 4001/// <0, 0, 1, 1> 4002static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { 4003 unsigned NumElts = VT.getVectorNumElements(); 4004 bool Is256BitVec = VT.is256BitVector(); 4005 4006 if (VT.is512BitVector()) 4007 return false; 4008 assert((VT.is128BitVector() || VT.is256BitVector()) && 4009 "Unsupported vector type for unpckl"); 4010 4011 if (Is256BitVec && NumElts != 4 && NumElts != 8 && 4012 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 4013 return false; 4014 4015 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern 4016 // FIXME: Need a better way to get rid of this, there's no latency difference 4017 // between UNPCKLPD and MOVDDUP, the latter should always be checked first and 4018 // the former later. We should also remove the "_undef" special mask. 4019 if (NumElts == 4 && Is256BitVec) 4020 return false; 4021 4022 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 4023 // independently on 128-bit lanes. 4024 unsigned NumLanes = VT.getSizeInBits()/128; 4025 unsigned NumLaneElts = NumElts/NumLanes; 4026 4027 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 4028 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { 4029 int BitI = Mask[l+i]; 4030 int BitI1 = Mask[l+i+1]; 4031 4032 if (!isUndefOrEqual(BitI, j)) 4033 return false; 4034 if (!isUndefOrEqual(BitI1, j)) 4035 return false; 4036 } 4037 } 4038 4039 return true; 4040} 4041 4042/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 4043/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e.
vector_shuffle v, undef, 4044/// <2, 2, 3, 3> 4045static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { 4046 unsigned NumElts = VT.getVectorNumElements(); 4047 4048 if (VT.is512BitVector()) 4049 return false; 4050 4051 assert((VT.is128BitVector() || VT.is256BitVector()) && 4052 "Unsupported vector type for unpckh"); 4053 4054 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && 4055 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 4056 return false; 4057 4058 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 4059 // independently on 128-bit lanes. 4060 unsigned NumLanes = VT.getSizeInBits()/128; 4061 unsigned NumLaneElts = NumElts/NumLanes; 4062 4063 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 4064 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { 4065 int BitI = Mask[l+i]; 4066 int BitI1 = Mask[l+i+1]; 4067 if (!isUndefOrEqual(BitI, j)) 4068 return false; 4069 if (!isUndefOrEqual(BitI1, j)) 4070 return false; 4071 } 4072 } 4073 return true; 4074} 4075 4076/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 4077/// specifies a shuffle of elements that is suitable for input to MOVSS, 4078/// MOVSD, and MOVD, i.e. setting the lowest element. 4079static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { 4080 if (VT.getVectorElementType().getSizeInBits() < 32) 4081 return false; 4082 if (!VT.is128BitVector()) 4083 return false; 4084 4085 unsigned NumElts = VT.getVectorNumElements(); 4086 4087 if (!isUndefOrEqual(Mask[0], NumElts)) 4088 return false; 4089 4090 for (unsigned i = 1; i != NumElts; ++i) 4091 if (!isUndefOrEqual(Mask[i], i)) 4092 return false; 4093 4094 return true; 4095} 4096 4097/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered 4098/// as permutations between 128-bit chunks or halves. As an example, the 4099/// shuffle below: 4100/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> 4101/// The first half comes from the second half of V1 and the second half from 4102/// the second half of V2. 4103static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { 4104 if (!HasFp256 || !VT.is256BitVector()) 4105 return false; 4106 4107 // The shuffle result is divided into half A and half B. In total the two 4108 // sources have 4 halves, namely: C, D, E, F. The final values of A and 4109 // B must come from C, D, E or F. 4110 unsigned HalfSize = VT.getVectorNumElements()/2; 4111 bool MatchA = false, MatchB = false; 4112 4113 // Check if A comes from one of C, D, E, F. 4114 for (unsigned Half = 0; Half != 4; ++Half) { 4115 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { 4116 MatchA = true; 4117 break; 4118 } 4119 } 4120 4121 // Check if B comes from one of C, D, E, F. 4122 for (unsigned Half = 0; Half != 4; ++Half) { 4123 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { 4124 MatchB = true; 4125 break; 4126 } 4127 } 4128 4129 return MatchA && MatchB; 4130} 4131 4132/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle 4133/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
4134 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { 4135 MVT VT = SVOp->getSimpleValueType(0); 4136 4137 unsigned HalfSize = VT.getVectorNumElements()/2; 4138 4139 unsigned FstHalf = 0, SndHalf = 0; 4140 for (unsigned i = 0; i < HalfSize; ++i) { 4141 if (SVOp->getMaskElt(i) > 0) { 4142 FstHalf = SVOp->getMaskElt(i)/HalfSize; 4143 break; 4144 } 4145 } 4146 for (unsigned i = HalfSize; i < HalfSize*2; ++i) { 4147 if (SVOp->getMaskElt(i) > 0) { 4148 SndHalf = SVOp->getMaskElt(i)/HalfSize; 4149 break; 4150 } 4151 } 4152 4153 return (FstHalf | (SndHalf << 4)); 4154} 4155 4156// Symmetric in-lane mask. Each lane has 4 elements (for imm8) 4157static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) { 4158 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 4159 if (EltSize < 32) 4160 return false; 4161 4162 unsigned NumElts = VT.getVectorNumElements(); 4163 Imm8 = 0; 4164 if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) { 4165 for (unsigned i = 0; i != NumElts; ++i) { 4166 if (Mask[i] < 0) 4167 continue; 4168 Imm8 |= Mask[i] << (i*2); 4169 } 4170 return true; 4171 } 4172 4173 unsigned LaneSize = 4; 4174 SmallVector<int, 4> MaskVal(LaneSize, -1); 4175 4176 for (unsigned l = 0; l != NumElts; l += LaneSize) { 4177 for (unsigned i = 0; i != LaneSize; ++i) { 4178 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) 4179 return false; 4180 if (Mask[i+l] < 0) 4181 continue; 4182 if (MaskVal[i] < 0) { 4183 MaskVal[i] = Mask[i+l] - l; 4184 Imm8 |= MaskVal[i] << (i*2); 4185 continue; 4186 } 4187 if (Mask[i+l] != (signed)(MaskVal[i]+l)) 4188 return false; 4189 } 4190 } 4191 return true; 4192} 4193 4194/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand 4195/// specifies a shuffle of elements that is suitable for input to VPERMILPD*. 4196/// Note that VPERMIL mask matching is different depending on whether the 4197/// underlying element type is 32 or 64 bits. In VPERMILPS the high half of the 4198/// mask should point to the same elements as the low half, but within the 4199/// higher half of the source. In VPERMILPD the two lanes could be shuffled 4199/// independently of each other 4200/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. 4201static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) { 4202 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 4203 if (VT.getSizeInBits() < 256 || EltSize < 32) 4204 return false; 4205 bool symetricMaskRequired = (EltSize == 32); 4206 unsigned NumElts = VT.getVectorNumElements(); 4207 4208 unsigned NumLanes = VT.getSizeInBits()/128; 4209 unsigned LaneSize = NumElts/NumLanes; 4210 // 2 or 4 elements in one lane 4211 4212 SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1); 4213 for (unsigned l = 0; l != NumElts; l += LaneSize) { 4214 for (unsigned i = 0; i != LaneSize; ++i) { 4215 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) 4216 return false; 4217 if (symetricMaskRequired) { 4218 if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) { 4219 ExpectedMaskVal[i] = Mask[i+l] - l; 4220 continue; 4221 } 4222 if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l)) 4223 return false; 4224 } 4225 } 4226 } 4227 return true; 4228} 4229 4230/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what 4231/// x86 movss wants: x86 movs requires the lowest element to be the lowest 4232/// element of vector 2 and the other elements to come from vector 1 in order.
4233static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT, 4234 bool V2IsSplat = false, bool V2IsUndef = false) { 4235 if (!VT.is128BitVector()) 4236 return false; 4237 4238 unsigned NumOps = VT.getVectorNumElements(); 4239 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 4240 return false; 4241 4242 if (!isUndefOrEqual(Mask[0], 0)) 4243 return false; 4244 4245 for (unsigned i = 1; i != NumOps; ++i) 4246 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 4247 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 4248 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 4249 return false; 4250 4251 return true; 4252} 4253 4254/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 4255/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 4256/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 4257static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT, 4258 const X86Subtarget *Subtarget) { 4259 if (!Subtarget->hasSSE3()) 4260 return false; 4261 4262 unsigned NumElems = VT.getVectorNumElements(); 4263 4264 if ((VT.is128BitVector() && NumElems != 4) || 4265 (VT.is256BitVector() && NumElems != 8) || 4266 (VT.is512BitVector() && NumElems != 16)) 4267 return false; 4268 4269 // "i+1" is the value the indexed mask element must have 4270 for (unsigned i = 0; i != NumElems; i += 2) 4271 if (!isUndefOrEqual(Mask[i], i+1) || 4272 !isUndefOrEqual(Mask[i+1], i+1)) 4273 return false; 4274 4275 return true; 4276} 4277 4278/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 4279/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 4280/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 4281static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT, 4282 const X86Subtarget *Subtarget) { 4283 if (!Subtarget->hasSSE3()) 4284 return false; 4285 4286 unsigned NumElems = VT.getVectorNumElements(); 4287 4288 if ((VT.is128BitVector() && NumElems != 4) || 4289 (VT.is256BitVector() && NumElems != 8) || 4290 (VT.is512BitVector() && NumElems != 16)) 4291 return false; 4292 4293 // "i" is the value the indexed mask element must have 4294 for (unsigned i = 0; i != NumElems; i += 2) 4295 if (!isUndefOrEqual(Mask[i], i) || 4296 !isUndefOrEqual(Mask[i+1], i)) 4297 return false; 4298 4299 return true; 4300} 4301 4302/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand 4303/// specifies a shuffle of elements that is suitable for input to 256-bit 4304/// version of MOVDDUP. 4305static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { 4306 if (!HasFp256 || !VT.is256BitVector()) 4307 return false; 4308 4309 unsigned NumElts = VT.getVectorNumElements(); 4310 if (NumElts != 4) 4311 return false; 4312 4313 for (unsigned i = 0; i != NumElts/2; ++i) 4314 if (!isUndefOrEqual(Mask[i], 0)) 4315 return false; 4316 for (unsigned i = NumElts/2; i != NumElts; ++i) 4317 if (!isUndefOrEqual(Mask[i], NumElts/2)) 4318 return false; 4319 return true; 4320} 4321 4322/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 4323/// specifies a shuffle of elements that is suitable for input to 128-bit 4324/// version of MOVDDUP. 
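/// For example, for v2f64 the matching mask is <0, 0> (elements may also be
/// undef).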
4325 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) { 4326 if (!VT.is128BitVector()) 4327 return false; 4328 4329 unsigned e = VT.getVectorNumElements() / 2; 4330 for (unsigned i = 0; i != e; ++i) 4331 if (!isUndefOrEqual(Mask[i], i)) 4332 return false; 4333 for (unsigned i = 0; i != e; ++i) 4334 if (!isUndefOrEqual(Mask[e+i], i)) 4335 return false; 4336 return true; 4337} 4338 4339/// isVEXTRACTIndex - Return true if the specified 4340/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 4341/// suitable for instructions that extract 128- or 256-bit vectors 4342static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { 4343 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); 4344 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4345 return false; 4346 4347 // The index should be aligned on a vecWidth-bit boundary. 4348 uint64_t Index = 4349 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4350 4351 MVT VT = N->getSimpleValueType(0); 4352 unsigned ElSize = VT.getVectorElementType().getSizeInBits(); 4353 bool Result = (Index * ElSize) % vecWidth == 0; 4354 4355 return Result; 4356} 4357 4358/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR 4359/// operand specifies a subvector insert that is suitable for input to 4360/// the insertion of 128- or 256-bit subvectors 4361static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { 4362 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); 4363 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4364 return false; 4365 // The index should be aligned on a vecWidth-bit boundary. 4366 uint64_t Index = 4367 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4368 4369 MVT VT = N->getSimpleValueType(0); 4370 unsigned ElSize = VT.getVectorElementType().getSizeInBits(); 4371 bool Result = (Index * ElSize) % vecWidth == 0; 4372 4373 return Result; 4374} 4375 4376bool X86::isVINSERT128Index(SDNode *N) { 4377 return isVINSERTIndex(N, 128); 4378} 4379 4380bool X86::isVINSERT256Index(SDNode *N) { 4381 return isVINSERTIndex(N, 256); 4382} 4383 4384bool X86::isVEXTRACT128Index(SDNode *N) { 4385 return isVEXTRACTIndex(N, 128); 4386} 4387 4388bool X86::isVEXTRACT256Index(SDNode *N) { 4389 return isVEXTRACTIndex(N, 256); 4390} 4391 4392/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 4393/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 4394/// Handles 128-bit and 256-bit. 4395static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { 4396 MVT VT = N->getSimpleValueType(0); 4397 4398 assert((VT.getSizeInBits() >= 128) && 4399 "Unsupported vector type for PSHUF/SHUFP"); 4400 4401 // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate 4402 // independently on 128-bit lanes. 4403 unsigned NumElts = VT.getVectorNumElements(); 4404 unsigned NumLanes = VT.getSizeInBits()/128; 4405 unsigned NumLaneElts = NumElts/NumLanes; 4406 4407 assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) && 4408 "Only supports 2, 4 or 8 elements per lane"); 4409 4410 unsigned Shift = (NumLaneElts >= 4) ?
1 : 0; 4411 unsigned Mask = 0; 4412 for (unsigned i = 0; i != NumElts; ++i) { 4413 int Elt = N->getMaskElt(i); 4414 if (Elt < 0) continue; 4415 Elt &= NumLaneElts - 1; 4416 unsigned ShAmt = (i << Shift) % 8; 4417 Mask |= Elt << ShAmt; 4418 } 4419 4420 return Mask; 4421} 4422 4423/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 4424/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 4425static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { 4426 MVT VT = N->getSimpleValueType(0); 4427 4428 assert((VT == MVT::v8i16 || VT == MVT::v16i16) && 4429 "Unsupported vector type for PSHUFHW"); 4430 4431 unsigned NumElts = VT.getVectorNumElements(); 4432 4433 unsigned Mask = 0; 4434 for (unsigned l = 0; l != NumElts; l += 8) { 4435 // 8 elements per lane, but we only care about the last 4. 4436 for (unsigned i = 0; i < 4; ++i) { 4437 int Elt = N->getMaskElt(l+i+4); 4438 if (Elt < 0) continue; 4439 Elt &= 0x3; // only 2-bits. 4440 Mask |= Elt << (i * 2); 4441 } 4442 } 4443 4444 return Mask; 4445} 4446 4447/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 4448/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 4449static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { 4450 MVT VT = N->getSimpleValueType(0); 4451 4452 assert((VT == MVT::v8i16 || VT == MVT::v16i16) && 4453 "Unsupported vector type for PSHUFLW"); 4454 4455 unsigned NumElts = VT.getVectorNumElements(); 4456 4457 unsigned Mask = 0; 4458 for (unsigned l = 0; l != NumElts; l += 8) { 4459 // 8 elements per lane, but we only care about the first 4. 4460 for (unsigned i = 0; i < 4; ++i) { 4461 int Elt = N->getMaskElt(l+i); 4462 if (Elt < 0) continue; 4463 Elt &= 0x3; // only 2-bits 4464 Mask |= Elt << (i * 2); 4465 } 4466 } 4467 4468 return Mask; 4469} 4470 4471/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 4472/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 4473static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { 4474 MVT VT = SVOp->getSimpleValueType(0); 4475 unsigned EltSize = VT.is512BitVector() ? 1 : 4476 VT.getVectorElementType().getSizeInBits() >> 3; 4477 4478 unsigned NumElts = VT.getVectorNumElements(); 4479 unsigned NumLanes = VT.is512BitVector() ?
1 : VT.getSizeInBits()/128;
4480  unsigned NumLaneElts = NumElts/NumLanes;
4481
4482  int Val = 0;
4483  unsigned i;
4484  for (i = 0; i != NumElts; ++i) {
4485    Val = SVOp->getMaskElt(i);
4486    if (Val >= 0)
4487      break;
4488  }
4489  if (Val >= (int)NumElts)
4490    Val -= NumElts - NumLaneElts;
4491
4492  assert(Val - i > 0 && "PALIGNR imm should be positive");
4493  return (Val - i) * EltSize;
4494}
4495
4496static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4497  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4498  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4499    llvm_unreachable("Illegal extract subvector for VEXTRACT");
4500
4501  uint64_t Index =
4502    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4503
4504  MVT VecVT = N->getOperand(0).getSimpleValueType();
4505  MVT ElVT = VecVT.getVectorElementType();
4506
4507  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4508  return Index / NumElemsPerChunk;
4509}
4510
4511static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4512  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4513  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4514    llvm_unreachable("Illegal insert subvector for VINSERT");
4515
4516  uint64_t Index =
4517    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4518
4519  MVT VecVT = N->getSimpleValueType(0);
4520  MVT ElVT = VecVT.getVectorElementType();
4521
4522  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4523  return Index / NumElemsPerChunk;
4524}
4525
4526/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4527/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4528/// and VEXTRACTI128 instructions.
4529unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4530  return getExtractVEXTRACTImmediate(N, 128);
4531}
4532
4533/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4534/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
4535/// and VEXTRACTI64x4 instructions.
4536unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4537  return getExtractVEXTRACTImmediate(N, 256);
4538}
4539
4540/// getInsertVINSERT128Immediate - Return the appropriate immediate
4541/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
4542/// and VINSERTI128 instructions.
4543unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4544  return getInsertVINSERTImmediate(N, 128);
4545}
4546
4547/// getInsertVINSERT256Immediate - Return the appropriate immediate
4548/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
4549/// and VINSERTI64x4 instructions.
4550unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4551  return getInsertVINSERTImmediate(N, 256);
4552}
4553
4554/// isZeroNode - Returns true if Elt is a constant zero or a floating point
4555/// constant +0.0.
4556bool X86::isZeroNode(SDValue Elt) {
4557  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
4558    return CN->isNullValue();
4559  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
4560    return CFP->getValueAPF().isPosZero();
4561  return false;
4562}
4563
4564/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
4565/// their permute mask.
4566static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 4567 SelectionDAG &DAG) { 4568 MVT VT = SVOp->getSimpleValueType(0); 4569 unsigned NumElems = VT.getVectorNumElements(); 4570 SmallVector<int, 8> MaskVec; 4571 4572 for (unsigned i = 0; i != NumElems; ++i) { 4573 int Idx = SVOp->getMaskElt(i); 4574 if (Idx >= 0) { 4575 if (Idx < (int)NumElems) 4576 Idx += NumElems; 4577 else 4578 Idx -= NumElems; 4579 } 4580 MaskVec.push_back(Idx); 4581 } 4582 return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1), 4583 SVOp->getOperand(0), &MaskVec[0]); 4584} 4585 4586/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 4587/// match movhlps. The lower half elements should come from upper half of 4588/// V1 (and in order), and the upper half elements should come from the upper 4589/// half of V2 (and in order). 4590static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) { 4591 if (!VT.is128BitVector()) 4592 return false; 4593 if (VT.getVectorNumElements() != 4) 4594 return false; 4595 for (unsigned i = 0, e = 2; i != e; ++i) 4596 if (!isUndefOrEqual(Mask[i], i+2)) 4597 return false; 4598 for (unsigned i = 2; i != 4; ++i) 4599 if (!isUndefOrEqual(Mask[i], i+4)) 4600 return false; 4601 return true; 4602} 4603 4604/// isScalarLoadToVector - Returns true if the node is a scalar load that 4605/// is promoted to a vector. It also returns the LoadSDNode by reference if 4606/// required. 4607static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 4608 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 4609 return false; 4610 N = N->getOperand(0).getNode(); 4611 if (!ISD::isNON_EXTLoad(N)) 4612 return false; 4613 if (LD) 4614 *LD = cast<LoadSDNode>(N); 4615 return true; 4616} 4617 4618// Test whether the given value is a vector value which will be legalized 4619// into a load. 4620static bool WillBeConstantPoolLoad(SDNode *N) { 4621 if (N->getOpcode() != ISD::BUILD_VECTOR) 4622 return false; 4623 4624 // Check for any non-constant elements. 4625 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) 4626 switch (N->getOperand(i).getNode()->getOpcode()) { 4627 case ISD::UNDEF: 4628 case ISD::ConstantFP: 4629 case ISD::Constant: 4630 break; 4631 default: 4632 return false; 4633 } 4634 4635 // Vectors of all-zeros and all-ones are materialized with special 4636 // instructions rather than being loaded. 4637 return !ISD::isBuildVectorAllZeros(N) && 4638 !ISD::isBuildVectorAllOnes(N); 4639} 4640 4641/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 4642/// match movlp{s|d}. The lower half elements should come from lower half of 4643/// V1 (and in order), and the upper half elements should come from the upper 4644/// half of V2 (and in order). And since V1 will become the source of the 4645/// MOVLP, it must be either a vector load or a scalar load to vector. 4646static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 4647 ArrayRef<int> Mask, MVT VT) { 4648 if (!VT.is128BitVector()) 4649 return false; 4650 4651 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 4652 return false; 4653 // Is V2 is a vector load, don't do this transformation. We will try to use 4654 // load folding shufps op. 
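  // A worked example for v4f32: the profitable mask here is <0, 1, 6, 7>,
  // i.e. result = <V1[0], V1[1], V2[2], V2[3]>, which is roughly what
  // "movlps (mem), %xmm" computes when V1 is the load feeding the low half.
  // If V2 is itself a load, a shufps with a folded memory operand is usually
  // the better choice, hence the early-outs below.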
4655 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) 4656 return false; 4657 4658 unsigned NumElems = VT.getVectorNumElements(); 4659 4660 if (NumElems != 2 && NumElems != 4) 4661 return false; 4662 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 4663 if (!isUndefOrEqual(Mask[i], i)) 4664 return false; 4665 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) 4666 if (!isUndefOrEqual(Mask[i], i+NumElems)) 4667 return false; 4668 return true; 4669} 4670 4671/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 4672/// all the same. 4673static bool isSplatVector(SDNode *N) { 4674 if (N->getOpcode() != ISD::BUILD_VECTOR) 4675 return false; 4676 4677 SDValue SplatValue = N->getOperand(0); 4678 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 4679 if (N->getOperand(i) != SplatValue) 4680 return false; 4681 return true; 4682} 4683 4684/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 4685/// to an zero vector. 4686/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 4687static bool isZeroShuffle(ShuffleVectorSDNode *N) { 4688 SDValue V1 = N->getOperand(0); 4689 SDValue V2 = N->getOperand(1); 4690 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 4691 for (unsigned i = 0; i != NumElems; ++i) { 4692 int Idx = N->getMaskElt(i); 4693 if (Idx >= (int)NumElems) { 4694 unsigned Opc = V2.getOpcode(); 4695 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 4696 continue; 4697 if (Opc != ISD::BUILD_VECTOR || 4698 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 4699 return false; 4700 } else if (Idx >= 0) { 4701 unsigned Opc = V1.getOpcode(); 4702 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 4703 continue; 4704 if (Opc != ISD::BUILD_VECTOR || 4705 !X86::isZeroNode(V1.getOperand(Idx))) 4706 return false; 4707 } 4708 } 4709 return true; 4710} 4711 4712/// getZeroVector - Returns a vector of specified type with all zero elements. 4713/// 4714static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, 4715 SelectionDAG &DAG, SDLoc dl) { 4716 assert(VT.isVector() && "Expected a vector type"); 4717 4718 // Always build SSE zero vectors as <4 x i32> bitcasted 4719 // to their dest type. This ensures they get CSE'd. 4720 SDValue Vec; 4721 if (VT.is128BitVector()) { // SSE 4722 if (Subtarget->hasSSE2()) { // SSE2 4723 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4724 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4725 } else { // SSE1 4726 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4727 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 4728 } 4729 } else if (VT.is256BitVector()) { // AVX 4730 if (Subtarget->hasInt256()) { // AVX2 4731 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4732 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4733 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 4734 array_lengthof(Ops)); 4735 } else { 4736 // 256-bit logic and arithmetic instructions in AVX are all 4737 // floating-point, no support for integer ops. Emit fp zeroed vectors. 
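      // For instance, a v8i32 zero on plain AVX is built here (roughly) as
      //   t0: v8f32 = BUILD_VECTOR +0.0, ..., +0.0  // selects to vxorps %ymm
      //   t1: v8i32 = BITCAST t0
      // so integer types still get a single 256-bit zeroing idiom.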
4738      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4739      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4740      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
4741                        array_lengthof(Ops));
4742    }
4743  } else if (VT.is512BitVector()) { // AVX-512
4744    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4745    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
4746                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4747    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
4748  } else
4749    llvm_unreachable("Unexpected vector type");
4750
4751  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4752}
4753
4754/// getOnesVector - Returns a vector of specified type with all bits set.
4755/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4756/// no AVX2 support, use two <4 x i32> inserted in an <8 x i32> appropriately.
4757/// Then bitcast to their original type, ensuring they get CSE'd.
4758static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
4759                             SDLoc dl) {
4760  assert(VT.isVector() && "Expected a vector type");
4761
4762  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
4763  SDValue Vec;
4764  if (VT.is256BitVector()) {
4765    if (HasInt256) { // AVX2
4766      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4767      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
4768                        array_lengthof(Ops));
4769    } else { // AVX
4770      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4771      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4772    }
4773  } else if (VT.is128BitVector()) {
4774    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4775  } else
4776    llvm_unreachable("Unexpected vector type");
4777
4778  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4779}
4780
4781/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
4782/// that point to V2 point to its first element.
4783static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
4784  for (unsigned i = 0; i != NumElems; ++i) {
4785    if (Mask[i] > (int)NumElems) {
4786      Mask[i] = NumElems;
4787    }
4788  }
4789}
4790
4791/// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd
4792/// operation of specified width.
4793static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
4794                       SDValue V2) {
4795  unsigned NumElems = VT.getVectorNumElements();
4796  SmallVector<int, 8> Mask;
4797  Mask.push_back(NumElems);
4798  for (unsigned i = 1; i != NumElems; ++i)
4799    Mask.push_back(i);
4800  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4801}
4802
4803/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
4804static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
4805                          SDValue V2) {
4806  unsigned NumElems = VT.getVectorNumElements();
4807  SmallVector<int, 8> Mask;
4808  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4809    Mask.push_back(i);
4810    Mask.push_back(i + NumElems);
4811  }
4812  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4813}
4814
4815/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
4816static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
4817                          SDValue V2) {
4818  unsigned NumElems = VT.getVectorNumElements();
4819  SmallVector<int, 8> Mask;
4820  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4821    Mask.push_back(i + Half);
4822    Mask.push_back(i + NumElems + Half);
4823  }
4824  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4825}
4826
4827// PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
4828// a generic shuffle instruction because the target has no such instructions.
4829// Generate shuffles which repeat i16 and i8 several times until they can be
4830// represented by v4f32 and then be manipulated by target supported shuffles.
4831static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
4832  MVT VT = V.getSimpleValueType();
4833  int NumElems = VT.getVectorNumElements();
4834  SDLoc dl(V);
4835
4836  while (NumElems > 4) {
4837    if (EltNo < NumElems/2) {
4838      V = getUnpackl(DAG, dl, VT, V, V);
4839    } else {
4840      V = getUnpackh(DAG, dl, VT, V, V);
4841      EltNo -= NumElems/2;
4842    }
4843    NumElems >>= 1;
4844  }
4845  return V;
4846}
4847
4848/// getLegalSplat - Generate a legal splat with supported x86 shuffles.
4849static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
4850  MVT VT = V.getSimpleValueType();
4851  SDLoc dl(V);
4852
4853  if (VT.is128BitVector()) {
4854    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
4855    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
4856    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
4857                             &SplatMask[0]);
4858  } else if (VT.is256BitVector()) {
4859    // To use VPERMILPS to splat scalars, the second half of indices must
4860    // refer to the higher part, which is a duplication of the lower one,
4861    // because VPERMILPS can only handle in-lane permutations.
4862    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
4863                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
4864
4865    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
4866    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
4867                             &SplatMask[0]);
4868  } else
4869    llvm_unreachable("Vector size not supported");
4870
4871  return DAG.getNode(ISD::BITCAST, dl, VT, V);
4872}
4873
4874/// PromoteSplat - Splat is promoted to target supported vector shuffles.
4875static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
4876  MVT SrcVT = SV->getSimpleValueType(0);
4877  SDValue V1 = SV->getOperand(0);
4878  SDLoc dl(SV);
4879
4880  int EltNo = SV->getSplatIndex();
4881  int NumElems = SrcVT.getVectorNumElements();
4882  bool Is256BitVec = SrcVT.is256BitVector();
4883
4884  assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
4885         "Unknown how to promote splat for type");
4886
4887  // Extract the 128-bit part containing the splat element and update
4888  // the splat element index when it refers to the higher register.
4889  if (Is256BitVec) {
4890    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
4891    if (EltNo >= NumElems/2)
4892      EltNo -= NumElems/2;
4893  }
4894
4895  // i16 and i8 vector types can't be used directly by a generic shuffle
4896  // instruction because the target has no such instruction. Generate shuffles
4897  // which repeat i16 and i8 several times until they fit in i32, and then can
4898  // be manipulated by target supported shuffles.
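  // For example, splatting element 5 of a v8i16 takes one unpckh step:
  //   V -> <V4, V4, V5, V5, V6, V6, V7, V7>, EltNo rebased from 5 to 1,
  // after which each 16-bit pair can be treated as one v4f32 lane by
  // getLegalSplat below.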
4899 MVT EltVT = SrcVT.getVectorElementType(); 4900 if (EltVT == MVT::i8 || EltVT == MVT::i16) 4901 V1 = PromoteSplati8i16(V1, DAG, EltNo); 4902 4903 // Recreate the 256-bit vector and place the same 128-bit vector 4904 // into the low and high part. This is necessary because we want 4905 // to use VPERM* to shuffle the vectors 4906 if (Is256BitVec) { 4907 V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); 4908 } 4909 4910 return getLegalSplat(DAG, V1, EltNo); 4911} 4912 4913/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4914/// vector of zero or undef vector. This produces a shuffle where the low 4915/// element of V2 is swizzled into the zero/undef vector, landing at element 4916/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 4917static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4918 bool IsZero, 4919 const X86Subtarget *Subtarget, 4920 SelectionDAG &DAG) { 4921 MVT VT = V2.getSimpleValueType(); 4922 SDValue V1 = IsZero 4923 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); 4924 unsigned NumElems = VT.getVectorNumElements(); 4925 SmallVector<int, 16> MaskVec; 4926 for (unsigned i = 0; i != NumElems; ++i) 4927 // If this is the insertion idx, put the low elt of V2 here. 4928 MaskVec.push_back(i == Idx ? NumElems : i); 4929 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); 4930} 4931 4932/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the 4933/// target specific opcode. Returns true if the Mask could be calculated. 4934/// Sets IsUnary to true if only uses one source. 4935static bool getTargetShuffleMask(SDNode *N, MVT VT, 4936 SmallVectorImpl<int> &Mask, bool &IsUnary) { 4937 unsigned NumElems = VT.getVectorNumElements(); 4938 SDValue ImmN; 4939 4940 IsUnary = false; 4941 switch(N->getOpcode()) { 4942 case X86ISD::SHUFP: 4943 ImmN = N->getOperand(N->getNumOperands()-1); 4944 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4945 break; 4946 case X86ISD::UNPCKH: 4947 DecodeUNPCKHMask(VT, Mask); 4948 break; 4949 case X86ISD::UNPCKL: 4950 DecodeUNPCKLMask(VT, Mask); 4951 break; 4952 case X86ISD::MOVHLPS: 4953 DecodeMOVHLPSMask(NumElems, Mask); 4954 break; 4955 case X86ISD::MOVLHPS: 4956 DecodeMOVLHPSMask(NumElems, Mask); 4957 break; 4958 case X86ISD::PALIGNR: 4959 ImmN = N->getOperand(N->getNumOperands()-1); 4960 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4961 break; 4962 case X86ISD::PSHUFD: 4963 case X86ISD::VPERMILP: 4964 ImmN = N->getOperand(N->getNumOperands()-1); 4965 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4966 IsUnary = true; 4967 break; 4968 case X86ISD::PSHUFHW: 4969 ImmN = N->getOperand(N->getNumOperands()-1); 4970 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4971 IsUnary = true; 4972 break; 4973 case X86ISD::PSHUFLW: 4974 ImmN = N->getOperand(N->getNumOperands()-1); 4975 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4976 IsUnary = true; 4977 break; 4978 case X86ISD::VPERMI: 4979 ImmN = N->getOperand(N->getNumOperands()-1); 4980 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4981 IsUnary = true; 4982 break; 4983 case X86ISD::MOVSS: 4984 case X86ISD::MOVSD: { 4985 // The index 0 always comes from the first element of the second source, 4986 // this is why MOVSS and MOVSD are used in the first place. 
The other
4987    // elements come from the remaining positions of the first source vector.
4988    Mask.push_back(NumElems);
4989    for (unsigned i = 1; i != NumElems; ++i) {
4990      Mask.push_back(i);
4991    }
4992    break;
4993  }
4994  case X86ISD::VPERM2X128:
4995    ImmN = N->getOperand(N->getNumOperands()-1);
4996    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4997    if (Mask.empty()) return false;
4998    break;
4999  case X86ISD::MOVDDUP:
5000  case X86ISD::MOVLHPD:
5001  case X86ISD::MOVLPD:
5002  case X86ISD::MOVLPS:
5003  case X86ISD::MOVSHDUP:
5004  case X86ISD::MOVSLDUP:
5005    // Not yet implemented.
5006    return false;
5007  default: llvm_unreachable("unknown target shuffle node");
5008  }
5009
5010  return true;
5011}
5012
5013/// getShuffleScalarElt - Returns the scalar element that will make up the ith
5014/// element of the result of the vector shuffle.
5015static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5016                                   unsigned Depth) {
5017  if (Depth == 6)
5018    return SDValue(); // Limit search depth.
5019
5020  SDValue V = SDValue(N, 0);
5021  EVT VT = V.getValueType();
5022  unsigned Opcode = V.getOpcode();
5023
5024  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5025  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5026    int Elt = SV->getMaskElt(Index);
5027
5028    if (Elt < 0)
5029      return DAG.getUNDEF(VT.getVectorElementType());
5030
5031    unsigned NumElems = VT.getVectorNumElements();
5032    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5033                                         : SV->getOperand(1);
5034    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5035  }
5036
5037  // Recurse into target specific vector shuffles to find scalars.
5038  if (isTargetShuffle(Opcode)) {
5039    MVT ShufVT = V.getSimpleValueType();
5040    unsigned NumElems = ShufVT.getVectorNumElements();
5041    SmallVector<int, 16> ShuffleMask;
5042    bool IsUnary;
5043
5044    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5045      return SDValue();
5046
5047    int Elt = ShuffleMask[Index];
5048    if (Elt < 0)
5049      return DAG.getUNDEF(ShufVT.getVectorElementType());
5050
5051    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5052                                         : N->getOperand(1);
5053    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5054                               Depth+1);
5055  }
5056
5057  // Actual nodes that may contain scalar elements.
5058  if (Opcode == ISD::BITCAST) {
5059    V = V.getOperand(0);
5060    EVT SrcVT = V.getValueType();
5061    unsigned NumElems = VT.getVectorNumElements();
5062
5063    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5064      return SDValue();
5065  }
5066
5067  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5068    return (Index == 0) ? V.getOperand(0)
5069                        : DAG.getUNDEF(VT.getVectorElementType());
5070
5071  if (V.getOpcode() == ISD::BUILD_VECTOR)
5072    return V.getOperand(Index);
5073
5074  return SDValue();
5075}
5076
5077/// getNumOfConsecutiveZeros - Return the number of elements of a vector
5078/// shuffle operation which consecutively come from zeros. The
5079/// search can start in two different directions, from left or right.
5080/// We count undefs as zeros until PreferredNum is reached.
5081static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5082                                         unsigned NumElems, bool ZerosFromLeft,
5083                                         SelectionDAG &DAG,
5084                                         unsigned PreferredNum = -1U) {
5085  unsigned NumZeros = 0;
5086  for (unsigned i = 0; i != NumElems; ++i) {
5087    unsigned Index = ZerosFromLeft ?
i : NumElems - i - 1;
5088    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5089    if (!Elt.getNode())
5090      break;
5091
5092    if (X86::isZeroNode(Elt))
5093      ++NumZeros;
5094    else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5095      NumZeros = std::min(NumZeros + 1, PreferredNum);
5096    else
5097      break;
5098  }
5099
5100  return NumZeros;
5101}
5102
5103/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
5104/// correspond consecutively to elements from one of the vector operands,
5105/// starting from its index OpIdx. Also tell OpNum which source vector operand.
5106static
5107bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5108                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5109                              unsigned NumElems, unsigned &OpNum) {
5110  bool SeenV1 = false;
5111  bool SeenV2 = false;
5112
5113  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5114    int Idx = SVOp->getMaskElt(i);
5115    // Ignore undef indices.
5116    if (Idx < 0)
5117      continue;
5118
5119    if (Idx < (int)NumElems)
5120      SeenV1 = true;
5121    else
5122      SeenV2 = true;
5123
5124    // Only accept consecutive elements from the same vector.
5125    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5126      return false;
5127  }
5128
5129  OpNum = SeenV1 ? 0 : 1;
5130  return true;
5131}
5132
5133/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5134/// logical right shift of a vector.
5135static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5136                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5137  unsigned NumElems =
5138    SVOp->getSimpleValueType(0).getVectorNumElements();
5139  unsigned NumZeros = getNumOfConsecutiveZeros(
5140      SVOp, NumElems, false /* check zeros from right */, DAG,
5141      SVOp->getMaskElt(0));
5142  unsigned OpSrc;
5143
5144  if (!NumZeros)
5145    return false;
5146
5147  // Considering the elements in the mask that are not consecutive zeros,
5148  // check if they consecutively come from only one of the source vectors.
5149  //
5150  //               V1 = {X, A, B, C}     0
5151  //                      \  \  \    /
5152  //   vector_shuffle V1, V2 <1, 2, 3, X>
5153  //
5154  if (!isShuffleMaskConsecutive(SVOp,
5155            0,                 // Mask Start Index
5156            NumElems-NumZeros, // Mask End Index (exclusive)
5157            NumZeros,          // Where to start looking in the src vector
5158            NumElems,          // Number of elements in vector
5159            OpSrc))            // Which source operand ?
5160    return false;
5161
5162  isLeft = false;
5163  ShAmt = NumZeros;
5164  ShVal = SVOp->getOperand(OpSrc);
5165  return true;
5166}
5167
5168/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5169/// logical left shift of a vector.
5170static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5171                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5172  unsigned NumElems =
5173    SVOp->getSimpleValueType(0).getVectorNumElements();
5174  unsigned NumZeros = getNumOfConsecutiveZeros(
5175      SVOp, NumElems, true /* check zeros from left */, DAG,
5176      NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5177  unsigned OpSrc;
5178
5179  if (!NumZeros)
5180    return false;
5181
5182  // Considering the elements in the mask that are not consecutive zeros,
5183  // check if they consecutively come from only one of the source vectors.
5184 // 5185 // 0 { A, B, X, X } = V2 5186 // / \ / / 5187 // vector_shuffle V1, V2 <X, X, 4, 5> 5188 // 5189 if (!isShuffleMaskConsecutive(SVOp, 5190 NumZeros, // Mask Start Index 5191 NumElems, // Mask End Index(exclusive) 5192 0, // Where to start looking in the src vector 5193 NumElems, // Number of elements in vector 5194 OpSrc)) // Which source operand ? 5195 return false; 5196 5197 isLeft = true; 5198 ShAmt = NumZeros; 5199 ShVal = SVOp->getOperand(OpSrc); 5200 return true; 5201} 5202 5203/// isVectorShift - Returns true if the shuffle can be implemented as a 5204/// logical left or right shift of a vector. 5205static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 5206 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 5207 // Although the logic below support any bitwidth size, there are no 5208 // shift instructions which handle more than 128-bit vectors. 5209 if (!SVOp->getSimpleValueType(0).is128BitVector()) 5210 return false; 5211 5212 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 5213 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 5214 return true; 5215 5216 return false; 5217} 5218 5219/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 5220/// 5221static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 5222 unsigned NumNonZero, unsigned NumZero, 5223 SelectionDAG &DAG, 5224 const X86Subtarget* Subtarget, 5225 const TargetLowering &TLI) { 5226 if (NumNonZero > 8) 5227 return SDValue(); 5228 5229 SDLoc dl(Op); 5230 SDValue V(0, 0); 5231 bool First = true; 5232 for (unsigned i = 0; i < 16; ++i) { 5233 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 5234 if (ThisIsNonZero && First) { 5235 if (NumZero) 5236 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 5237 else 5238 V = DAG.getUNDEF(MVT::v8i16); 5239 First = false; 5240 } 5241 5242 if ((i & 1) != 0) { 5243 SDValue ThisElt(0, 0), LastElt(0, 0); 5244 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 5245 if (LastIsNonZero) { 5246 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 5247 MVT::i16, Op.getOperand(i-1)); 5248 } 5249 if (ThisIsNonZero) { 5250 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 5251 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 5252 ThisElt, DAG.getConstant(8, MVT::i8)); 5253 if (LastIsNonZero) 5254 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 5255 } else 5256 ThisElt = LastElt; 5257 5258 if (ThisElt.getNode()) 5259 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 5260 DAG.getIntPtrConstant(i/2)); 5261 } 5262 } 5263 5264 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 5265} 5266 5267/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 5268/// 5269static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 5270 unsigned NumNonZero, unsigned NumZero, 5271 SelectionDAG &DAG, 5272 const X86Subtarget* Subtarget, 5273 const TargetLowering &TLI) { 5274 if (NumNonZero > 4) 5275 return SDValue(); 5276 5277 SDLoc dl(Op); 5278 SDValue V(0, 0); 5279 bool First = true; 5280 for (unsigned i = 0; i < 8; ++i) { 5281 bool isNonZero = (NonZeros & (1 << i)) != 0; 5282 if (isNonZero) { 5283 if (First) { 5284 if (NumZero) 5285 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 5286 else 5287 V = DAG.getUNDEF(MVT::v8i16); 5288 First = false; 5289 } 5290 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 5291 MVT::v8i16, V, Op.getOperand(i), 5292 DAG.getIntPtrConstant(i)); 5293 } 5294 } 5295 5296 return V; 5297} 5298 5299/// getVShift - Return a vector logical shift node. 
5300/// 5301static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 5302 unsigned NumBits, SelectionDAG &DAG, 5303 const TargetLowering &TLI, SDLoc dl) { 5304 assert(VT.is128BitVector() && "Unknown type for VShift"); 5305 EVT ShVT = MVT::v2i64; 5306 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; 5307 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 5308 return DAG.getNode(ISD::BITCAST, dl, VT, 5309 DAG.getNode(Opc, dl, ShVT, SrcOp, 5310 DAG.getConstant(NumBits, 5311 TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); 5312} 5313 5314static SDValue 5315LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { 5316 5317 // Check if the scalar load can be widened into a vector load. And if 5318 // the address is "base + cst" see if the cst can be "absorbed" into 5319 // the shuffle mask. 5320 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 5321 SDValue Ptr = LD->getBasePtr(); 5322 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 5323 return SDValue(); 5324 EVT PVT = LD->getValueType(0); 5325 if (PVT != MVT::i32 && PVT != MVT::f32) 5326 return SDValue(); 5327 5328 int FI = -1; 5329 int64_t Offset = 0; 5330 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 5331 FI = FINode->getIndex(); 5332 Offset = 0; 5333 } else if (DAG.isBaseWithConstantOffset(Ptr) && 5334 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 5335 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 5336 Offset = Ptr.getConstantOperandVal(1); 5337 Ptr = Ptr.getOperand(0); 5338 } else { 5339 return SDValue(); 5340 } 5341 5342 // FIXME: 256-bit vector instructions don't require a strict alignment, 5343 // improve this code to support it better. 5344 unsigned RequiredAlign = VT.getSizeInBits()/8; 5345 SDValue Chain = LD->getChain(); 5346 // Make sure the stack object alignment is at least 16 or 32. 5347 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5348 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 5349 if (MFI->isFixedObjectIndex(FI)) { 5350 // Can't change the alignment. FIXME: It's possible to compute 5351 // the exact stack offset and reference FI + adjust offset instead. 5352 // If someone *really* cares about this. That's the way to implement it. 5353 return SDValue(); 5354 } else { 5355 MFI->setObjectAlignment(FI, RequiredAlign); 5356 } 5357 } 5358 5359 // (Offset % 16 or 32) must be multiple of 4. Then address is then 5360 // Ptr + (Offset & ~15). 5361 if (Offset < 0) 5362 return SDValue(); 5363 if ((Offset % RequiredAlign) & 3) 5364 return SDValue(); 5365 int64_t StartOffset = Offset & ~(RequiredAlign-1); 5366 if (StartOffset) 5367 Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(), 5368 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 5369 5370 int EltNo = (Offset - StartOffset) >> 2; 5371 unsigned NumElems = VT.getVectorNumElements(); 5372 5373 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 5374 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 5375 LD->getPointerInfo().getWithOffset(StartOffset), 5376 false, false, false, 0); 5377 5378 SmallVector<int, 8> Mask; 5379 for (unsigned i = 0; i != NumElems; ++i) 5380 Mask.push_back(EltNo); 5381 5382 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); 5383 } 5384 5385 return SDValue(); 5386} 5387 5388/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 5389/// vector of type 'VT', see if the elements can be replaced by a single large 5390/// load which has the same value as a build_vector whose operands are 'elts'. 
5391///
5392/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
5393///
5394/// FIXME: we'd also like to handle the case where the last elements are zero
5395/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
5396/// There's even a handy isZeroNode for that purpose.
5397static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
5398                                        SDLoc &DL, SelectionDAG &DAG,
5399                                        bool isAfterLegalize) {
5400  EVT EltVT = VT.getVectorElementType();
5401  unsigned NumElems = Elts.size();
5402
5403  LoadSDNode *LDBase = NULL;
5404  unsigned LastLoadedElt = -1U;
5405
5406  // For each element in the initializer, see if we've found a load or an undef.
5407  // If we don't find an initial load element, or later load elements are
5408  // non-consecutive, bail out.
5409  for (unsigned i = 0; i < NumElems; ++i) {
5410    SDValue Elt = Elts[i];
5411
5412    if (!Elt.getNode() ||
5413        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
5414      return SDValue();
5415    if (!LDBase) {
5416      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
5417        return SDValue();
5418      LDBase = cast<LoadSDNode>(Elt.getNode());
5419      LastLoadedElt = i;
5420      continue;
5421    }
5422    if (Elt.getOpcode() == ISD::UNDEF)
5423      continue;
5424
5425    LoadSDNode *LD = cast<LoadSDNode>(Elt);
5426    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
5427      return SDValue();
5428    LastLoadedElt = i;
5429  }
5430
5431  // If we have found an entire vector of loads and undefs, then return a large
5432  // load of the entire vector width starting at the base pointer. If we found
5433  // consecutive loads for the low half, generate a vzext_load node.
5434  if (LastLoadedElt == NumElems - 1) {
5435
5436    if (isAfterLegalize &&
5437        !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
5438      return SDValue();
5439
5440    SDValue NewLd = SDValue();
5441
5442    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
5443      NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5444                          LDBase->getPointerInfo(),
5445                          LDBase->isVolatile(), LDBase->isNonTemporal(),
5446                          LDBase->isInvariant(), 0);
5447    else NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5448                        LDBase->getPointerInfo(),
5449                        LDBase->isVolatile(), LDBase->isNonTemporal(),
5450                        LDBase->isInvariant(), LDBase->getAlignment());
5451
5452    if (LDBase->hasAnyUseOfValue(1)) {
5453      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5454                                     SDValue(LDBase, 1),
5455                                     SDValue(NewLd.getNode(), 1));
5456      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5457      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5458                             SDValue(NewLd.getNode(), 1));
5459    }
5460
5461    return NewLd;
5462  }
5463  if (NumElems == 4 && LastLoadedElt == 1 &&
5464      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
5465    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
5466    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5467    SDValue ResNode =
5468        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
5469                                array_lengthof(Ops), MVT::i64,
5470                                LDBase->getPointerInfo(),
5471                                LDBase->getAlignment(),
5472                                false/*isVolatile*/, true/*ReadMem*/,
5473                                false/*WriteMem*/);
5474
5475    // Make sure the newly-created LOAD is in the same position as LDBase in
5476    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
5477    // update uses of LDBase's output chain to use the TokenFactor.
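    // Roughly, the chain surgery below is:
    //   before: chain users ---> (LDBase, 1)
    //   after:  chain users ---> TokenFactor((LDBase, 1), (ResNode, 1))
    // The UpdateNodeOperands call re-points the TokenFactor's operands back
    // at LDBase/ResNode after the wholesale replacement, avoiding a cycle
    // where the TokenFactor would otherwise feed itself.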
5478 if (LDBase->hasAnyUseOfValue(1)) { 5479 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 5480 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); 5481 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 5482 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 5483 SDValue(ResNode.getNode(), 1)); 5484 } 5485 5486 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 5487 } 5488 return SDValue(); 5489} 5490 5491/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction 5492/// to generate a splat value for the following cases: 5493/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. 5494/// 2. A splat shuffle which uses a scalar_to_vector node which comes from 5495/// a scalar load, or a constant. 5496/// The VBROADCAST node is returned when a pattern is found, 5497/// or SDValue() otherwise. 5498static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, 5499 SelectionDAG &DAG) { 5500 if (!Subtarget->hasFp256()) 5501 return SDValue(); 5502 5503 MVT VT = Op.getSimpleValueType(); 5504 SDLoc dl(Op); 5505 5506 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && 5507 "Unsupported vector type for broadcast."); 5508 5509 SDValue Ld; 5510 bool ConstSplatVal; 5511 5512 switch (Op.getOpcode()) { 5513 default: 5514 // Unknown pattern found. 5515 return SDValue(); 5516 5517 case ISD::BUILD_VECTOR: { 5518 // The BUILD_VECTOR node must be a splat. 5519 if (!isSplatVector(Op.getNode())) 5520 return SDValue(); 5521 5522 Ld = Op.getOperand(0); 5523 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5524 Ld.getOpcode() == ISD::ConstantFP); 5525 5526 // The suspected load node has several users. Make sure that all 5527 // of its users are from the BUILD_VECTOR node. 5528 // Constants may have multiple users. 5529 if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) 5530 return SDValue(); 5531 break; 5532 } 5533 5534 case ISD::VECTOR_SHUFFLE: { 5535 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5536 5537 // Shuffles must have a splat mask where the first element is 5538 // broadcasted. 5539 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 5540 return SDValue(); 5541 5542 SDValue Sc = Op.getOperand(0); 5543 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && 5544 Sc.getOpcode() != ISD::BUILD_VECTOR) { 5545 5546 if (!Subtarget->hasInt256()) 5547 return SDValue(); 5548 5549 // Use the register form of the broadcast instruction available on AVX2. 5550 if (VT.getSizeInBits() >= 256) 5551 Sc = Extract128BitVector(Sc, 0, DAG, dl); 5552 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); 5553 } 5554 5555 Ld = Sc.getOperand(0); 5556 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5557 Ld.getOpcode() == ISD::ConstantFP); 5558 5559 // The scalar_to_vector node and the suspected 5560 // load node must have exactly one user. 5561 // Constants may have multiple users. 5562 5563 // AVX-512 has register version of the broadcast 5564 bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && 5565 Ld.getValueType().getSizeInBits() >= 32; 5566 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && 5567 !hasRegVer)) 5568 return SDValue(); 5569 break; 5570 } 5571 } 5572 5573 bool IsGE256 = (VT.getSizeInBits() >= 256); 5574 5575 // Handle the broadcasting a single constant scalar from the constant pool 5576 // into a vector. On Sandybridge it is still better to load a constant vector 5577 // from the constant pool and not to broadcast it from a scalar. 
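  // For instance, with AVX2 a v8f32 splat of the constant 42.0f becomes,
  // roughly:
  //   t0: f32 = load<constant pool entry for 42.0f>
  //   t1: v8f32 = X86ISD::VBROADCAST t0    // vbroadcastss (mem), %ymm
  // while on AVX1-era Sandy Bridge a full 256-bit constant-pool load of the
  // pre-splatted vector is preferred, as the comment above notes.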
5578 if (ConstSplatVal && Subtarget->hasInt256()) { 5579 EVT CVT = Ld.getValueType(); 5580 assert(!CVT.isVector() && "Must not broadcast a vector type"); 5581 unsigned ScalarSize = CVT.getSizeInBits(); 5582 5583 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { 5584 const Constant *C = 0; 5585 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) 5586 C = CI->getConstantIntValue(); 5587 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) 5588 C = CF->getConstantFPValue(); 5589 5590 assert(C && "Invalid constant type"); 5591 5592 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5593 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 5594 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 5595 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, 5596 MachinePointerInfo::getConstantPool(), 5597 false, false, false, Alignment); 5598 5599 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5600 } 5601 } 5602 5603 bool IsLoad = ISD::isNormalLoad(Ld.getNode()); 5604 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 5605 5606 // Handle AVX2 in-register broadcasts. 5607 if (!IsLoad && Subtarget->hasInt256() && 5608 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) 5609 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5610 5611 // The scalar source must be a normal load. 5612 if (!IsLoad) 5613 return SDValue(); 5614 5615 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) 5616 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5617 5618 // The integer check is needed for the 64-bit into 128-bit so it doesn't match 5619 // double since there is no vbroadcastsd xmm 5620 if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { 5621 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) 5622 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5623 } 5624 5625 // Unsupported broadcast. 5626 return SDValue(); 5627} 5628 5629static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { 5630 MVT VT = Op.getSimpleValueType(); 5631 5632 // Skip if insert_vec_elt is not supported. 5633 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5634 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) 5635 return SDValue(); 5636 5637 SDLoc DL(Op); 5638 unsigned NumElems = Op.getNumOperands(); 5639 5640 SDValue VecIn1; 5641 SDValue VecIn2; 5642 SmallVector<unsigned, 4> InsertIndices; 5643 SmallVector<int, 8> Mask(NumElems, -1); 5644 5645 for (unsigned i = 0; i != NumElems; ++i) { 5646 unsigned Opc = Op.getOperand(i).getOpcode(); 5647 5648 if (Opc == ISD::UNDEF) 5649 continue; 5650 5651 if (Opc != ISD::EXTRACT_VECTOR_ELT) { 5652 // Quit if more than 1 elements need inserting. 5653 if (InsertIndices.size() > 1) 5654 return SDValue(); 5655 5656 InsertIndices.push_back(i); 5657 continue; 5658 } 5659 5660 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); 5661 SDValue ExtIdx = Op.getOperand(i).getOperand(1); 5662 5663 // Quit if extracted from vector of different type. 5664 if (ExtractedFromVec.getValueType() != VT) 5665 return SDValue(); 5666 5667 // Quit if non-constant index. 
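    // A worked example of what this routine collects: for a v4i32
    //   <extract(V, 2), extract(V, 3), s, extract(V, 1)>
    // (s being any non-extracted scalar) it records Mask = <2, 3, -1, 1>
    // and InsertIndices = {2}, producing one vector_shuffle followed by a
    // single INSERT_VECTOR_ELT for s.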
5668    if (!isa<ConstantSDNode>(ExtIdx))
5669      return SDValue();
5670
5671    if (VecIn1.getNode() == 0)
5672      VecIn1 = ExtractedFromVec;
5673    else if (VecIn1 != ExtractedFromVec) {
5674      if (VecIn2.getNode() == 0)
5675        VecIn2 = ExtractedFromVec;
5676      else if (VecIn2 != ExtractedFromVec)
5677        // Quit if more than 2 vectors to shuffle
5678        return SDValue();
5679    }
5680
5681    unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5682
5683    if (ExtractedFromVec == VecIn1)
5684      Mask[i] = Idx;
5685    else if (ExtractedFromVec == VecIn2)
5686      Mask[i] = Idx + NumElems;
5687  }
5688
5689  if (VecIn1.getNode() == 0)
5690    return SDValue();
5691
5692  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5693  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
5694  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
5695    unsigned Idx = InsertIndices[i];
5696    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
5697                     DAG.getIntPtrConstant(Idx));
5698  }
5699
5700  return NV;
5701}
5702
5703// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
5704SDValue
5705X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
5706
5707  MVT VT = Op.getSimpleValueType();
5708  assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
5709         "Unexpected type in LowerBUILD_VECTORvXi1!");
5710
5711  SDLoc dl(Op);
5712  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5713    SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
5714    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5715                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5716    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
5717                       Ops, VT.getVectorNumElements());
5718  }
5719
5720  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
5721    SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
5722    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5723                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5724    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
5725                       Ops, VT.getVectorNumElements());
5726  }
5727
5728  bool AllConstants = true;
5729  uint64_t Immediate = 0;
5730  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
5731    SDValue In = Op.getOperand(idx);
5732    if (In.getOpcode() == ISD::UNDEF)
5733      continue;
5734    if (!isa<ConstantSDNode>(In)) {
5735      AllConstants = false;
5736      break;
5737    }
5738    if (cast<ConstantSDNode>(In)->getZExtValue())
5739      Immediate |= (1ULL << idx);
5740  }
5741
5742  if (AllConstants) {
5743    SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
5744                                   DAG.getConstant(Immediate, MVT::i16));
5745    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
5746                       DAG.getIntPtrConstant(0));
5747  }
5748
5749  // Splat vector (with undefs).
5750  SDValue In = Op.getOperand(0);
5751  for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) {
5752    if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF)
5753      llvm_unreachable("Unsupported predicate operation");
5754  }
5755
5756  SDValue EFLAGS, X86CC;
5757  if (In.getOpcode() == ISD::SETCC) {
5758    SDValue Op0 = In.getOperand(0);
5759    SDValue Op1 = In.getOperand(1);
5760    ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get();
5761    bool isFP = Op1.getValueType().isFloatingPoint();
5762    unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
5763
5764    assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation");
5765
5766    X86CC = DAG.getConstant(X86CCVal, MVT::i8);
5767    EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG);
5768    EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
5769  } else if
(In.getOpcode() == X86ISD::SETCC) { 5770 X86CC = In.getOperand(0); 5771 EFLAGS = In.getOperand(1); 5772 } else { 5773 // The algorithm: 5774 // Bit1 = In & 0x1 5775 // if (Bit1 != 0) 5776 // ZF = 0 5777 // else 5778 // ZF = 1 5779 // if (ZF == 0) 5780 // res = allOnes ### CMOVNE -1, %res 5781 // else 5782 // res = allZero 5783 MVT InVT = In.getSimpleValueType(); 5784 SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT)); 5785 EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG); 5786 X86CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5787 } 5788 5789 if (VT == MVT::v16i1) { 5790 SDValue Cst1 = DAG.getConstant(-1, MVT::i16); 5791 SDValue Cst0 = DAG.getConstant(0, MVT::i16); 5792 SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16, 5793 Cst0, Cst1, X86CC, EFLAGS); 5794 return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); 5795 } 5796 5797 if (VT == MVT::v8i1) { 5798 SDValue Cst1 = DAG.getConstant(-1, MVT::i32); 5799 SDValue Cst0 = DAG.getConstant(0, MVT::i32); 5800 SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32, 5801 Cst0, Cst1, X86CC, EFLAGS); 5802 CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp); 5803 return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); 5804 } 5805 llvm_unreachable("Unsupported predicate operation"); 5806} 5807 5808SDValue 5809X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5810 SDLoc dl(Op); 5811 5812 MVT VT = Op.getSimpleValueType(); 5813 MVT ExtVT = VT.getVectorElementType(); 5814 unsigned NumElems = Op.getNumOperands(); 5815 5816 // Generate vectors for predicate vectors. 5817 if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) 5818 return LowerBUILD_VECTORvXi1(Op, DAG); 5819 5820 // Vectors containing all zeros can be matched by pxor and xorps later 5821 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 5822 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 5823 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 5824 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) 5825 return Op; 5826 5827 return getZeroVector(VT, Subtarget, DAG, dl); 5828 } 5829 5830 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 5831 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use 5832 // vpcmpeqd on 256-bit vectors. 5833 if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { 5834 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) 5835 return Op; 5836 5837 if (!VT.is512BitVector()) 5838 return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); 5839 } 5840 5841 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); 5842 if (Broadcast.getNode()) 5843 return Broadcast; 5844 5845 unsigned EVTBits = ExtVT.getSizeInBits(); 5846 5847 unsigned NumZero = 0; 5848 unsigned NumNonZero = 0; 5849 unsigned NonZeros = 0; 5850 bool IsAllConstants = true; 5851 SmallSet<SDValue, 8> Values; 5852 for (unsigned i = 0; i < NumElems; ++i) { 5853 SDValue Elt = Op.getOperand(i); 5854 if (Elt.getOpcode() == ISD::UNDEF) 5855 continue; 5856 Values.insert(Elt); 5857 if (Elt.getOpcode() != ISD::Constant && 5858 Elt.getOpcode() != ISD::ConstantFP) 5859 IsAllConstants = false; 5860 if (X86::isZeroNode(Elt)) 5861 NumZero++; 5862 else { 5863 NonZeros |= (1 << i); 5864 NumNonZero++; 5865 } 5866 } 5867 5868 // All undef vector. Return an UNDEF. All zero vectors were handled above. 5869 if (NumNonZero == 0) 5870 return DAG.getUNDEF(VT); 5871 5872 // Special case for single non-zero, non-undef, element. 
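  // At this point, for a build_vector such as <0, x, 0, y> the bookkeeping
  // above has NumZero = 2, NumNonZero = 2, NonZeros = 0b1010, and Values
  // holding the distinct non-undef operands.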
5873 if (NumNonZero == 1) { 5874 unsigned Idx = countTrailingZeros(NonZeros); 5875 SDValue Item = Op.getOperand(Idx); 5876 5877 // If this is an insertion of an i64 value on x86-32, and if the top bits of 5878 // the value are obviously zero, truncate the value to i32 and do the 5879 // insertion that way. Only do this if the value is non-constant or if the 5880 // value is a constant being inserted into element 0. It is cheaper to do 5881 // a constant pool load than it is to do a movd + shuffle. 5882 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 5883 (!IsAllConstants || Idx == 0)) { 5884 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 5885 // Handle SSE only. 5886 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 5887 EVT VecVT = MVT::v4i32; 5888 unsigned VecElts = 4; 5889 5890 // Truncate the value (which may itself be a constant) to i32, and 5891 // convert it to a vector with movd (S2V+shuffle to zero extend). 5892 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 5893 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 5894 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5895 5896 // Now we have our 32-bit value zero extended in the low element of 5897 // a vector. If Idx != 0, swizzle it into place. 5898 if (Idx != 0) { 5899 SmallVector<int, 4> Mask; 5900 Mask.push_back(Idx); 5901 for (unsigned i = 1; i != VecElts; ++i) 5902 Mask.push_back(i); 5903 Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), 5904 &Mask[0]); 5905 } 5906 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5907 } 5908 } 5909 5910 // If we have a constant or non-constant insertion into the low element of 5911 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 5912 // the rest of the elements. This will be matched as movd/movq/movss/movsd 5913 // depending on what the source datatype is. 5914 if (Idx == 0) { 5915 if (NumZero == 0) 5916 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5917 5918 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 5919 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 5920 if (VT.is256BitVector() || VT.is512BitVector()) { 5921 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); 5922 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, 5923 Item, DAG.getIntPtrConstant(0)); 5924 } 5925 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5926 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5927 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 5928 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5929 } 5930 5931 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 5932 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 5933 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 5934 if (VT.is256BitVector()) { 5935 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); 5936 Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); 5937 } else { 5938 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5939 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5940 } 5941 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5942 } 5943 } 5944 5945 // Is it a vector logical left shift? 
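    // For example, build_vector <0, x> : v2i64 matches here: the value is
    // scalar_to_vector'd and shifted left by 64 bits (a pslldq $8, roughly),
    // which moves x into the high element and zero-fills the low one.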
5946 if (NumElems == 2 && Idx == 1 && 5947 X86::isZeroNode(Op.getOperand(0)) && 5948 !X86::isZeroNode(Op.getOperand(1))) { 5949 unsigned NumBits = VT.getSizeInBits(); 5950 return getVShift(true, VT, 5951 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5952 VT, Op.getOperand(1)), 5953 NumBits/2, DAG, *this, dl); 5954 } 5955 5956 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 5957 return SDValue(); 5958 5959 // Otherwise, if this is a vector with i32 or f32 elements, and the element 5960 // is a non-constant being inserted into an element other than the low one, 5961 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 5962 // movd/movss) to move this into the low element, then shuffle it into 5963 // place. 5964 if (EVTBits == 32) { 5965 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5966 5967 // Turn it into a shuffle of zero and zero-extended scalar to vector. 5968 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); 5969 SmallVector<int, 8> MaskVec; 5970 for (unsigned i = 0; i != NumElems; ++i) 5971 MaskVec.push_back(i == Idx ? 0 : 1); 5972 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 5973 } 5974 } 5975 5976 // Splat is obviously ok. Let legalizer expand it to a shuffle. 5977 if (Values.size() == 1) { 5978 if (EVTBits == 32) { 5979 // Instead of a shuffle like this: 5980 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 5981 // Check if it's possible to issue this instead. 5982 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 5983 unsigned Idx = countTrailingZeros(NonZeros); 5984 SDValue Item = Op.getOperand(Idx); 5985 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 5986 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 5987 } 5988 return SDValue(); 5989 } 5990 5991 // A vector full of immediates; various special cases are already 5992 // handled, so this is best done with a single constant-pool load. 5993 if (IsAllConstants) 5994 return SDValue(); 5995 5996 // For AVX-length vectors, build the individual 128-bit pieces and use 5997 // shuffles to put them in place. 5998 if (VT.is256BitVector()) { 5999 SmallVector<SDValue, 32> V; 6000 for (unsigned i = 0; i != NumElems; ++i) 6001 V.push_back(Op.getOperand(i)); 6002 6003 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 6004 6005 // Build both the lower and upper subvector. 6006 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 6007 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 6008 NumElems/2); 6009 6010 // Recreate the wider vector with the lower and upper part. 6011 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 6012 } 6013 6014 // Let legalizer expand 2-wide build_vectors. 6015 if (EVTBits == 64) { 6016 if (NumNonZero == 1) { 6017 // One half is zero or undef. 6018 unsigned Idx = countTrailingZeros(NonZeros); 6019 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 6020 Op.getOperand(Idx)); 6021 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); 6022 } 6023 return SDValue(); 6024 } 6025 6026 // If element VT is < 32 bits, convert it to inserts into a zero vector. 
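  // LowerBuildVectorv16i8 below first pairs adjacent bytes, e.g. for lanes
  // 0 and 1 it forms ((zext b1 to i16) << 8) | (zext b0 to i16) and emits
  // one v8i16 INSERT_VECTOR_ELT (pinsrw) per pair instead of two byte
  // insertions.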
  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      Subtarget, *this);
    if (V.getNode()) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      Subtarget, *this);
    if (V.getNode()) return V;
  }

  // If element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDValue, 8> V(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
          break;
        case 2:
          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
        case 3:
          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
      }
    }

    bool Reverse1 = (NonZeros & 0x3) == 2;
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
  }

  if (Values.size() > 1 && VT.is128BitVector()) {
    // Check for a build vector of consecutive loads.
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = Op.getOperand(i);

    // Check for elements which are consecutive loads.
    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
    if (LD.getNode())
      return LD;

    // Check for a build vector from mostly shuffle plus few inserting.
    SDValue Sh = buildFromShuffleMostly(Op, DAG);
    if (Sh.getNode())
      return Sh;

    // For SSE 4.1, use insertps to put the high elements into the low
    // element.
    if (getSubtarget()->hasSSE41()) {
      SDValue Result;
      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
      else
        Result = DAG.getUNDEF(VT);

      for (unsigned i = 1; i < NumElems; ++i) {
        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                             Op.getOperand(i), DAG.getIntPtrConstant(i));
      }
      return Result;
    }

    // Otherwise, expand into a number of unpckl*, start by extending each
    // of our (non-undef) elements to the full vector width with the element
    // in the bottom slot of the vector (which generates no code for SSE).
    for (unsigned i = 0; i < NumElems; ++i) {
      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      else
        V[i] = DAG.getUNDEF(VT);
    }
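    // At this point (illustrative sketch): for operands <a, b, c, d>, each
    // defined V[i] is a full-width vector with operand i in lane 0, e.g.
    // V[2] == <c, ?, ?, ?>.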
    // Next, we iteratively mix elements, e.g. for v4f32:
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    unsigned EltStride = NumElems >> 1;
    while (EltStride != 0) {
      for (unsigned i = 0; i < EltStride; ++i) {
        // If V[i+EltStride] is undef and this is the first round of mixing,
        // then it is safe to just drop this shuffle: V[i] is already in the
        // right place, the one element (since it's the first round) being
        // inserted as undef can be dropped.  This isn't safe for successive
        // rounds because they will permute elements within both vectors.
        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
            EltStride == NumElems/2)
          continue;

        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
      }
      EltStride >>= 1;
    }
    return V[0];
  }
  return SDValue();
}

// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  if (ResVT.is256BitVector())
    return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

  return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}

static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 2);

  // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit
  // vectors from two other 128-bit ones.
  return LowerAVXCONCAT_VECTORS(Op, DAG);
}

// Try to lower a shuffle node into a simple blend instruction.
static SDValue
LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
                           const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  MVT VT = SVOp->getSimpleValueType(0);
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  // There is no blend with immediate in AVX-512.
  if (VT.is512BitVector())
    return SDValue();

  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
    return SDValue();
  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
    return SDValue();

  // Check the mask for BLEND and build the value.
  unsigned MaskValue = 0;
  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
  unsigned NumLanes = (NumElems-1)/8 + 1;
  unsigned NumElemsInLane = NumElems / NumLanes;

  // Blend for v16i16 should be symmetric for both lanes.
  for (unsigned i = 0; i < NumElemsInLane; ++i) {

    int SndLaneEltIdx = (NumLanes == 2) ?
      SVOp->getMaskElt(i + NumElemsInLane) : -1;
    int EltIdx = SVOp->getMaskElt(i);

    if ((EltIdx < 0 || EltIdx == (int)i) &&
        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
      continue;

    if (((unsigned)EltIdx == (i + NumElems)) &&
        (SndLaneEltIdx < 0 ||
         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
      MaskValue |= (1<<i);
    else
      return SDValue();
  }

  // Convert i32 vectors to floating point if it is not AVX2.
  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
  MVT BlendVT = VT;
  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
                               NumElems);
    V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
  }

  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
                            DAG.getConstant(MaskValue, MVT::i32));
  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
}
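// Example for the blend lowering above (illustrative): shuffling two v4i32
// vectors with mask <0, 5, 6, 3> keeps elements 0 and 3 from V1 and takes
// elements 1 and 2 from V2, so the loop produces MaskValue == 0b0110, the
// immediate used by blendps/vpblendd (a set bit selects the second source).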
// v8i16 shuffles - Prefer shuffles in the following order:
//  1. [all]   pshuflw, pshufhw, optional move
//  2. [ssse3] 1 x pshufb
//  3. [ssse3] 2 x pshufb + 1 x por
//  4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
static SDValue
LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
                         SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  SmallVector<int, 8> MaskVals;

  // Determine if more than 1 of the words in each of the low and high
  // quadwords of the result come from the same quadword of one of the two
  // inputs.  Undef mask values count as coming from any quadword, for
  // better codegen.
  unsigned LoQuad[] = { 0, 0, 0, 0 };
  unsigned HiQuad[] = { 0, 0, 0, 0 };
  std::bitset<4> InputQuads;
  for (unsigned i = 0; i < 8; ++i) {
    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
    int EltIdx = SVOp->getMaskElt(i);
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0) {
      ++Quad[0];
      ++Quad[1];
      ++Quad[2];
      ++Quad[3];
      continue;
    }
    ++Quad[EltIdx / 4];
    InputQuads.set(EltIdx / 4);
  }

  int BestLoQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LoQuad[i] > MaxQuad) {
      BestLoQuad = i;
      MaxQuad = LoQuad[i];
    }
  }

  int BestHiQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HiQuad[i] > MaxQuad) {
      BestHiQuad = i;
      MaxQuad = HiQuad[i];
    }
  }

  // For SSSE3, if all 8 words of the result come from only 1 quadword of
  // each of the two input vectors, shuffle them into one input vector so
  // only a single pshufb instruction is necessary.  If there are more than
  // 2 input quads, disable the next transformation since it does not help
  // SSSE3.
  bool V1Used = InputQuads[0] || InputQuads[1];
  bool V2Used = InputQuads[2] || InputQuads[3];
  if (Subtarget->hasSSSE3()) {
    if (InputQuads.count() == 2 && V1Used && V2Used) {
      BestLoQuad = InputQuads[0] ? 0 : 1;
      BestHiQuad = InputQuads[2] ? 2 : 3;
    }
    if (InputQuads.count() > 2) {
      BestLoQuad = -1;
      BestHiQuad = -1;
    }
  }

  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and
  // update the shuffle mask.  If a quad is scored as -1, that means that it
  // contains words from all 4 input quadwords.
  SDValue NewV;
  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
    int MaskV[] = {
      BestLoQuad < 0 ? 0 : BestLoQuad,
      BestHiQuad < 0 ? 1 : BestHiQuad
    };
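    // Illustrative example: with BestLoQuad == 2 and BestHiQuad == 0, the
    // v2i64 mask is <2, 0>: the new low quadword is V2's low quadword and
    // the new high quadword is V1's low quadword (indices 0-1 address V1's
    // quadwords, 2-3 address V2's).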
    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);

    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all
    // the source words for the shuffle, to aid later transformations.
    bool AllWordsInNewV = true;
    bool InOrder[2] = { true, true };
    for (unsigned i = 0; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx != (int)i)
        InOrder[i/4] = false;
      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
        continue;
      AllWordsInNewV = false;
      break;
    }

    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
    if (AllWordsInNewV) {
      for (int i = 0; i != 8; ++i) {
        int idx = MaskVals[i];
        if (idx < 0)
          continue;
        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
        if ((idx != i) && idx < 4)
          pshufhw = false;
        if ((idx != i) && idx > 3)
          pshuflw = false;
      }
      V1 = NewV;
      V2Used = false;
      BestLoQuad = 0;
      BestHiQuad = 1;
    }

    // If we've eliminated the use of V2, and the new mask is a pshuflw or
    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
      unsigned TargetMask = 0;
      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
                             getShufflePSHUFLWImmediate(SVOp);
      V1 = NewV.getOperand(0);
      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
    }
  }

  // Promote splats to a larger type which usually leads to more efficient
  // code.  FIXME: Is this true if pshufb is available?
  if (SVOp->isSplat())
    return PromoteSplat(SVOp, DAG);

  // If we have SSSE3, and all words of the result are from 1 input vector,
  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
  // is present, fall back to case 4.
  if (Subtarget->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If we have elements from both input vectors, set the high bit of the
    // shuffle mask element to zero out elements that come from V2 in the V1
    // mask, and elements that come from V1 in the V2 mask, so that the two
    // results can be OR'd together.
    bool TwoInputs = V1Used && V2Used;
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
      int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
    }
    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
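    // Illustrative example: for word mask element MaskVals[i] == 9 (bytes
    // 18-19 of the concatenated inputs, i.e. from V2), the V1 control above
    // holds 0x80, 0x80 in that byte pair; pshufb then writes zeros there,
    // so the OR below keeps only V2's contribution for that word.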
    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
      int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
    }
    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  }

  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in
  // order, and update MaskVals with new element order.
  std::bitset<8> InOrder;
  if (BestLoQuad >= 0) {
    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
    for (int i = 0; i != 4; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        InOrder.set(i);
      } else if ((idx / 4) == BestLoQuad) {
        MaskV[i] = idx & 3;
        InOrder.set(i);
      }
    }
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);

    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
                                  NewV.getOperand(0),
                                  getShufflePSHUFLWImmediate(SVOp), DAG);
    }
  }

  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in
  // order, and update MaskVals with the new element order.
  if (BestHiQuad >= 0) {
    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
    for (unsigned i = 4; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        InOrder.set(i);
      } else if ((idx / 4) == BestHiQuad) {
        MaskV[i] = (idx & 3) + 4;
        InOrder.set(i);
      }
    }
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);

    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
                                  NewV.getOperand(0),
                                  getShufflePSHUFHWImmediate(SVOp), DAG);
    }
  }

  // In case BestHiQuad & BestLoQuad were both -1, which means each quadword
  // has a word from each of the four input quadwords, calculate the InOrder
  // bitvector now before falling through to the insert/extract cleanup.
  if (BestLoQuad == -1 && BestHiQuad == -1) {
    NewV = V1;
    for (int i = 0; i != 8; ++i)
      if (MaskVals[i] < 0 || MaskVals[i] == i)
        InOrder.set(i);
  }

  // The other elements are put in the right place using pextrw and pinsrw.
  for (unsigned i = 0; i != 8; ++i) {
    if (InOrder[i])
      continue;
    int EltIdx = MaskVals[i];
    if (EltIdx < 0)
      continue;
    SDValue ExtOp = (EltIdx < 8) ?
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
                  DAG.getIntPtrConstant(EltIdx)) :
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
                  DAG.getIntPtrConstant(EltIdx - 8));
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                       DAG.getIntPtrConstant(i));
  }
  return NewV;
}
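// Example for the v8i16 lowering above (illustrative): the mask
// <0, 2, 1, 3, 4, 5, 6, 7> with an undef V2 only permutes the low quadword,
// so a single pshuflw implements it; the immediate packs the word sources
// two bits each, low word in the low bits, giving 0xD8 here.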
// v16i8 shuffles - Prefer shuffles in the following order:
//  1. [ssse3] 1 x pshufb
//  2. [ssse3] 2 x pshufb + 1 x por
//  3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
                                        const X86Subtarget* Subtarget,
                                        SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  ArrayRef<int> MaskVals = SVOp->getMask();

  // Promote splats to a larger type which usually leads to more efficient
  // code.  FIXME: Is this true if pshufb is available?
  if (SVOp->isSplat())
    return PromoteSplat(SVOp, DAG);

  // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
  // present, fall back to case 3.

  // If SSSE3, use 1 pshufb instruction per vector with elements in the
  // result.
  if (Subtarget->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If all result elements are from one input vector, then only translate
    // undef mask values to 0x80 (zero out result) in the pshufb mask.
    //
    // Otherwise, we have elements from both input vectors, and must zero
    // out elements that come from V2 in the first mask, and V1 in the
    // second mask, so that we can OR them together.
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 0 || EltIdx >= 16)
        EltIdx = 0x80;
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));

    // As PSHUFB will zero elements with negative indices, it's safe to
    // ignore the 2nd operand if it's undefined or zero.
    if (V2.getOpcode() == ISD::UNDEF ||
        ISD::isBuildVectorAllZeros(V2.getNode()))
      return V1;

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
  }

  // No SSSE3 - calculate the in-place words, then fix all out-of-place
  // words with 0-16 extracts & inserts.  Worst case is 16 bytes out of
  // order from the 16 different words that comprise the two doublequadword
  // input vectors.
  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
  SDValue NewV = V1;
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];

    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;

    // This word of the result is already in the correct place, skip it.
    if ((Elt0 == i*2) && (Elt1 == i*2+1))
      continue;

    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;
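    // Illustrative example: mask bytes (Elt0, Elt1) == (4, 5) form word 2
    // of V1, so the fast path below moves the whole word with one extract
    // and one insert, with no shifting or masking.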
    // If Elt0 and Elt1 are defined and consecutive, the whole word can be
    // moved with a single extract and insert.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                         DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source.  If the
    // source byte is not also odd, shift the extracted word left 8 bits,
    // otherwise clear the bottom 8 bits if we need to do an or.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8,
                                 TLI.getShiftAmountTy(InsElt.getValueType())));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
    // If Elt0 is defined, extract it from the appropriate source.  If the
    // source byte is not also even, shift the extracted word right 8 bits.
    // If Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8,
                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                              DAG.getConstant(0x00FF, MVT::i16));
      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                         : InsElt0;
    }
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                       DAG.getIntPtrConstant(i));
  }
  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
}
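// Example for the v16i8 lowering above (illustrative): the byte-reverse
// mask <15, 14, ..., 0> draws only from V1, so case 1 applies and a single
// pshufb with control bytes 15, 14, ..., 0 implements it; an undef mask
// element would get 0x80 instead, which makes pshufb write a zero byte.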
// v32i8 shuffles - Translate to VPSHUFB if possible.
static
SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
                                 const X86Subtarget *Subtarget,
                                 SelectionDAG &DAG) {
  MVT VT = SVOp->getSimpleValueType(0);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());

  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());

  // VPSHUFB may be generated if
  // (1) one of the input vectors is undefined or zeroinitializer.
  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
  // And (2) the mask indexes don't cross the 128-bit lane.
  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
    return SDValue();

  if (V1IsAllZero && !V2IsAllZero) {
    CommuteVectorShuffleMask(MaskVals, 32);
    V1 = V2;
  }
  SmallVector<SDValue, 32> pshufbMask;
  for (unsigned i = 0; i != 32; i++) {
    int EltIdx = MaskVals[i];
    if (EltIdx < 0 || EltIdx >= 32)
      EltIdx = 0x80;
    else {
      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
        // Cross lane is not allowed.
        return SDValue();
      EltIdx &= 0xf;
    }
    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
  }
  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v32i8, &pshufbMask[0], 32));
}

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4
/// wide ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible.  This
/// can be done when every pair / quad of shuffle mask elements point to
/// elements in the right sequence, e.g.
///   vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG) {
  MVT VT = SVOp->getSimpleValueType(0);
  SDLoc dl(SVOp);
  unsigned NumElems = VT.getVectorNumElements();
  MVT NewVT;
  unsigned Scale;
  switch (VT.SimpleTy) {
  default: llvm_unreachable("Unexpected!");
  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
  }

  SmallVector<int, 8> MaskVec;
  for (unsigned i = 0; i != NumElems; i += Scale) {
    int StartIdx = -1;
    for (unsigned j = 0; j != Scale; ++j) {
      int EltIdx = SVOp->getMaskElt(i+j);
      if (EltIdx < 0)
        continue;
      if (StartIdx < 0)
        StartIdx = (EltIdx / Scale);
      if (EltIdx != (int)(StartIdx*Scale + j))
        return SDValue();
    }
    MaskVec.push_back(StartIdx);
  }

  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}
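// Worked example for RewriteAsNarrowerShuffle (illustrative): the v8i16
// mask <2, 3, 10, 11, 0, 1, 14, 15> groups into pairs that each start at
// an even element, so it is rewritten as the v4i32 shuffle <1, 5, 0, 7> on
// bitcast operands.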
/// getVZextMovL - Return a zero-extending vector move low node.
///
static SDValue getVZextMovL(MVT VT, MVT OpVT,
                            SDValue SrcOp, SelectionDAG &DAG,
                            const X86Subtarget *Subtarget, SDLoc dl) {
  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
    LoadSDNode *LD = NULL;
    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
      LD = dyn_cast<LoadSDNode>(SrcOp);
    if (!LD) {
      // movssrr and movsdrr do not clear top bits.  Try to use movd, movq
      // instead.
      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
        // PR2108
        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
        return DAG.getNode(ISD::BITCAST, dl, VT,
                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   OpVT,
                                                   SrcOp.getOperand(0)
                                                        .getOperand(0))));
      }
    }
  }

  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                 DAG.getNode(ISD::BITCAST, dl,
                                             OpVT, SrcOp)));
}

/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles that
/// could not be matched by any known target specific shuffle.
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {

  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
  if (NewOp.getNode())
    return NewOp;

  MVT VT = SVOp->getSimpleValueType(0);

  unsigned NumElems = VT.getVectorNumElements();
  unsigned NumLaneElems = NumElems / 2;

  SDLoc dl(SVOp);
  MVT EltVT = VT.getVectorElementType();
  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
  SDValue Output[2];

  SmallVector<int, 16> Mask;
  for (unsigned l = 0; l < 2; ++l) {
    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with UseBuildVector set.
    bool UseBuildVector = false;
    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
    unsigned LaneStart = l * NumLaneElems;
    for (unsigned i = 0; i != NumLaneElems; ++i) {
      // The mask element.  This indexes into the input.
      int Idx = SVOp->getMaskElt(i+LaneStart);
      if (Idx < 0) {
        // The mask element does not index into any input vector.
        Mask.push_back(-1);
        continue;
      }

      // The input vector this mask element indexes into.
      int Input = Idx / NumLaneElems;

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NumLaneElems;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input)
          // This input vector is already an operand.
          break;
        if (InputUsed[OpNo] < 0) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= array_lengthof(InputUsed)) {
        // More than two input vectors used!  Give up on trying to create a
        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Mask.push_back(Idx + OpNo * NumLaneElems);
    }

    if (UseBuildVector) {
      SmallVector<SDValue, 16> SVOps;
      for (unsigned i = 0; i != NumLaneElems; ++i) {
        // The mask element.  This indexes into the input.
        int Idx = SVOp->getMaskElt(i+LaneStart);
        if (Idx < 0) {
          SVOps.push_back(DAG.getUNDEF(EltVT));
          continue;
        }

        // The input vector this mask element indexes into.
        int Input = Idx / NumElems;

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NumElems;

        // Extract the vector element by hand.
        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                    SVOp->getOperand(Input),
                                    DAG.getIntPtrConstant(Idx)));
      }
      // Construct the output using a BUILD_VECTOR.
      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
                              SVOps.size());
    } else if (InputUsed[0] < 0) {
      // No input vectors were used!  The result is undefined.
      Output[l] = DAG.getUNDEF(NVT);
    } else {
      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
                                        (InputUsed[0] % 2) * NumLaneElems,
                                        DAG, dl);
      // If only one input was used, use an undefined vector for the other.
      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
      // At least one input vector was used.  Create a new shuffle vector.
      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
    }

    Mask.clear();
  }

  // Concatenate the result back.
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
}

/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
/// 4 elements, and match them with several different shuffle types.
static SDValue
LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  MVT VT = SVOp->getSimpleValueType(0);

  assert(VT.is128BitVector() && "Unsupported vector size");

  std::pair<int, int> Locs[4];
  int Mask1[] = { -1, -1, -1, -1 };
  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());

  unsigned NumHi = 0;
  unsigned NumLo = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else {
      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
      if (Idx < 4) {
        Locs[i] = std::make_pair(0, NumLo);
        Mask1[NumLo] = Idx;
        NumLo++;
      } else {
        Locs[i] = std::make_pair(1, NumHi);
        if (2+NumHi < 4)
          Mask1[2+NumHi] = Idx;
        NumHi++;
      }
    }
  }

  if (NumLo <= 2 && NumHi <= 2) {
    // If no more than two elements come from either vector, this can be
    // implemented with two shuffles.  The first shuffle gathers the
    // elements.  The second shuffle, which takes the first shuffle as both
    // of its vector operands, puts the elements into the right order.
    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    int Mask2[] = { -1, -1, -1, -1 };

    for (unsigned i = 0; i != 4; ++i)
      if (Locs[i].first != -1) {
        unsigned Idx = (i < 2) ? 0 : 4;
        Idx += Locs[i].first * 2 + Locs[i].second;
        Mask2[i] = Idx;
      }

    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
  }

  if (NumLo == 3 || NumHi == 3) {
    // Otherwise, we must have three elements from one vector, call it X,
    // and one element from the other, call it Y.  First, use a shufps to
    // build an intermediate vector with the one element from Y and the
    // element from X that will be in the same half in the final destination
    // (the indexes don't matter).  Then, use a shufps to build the final
    // vector, taking the half containing the element from Y from the
    // intermediate, and the other half from X.
    if (NumHi == 3) {
      // Normalize it so the 3 elements come from V1.
      CommuteVectorShuffleMask(PermMask, 4);
      std::swap(V1, V2);
    }
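    // Illustrative example: mask <4, 1, 2, 3> has three elements from V1
    // and one from V2, so no commute is needed and the search below finds
    // the lone V2 element at HiIndex == 0.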
    // Find the element from V2.
    unsigned HiIndex;
    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
      int Val = PermMask[HiIndex];
      if (Val < 0)
        continue;
      if (Val >= 4)
        break;
    }

    Mask1[0] = PermMask[HiIndex];
    Mask1[1] = -1;
    Mask1[2] = PermMask[HiIndex^1];
    Mask1[3] = -1;
    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    if (HiIndex >= 2) {
      Mask1[0] = PermMask[0];
      Mask1[1] = PermMask[1];
      Mask1[2] = HiIndex & 1 ? 6 : 4;
      Mask1[3] = HiIndex & 1 ? 4 : 6;
      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
    }

    Mask1[0] = HiIndex & 1 ? 2 : 0;
    Mask1[1] = HiIndex & 1 ? 0 : 2;
    Mask1[2] = PermMask[2];
    Mask1[3] = PermMask[3];
    if (Mask1[2] >= 0)
      Mask1[2] += 4;
    if (Mask1[3] >= 0)
      Mask1[3] += 4;
    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
  int LoMask[] = { -1, -1, -1, -1 };
  int HiMask[] = { -1, -1, -1, -1 };

  int *MaskPtr = LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      MaskPtr = HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (Idx < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      MaskPtr[LoIdx] = Idx;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      MaskPtr[HiIdx] = Idx;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
  int MaskOps[] = { -1, -1, -1, -1 };
  for (unsigned i = 0; i != 4; ++i)
    if (Locs[i].first != -1)
      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
}

static bool MayFoldVectorLoad(SDValue V) {
  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);

  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    V = V.getOperand(0);
  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
    // BUILD_VECTOR (load), undef
    V = V.getOperand(0);

  return MayFoldLoad(V);
}
static
SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Canonicalize to v2f64.
  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
                                          V1, DAG));
}

static
SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
                        bool HasSSE2) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();

  assert(VT != MVT::v2i64 && "unsupported shuffle type");

  if (HasSSE2 && VT == MVT::v2f64)
    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);

  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
}

static
SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();

  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
         "unsupported shuffle type");

  if (V2.getOpcode() == ISD::UNDEF)
    V2 = V1;

  // v4i32 or v4f32
  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
}

static
SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  unsigned NumElems = VT.getVectorNumElements();

  // Use MOVLPS and MOVLPD in case V1 or V2 are loads.  During isel, the
  // second operand of these instructions is only memory, so check if
  // there's a potential load folding here, otherwise use SHUFPS or MOVSD
  // to match the same masks.
  bool CanFoldLoad = false;

  // Trivial case, when V2 comes from a load.
  if (MayFoldVectorLoad(V2))
    CanFoldLoad = true;

  // When V1 is a load, it can be folded later into a store in isel, example:
  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
  // turns into:
  //  (MOVLPSmr addr:$src1, VR128:$src2)
  // So, recognize this potential and also use MOVLPS or MOVLPD.
  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
    CanFoldLoad = true;

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  if (CanFoldLoad) {
    if (HasSSE2 && NumElems == 2)
      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);

    if (NumElems == 4)
      // If we don't care about the second element, proceed to use movss.
      if (SVOp->getMaskElt(1) != -1)
        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
  }

  // movl and movlp will both match v2i64, but v2i64 is never matched by
  // movl earlier because we make it strict to avoid messing with the movlp
  // load folding logic (see the code above getMOVLP call).  Match it here
  // then; this is horrible, but will stay like this until we move all
  // shuffle matching to x86 specific nodes.  Note that for the 1st
  // condition all types are matched with movsd.
  if (HasSSE2) {
    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
    // so as to remove this logic from here, as much as possible.
    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
  }

  assert(VT != MVT::v4i32 && "unsupported shuffle type");

  // Invert the operand order and use SHUFPS to match it.
  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
                              getShuffleSHUFImmediate(SVOp), DAG);
}

// Reduce a vector shuffle to zext.
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
                                    SelectionDAG &DAG) {
  // PMOVZX is only available from SSE41.
  if (!Subtarget->hasSSE41())
    return SDValue();

  MVT VT = Op.getSimpleValueType();

  // Only AVX2 supports 256-bit vector integer extending.
  if (!Subtarget->hasInt256() && VT.is256BitVector())
    return SDValue();

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDLoc DL(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = VT.getVectorNumElements();

  // Extending is a unary operation, and the element type of the source
  // vector cannot be i64 or larger.
  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
      VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
  while ((1U << Shift) < NumElems) {
    if (SVOp->getMaskElt(1U << Shift) == 1)
      break;
    Shift += 1;
    // The maximal ratio is 8, i.e. from i8 to i64.
    if (Shift > 3)
      return SDValue();
  }

  // Check the shuffle mask.
  unsigned Mask = (1U << Shift) - 1;
  for (unsigned i = 0; i != NumElems; ++i) {
    int EltIdx = SVOp->getMaskElt(i);
    if ((i & Mask) != 0 && EltIdx != -1)
      return SDValue();
    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
      return SDValue();
  }

  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
  MVT NeVT = MVT::getIntegerVT(NBits);
  MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);

  if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
    return SDValue();

  // Simplify the operand as it's prepared to be fed into shuffle.
  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
  if (V1.getOpcode() == ISD::BITCAST &&
      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V1.getOperand(0).getOperand(0)
        .getSimpleValueType().getSizeInBits() == SignificantBits) {
    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
    ConstantSDNode *CIdx =
      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
    // If it's foldable, i.e. a normal load with a single use, we will let
    // code selection fold it.  Otherwise, we will shorten the conversion
    // sequence.
    if (CIdx && CIdx->getZExtValue() == 0 &&
        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
      MVT FullVT = V.getSimpleValueType();
      MVT V1VT = V1.getSimpleValueType();
      if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
        // The "ext_vec_elt" node is wider than the result node.
        // In this case we should extract a subvector from V.
        // (bitcast (sclr2vec (ext_vec_elt x))) ->
        //   (bitcast (extract_subvector x)).
        unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
        MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
                                        FullVT.getVectorNumElements()/Ratio);
        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
                        DAG.getIntPtrConstant(0));
      }
      V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
    }
  }

  return DAG.getNode(ISD::BITCAST, DL, VT,
                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
}
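// Example for LowerVectorIntExtend (illustrative): a v8i16 shuffle of V1
// against undef with mask <0, -1, 1, -1, 2, -1, 3, -1> spreads V1's low
// words across i32 slots, so it is recognized as a v4i32 VZEXT of V1 and
// matched as pmovzxwd.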
static SDValue
NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
                       SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);

  if (isZeroShuffle(SVOp))
    return getZeroVector(VT, Subtarget, DAG, dl);

  // Handle splat operations.
  if (SVOp->isSplat()) {
    // Use vbroadcast whenever the splat comes from a foldable load.
    SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
    if (Broadcast.getNode())
      return Broadcast;
  }

  // Check integer expanding shuffles.
  SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
  if (NewOp.getNode())
    return NewOp;

  // If the shuffle can be profitably rewritten as a narrower shuffle, then
  // do it!
  if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
      VT == MVT::v16i16 || VT == MVT::v32i8) {
    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
    if (NewOp.getNode())
      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
  } else if ((VT == MVT::v4i32 ||
             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
    // FIXME: Figure out a cleaner way to do this.
    // Try to make use of movq to zero out the top part.
    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
      if (NewOp.getNode()) {
        MVT NewVT = NewOp.getSimpleValueType();
        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
                               NewVT, true, false))
          return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
                              DAG, Subtarget, dl);
      }
    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
      if (NewOp.getNode()) {
        MVT NewVT = NewOp.getSimpleValueType();
        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
          return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
                              DAG, Subtarget, dl);
      }
    }
  }
  return SDValue();
}
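// Example for the movq path above (illustrative): a v4i32 shuffle of X
// against an all-zeros V2 with mask <0, 1, 6, 7> keeps X's low two lanes
// and zeroes the top two; narrowed to v2i64 the mask becomes <0, 3>, which
// is lowered via getVZextMovL to VZEXT_MOVL (movq), i.e. X's low quadword
// with the high quadword cleared.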
SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  unsigned NumElems = VT.getVectorNumElements();
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;
  bool HasSSE2 = Subtarget->hasSSE2();
  bool HasFp256 = Subtarget->hasFp256();
  bool HasInt256 = Subtarget->hasInt256();
  MachineFunction &MF = DAG.getMachineFunction();
  bool OptForSize = MF.getFunction()->getAttributes().
    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);

  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");

  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);

  assert(!V1IsUndef && "Op 1 of shuffle should not be undef");

  // Vector shuffle lowering takes 3 steps:
  //
  // 1) Normalize the input vectors.  Here splats, zeroed vectors, profitable
  //    narrowing and commutation of operands should be handled.
  // 2) Matching of shuffles with known shuffle masks to x86 target specific
  //    shuffle nodes.
  // 3) Rewriting of unmatched masks into new generic shuffle operations,
  //    so the shuffle can be broken into other shuffles and the legalizer
  //    can try the lowering again.
  //
  // The general idea is that no vector_shuffle operation should be left to
  // be matched during isel, all of them must be converted to a target
  // specific node here.

  // Normalize the input vectors.  Here splats, zeroed vectors, profitable
  // narrowing and commutation of operands should be handled.  The actual
  // code doesn't include all of those, work in progress...
  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
  if (NewOp.getNode())
    return NewOp;

  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());

  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
  // unpckh_undef).  Only use pshufd if speed is more important than size.
  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
      V2IsUndef && MayFoldVectorLoad(V1))
    return getMOVDDup(Op, dl, V1, DAG);

  if (isMOVHLPS_v_undef_Mask(M, VT))
    return getMOVHighToLow(Op, dl, DAG);

  // Used to match splats.
  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
      (VT == MVT::v2f64 || VT == MVT::v2i64))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  if (isPSHUFDMask(M, VT)) {
    // The actual implementation will match the mask in the if above and
    // then during isel it can match several different instructions, not
    // only pshufd as its name says, sad but true, emulate the behavior
    // for now...
    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);

    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);

    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);

    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
                                  DAG);

    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
                                TargetMask, DAG);
  }

  if (isPALIGNRMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
                                getShufflePALIGNRImmediate(SVOp),
                                DAG);

  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    MVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (isMOVLMask(M, VT)) {
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!isMOVLPMask(M, VT)) {
      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);

      if (VT == MVT::v4i32 || VT == MVT::v4f32)
        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
    }
  }

  // FIXME: fold these into legal mask.
  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);

  if (isMOVHLPSMask(M, VT))
    return getMOVHighToLow(Op, dl, DAG);

  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);

  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);

  if (isMOVLPMask(M, VT))
    return getMOVLP(Op, dl, DAG, HasSSE2);

  if (ShouldXformToMOVHLPS(M, VT) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
    return CommuteVectorShuffle(SVOp, DAG);
  if (isShift) {
    // No better options.  Use a vshldq / vsrldq.
    MVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
    CommuteVectorShuffleMask(M, NumElems);
    std::swap(V1, V2);
    std::swap(V1IsSplat, V2IsSplat);
    Commuted = true;
  }

  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
    // Shuffling low element of v1 into undef, just return v1.
    if (V2IsUndef)
      return V1;
    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
    // the instruction selector will not match, so get a canonical MOVL with
    // swapped operands to undo the commute.
    return getMOVL(DAG, dl, VT, V2, V1);
  }

  if (isUNPCKLMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);

  if (isUNPCKHMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 point to its first
    // element, then try to match unpck{h|l} again.  If match, return a
    // new vector_shuffle with the corrected mask.
    SmallVector<int, 8> NewMask(M.begin(), M.end());
    NormalizeMask(NewMask, NumElems);
    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
  }

  if (Commuted) {
    // Commute it back and try unpck* again.
    // FIXME: this seems wrong.
    CommuteVectorShuffleMask(M, NumElems);
    std::swap(V1, V2);
    std::swap(V1IsSplat, V2IsSplat);
    Commuted = false;

    if (isUNPCKLMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);

    if (isUNPCKHMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
  }

  // Normalize the node to match x86 shuffle ops if needed.
  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
    return CommuteVectorShuffle(SVOp, DAG);
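  // Illustrative example for the V2IsSplat normalization above: a v4i32
  // mask <0, 6, 1, 7> with a splat V2 is rewritten to <0, 4, 1, 4>, which
  // then satisfies the relaxed unpckl pattern check.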
  // The checks below are all present in isShuffleMaskLegal, but they are
  // inlined here right now to enable us to directly emit target specific
  // nodes, and remove one by one until they don't return Op anymore.

  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
      SVOp->getSplatIndex() == 0 && V2IsUndef) {
    if (VT == MVT::v2f64 || VT == MVT::v2i64)
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  }

  if (isPSHUFHWMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
                                getShufflePSHUFHWImmediate(SVOp),
                                DAG);

  if (isPSHUFLWMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
                                getShufflePSHUFLWImmediate(SVOp),
                                DAG);

  if (isSHUFPMask(M, VT))
    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
                                getShuffleSHUFImmediate(SVOp), DAG);

  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  //===--------------------------------------------------------------------===//
  // Generate target specific nodes for 128 or 256-bit shuffles only
  // supported in the AVX instruction set.
  //

  // Handle VMOVDDUPY permutations.
  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);

  // Handle VPERMILPS/D* permutations.
  if (isVPERMILPMask(M, VT)) {
    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
                                  getShuffleSHUFImmediate(SVOp), DAG);
    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
                                getShuffleSHUFImmediate(SVOp), DAG);
  }

  // Handle VPERM2F128/VPERM2I128 permutations.
  if (isVPERM2X128Mask(M, VT, HasFp256))
    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);

  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
  if (BlendOp.getNode())
    return BlendOp;

  unsigned Imm8;
  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);

  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
      VT.is512BitVector()) {
    MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
    MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
    SmallVector<SDValue, 16> permclMask;
    for (unsigned i = 0; i != NumElems; ++i) {
      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
    }

    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
                               &permclMask[0], NumElems);
    if (V2IsUndef)
      // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32.
      return DAG.getNode(X86ISD::VPERMV, dl, VT,
                         DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
    return DAG.getNode(X86ISD::VPERMV3, dl, VT,
                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
  }
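  // Illustrative example for the VPERM2X128 check above: the v8f32 mask
  // <8, 9, 10, 11, 0, 1, 2, 3> takes its low 128-bit lane from V2's low
  // lane and its high lane from V1's low lane, which vperm2f128 encodes as
  // immediate 0x02.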
7552 if (VT == MVT::v8i16) { 7553 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); 7554 if (NewOp.getNode()) 7555 return NewOp; 7556 } 7557 7558 if (VT == MVT::v16i8) { 7559 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG); 7560 if (NewOp.getNode()) 7561 return NewOp; 7562 } 7563 7564 if (VT == MVT::v32i8) { 7565 SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); 7566 if (NewOp.getNode()) 7567 return NewOp; 7568 } 7569 7570 // Handle all 128-bit wide vectors with 4 elements, and match them with 7571 // several different shuffle types. 7572 if (NumElems == 4 && VT.is128BitVector()) 7573 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 7574 7575 // Handle general 256-bit shuffles 7576 if (VT.is256BitVector()) 7577 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 7578 7579 return SDValue(); 7580} 7581 7582static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { 7583 MVT VT = Op.getSimpleValueType(); 7584 SDLoc dl(Op); 7585 7586 if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) 7587 return SDValue(); 7588 7589 if (VT.getSizeInBits() == 8) { 7590 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 7591 Op.getOperand(0), Op.getOperand(1)); 7592 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 7593 DAG.getValueType(VT)); 7594 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 7595 } 7596 7597 if (VT.getSizeInBits() == 16) { 7598 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7599 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 7600 if (Idx == 0) 7601 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 7602 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 7603 DAG.getNode(ISD::BITCAST, dl, 7604 MVT::v4i32, 7605 Op.getOperand(0)), 7606 Op.getOperand(1))); 7607 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 7608 Op.getOperand(0), Op.getOperand(1)); 7609 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 7610 DAG.getValueType(VT)); 7611 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 7612 } 7613 7614 if (VT == MVT::f32) { 7615 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 7616 // the result back to FR32 register. It's only worth matching if the 7617 // result has a single use which is a store or a bitcast to i32. And in 7618 // the case of a store, it's not worth it if the index is a constant 0, 7619 // because a MOVSSmr can be used instead, which is smaller and faster. 7620 if (!Op.hasOneUse()) 7621 return SDValue(); 7622 SDNode *User = *Op.getNode()->use_begin(); 7623 if ((User->getOpcode() != ISD::STORE || 7624 (isa<ConstantSDNode>(Op.getOperand(1)) && 7625 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 7626 (User->getOpcode() != ISD::BITCAST || 7627 User->getValueType(0) != MVT::i32)) 7628 return SDValue(); 7629 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 7630 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 7631 Op.getOperand(0)), 7632 Op.getOperand(1)); 7633 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 7634 } 7635 7636 if (VT == MVT::i32 || VT == MVT::i64) { 7637 // ExtractPS/pextrq works with constant index. 
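// For example, lane 1 of a v4i32 selects to 'pextrd $1, %xmm0, %eax' and
// lane 1 of a v2i64 to 'pextrq $1, %xmm0, %rax'. The lane number is an
// immediate operand, so a variable index cannot be matched this way.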
7638 if (isa<ConstantSDNode>(Op.getOperand(1))) 7639 return Op; 7640 } 7641 return SDValue(); 7642} 7643 7644SDValue 7645X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 7646 SelectionDAG &DAG) const { 7647 SDLoc dl(Op); 7648 SDValue Vec = Op.getOperand(0); 7649 MVT VecVT = Vec.getSimpleValueType(); 7650 SDValue Idx = Op.getOperand(1); 7651 if (!isa<ConstantSDNode>(Idx)) { 7652 if (VecVT.is512BitVector() || 7653 (VecVT.is256BitVector() && Subtarget->hasInt256() && 7654 VecVT.getVectorElementType().getSizeInBits() == 32)) { 7655 7656 MVT MaskEltVT = 7657 MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits()); 7658 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / 7659 MaskEltVT.getSizeInBits()); 7660 7661 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); 7662 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, 7663 getZeroVector(MaskVT, Subtarget, DAG, dl), 7664 Idx, DAG.getConstant(0, getPointerTy())); 7665 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); 7666 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), 7667 Perm, DAG.getConstant(0, getPointerTy())); 7668 } 7669 return SDValue(); 7670 } 7671 7672 // If this is a 256-bit vector result, first extract the 128-bit vector and 7673 // then extract the element from the 128-bit vector. 7674 if (VecVT.is256BitVector() || VecVT.is512BitVector()) { 7675 7676 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 7677 // Get the 128-bit vector. 7678 Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); 7679 MVT EltVT = VecVT.getVectorElementType(); 7680 7681 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); 7682 7683 //if (IdxVal >= NumElems/2) 7684 // IdxVal -= NumElems/2; 7685 IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; 7686 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 7687 DAG.getConstant(IdxVal, MVT::i32)); 7688 } 7689 7690 assert(VecVT.is128BitVector() && "Unexpected vector length"); 7691 7692 if (Subtarget->hasSSE41()) { 7693 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 7694 if (Res.getNode()) 7695 return Res; 7696 } 7697 7698 MVT VT = Op.getSimpleValueType(); 7699 // TODO: handle v16i8. 7700 if (VT.getSizeInBits() == 16) { 7701 SDValue Vec = Op.getOperand(0); 7702 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7703 if (Idx == 0) 7704 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 7705 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 7706 DAG.getNode(ISD::BITCAST, dl, 7707 MVT::v4i32, Vec), 7708 Op.getOperand(1))); 7709 // Transform it so it matches pextrw, which produces a 32-bit result. 7710 MVT EltVT = MVT::i32; 7711 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 7712 Op.getOperand(0), Op.getOperand(1)); 7713 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 7714 DAG.getValueType(VT)); 7715 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 7716 } 7717 7718 if (VT.getSizeInBits() == 32) { 7719 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7720 if (Idx == 0) 7721 return Op; 7722 7723 // SHUFPS the element to the lowest double word, then movss.
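// For example, extracting lane 2 builds the mask <2,u,u,u> below: the
// shuffle moves element 2 into lane 0, and the lane 0 extract that follows
// is then a plain scalar move.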
7724 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; 7725 MVT VVT = Op.getOperand(0).getSimpleValueType(); 7726 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 7727 DAG.getUNDEF(VVT), Mask); 7728 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 7729 DAG.getIntPtrConstant(0)); 7730 } 7731 7732 if (VT.getSizeInBits() == 64) { 7733 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 7734 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 7735 // to match extract_elt for f64. 7736 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7737 if (Idx == 0) 7738 return Op; 7739 7740 // UNPCKHPD the element to the lowest double word, then movsd. 7741 // Note that if the lower 64 bits of the result of the UNPCKHPD are then 7742 // stored to a f64mem, the whole operation is folded into a single MOVHPDmr. 7743 int Mask[2] = { 1, -1 }; 7744 MVT VVT = Op.getOperand(0).getSimpleValueType(); 7745 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 7746 DAG.getUNDEF(VVT), Mask); 7747 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 7748 DAG.getIntPtrConstant(0)); 7749 } 7750 7751 return SDValue(); 7752} 7753 7754static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { 7755 MVT VT = Op.getSimpleValueType(); 7756 MVT EltVT = VT.getVectorElementType(); 7757 SDLoc dl(Op); 7758 7759 SDValue N0 = Op.getOperand(0); 7760 SDValue N1 = Op.getOperand(1); 7761 SDValue N2 = Op.getOperand(2); 7762 7763 if (!VT.is128BitVector()) 7764 return SDValue(); 7765 7766 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 7767 isa<ConstantSDNode>(N2)) { 7768 unsigned Opc; 7769 if (VT == MVT::v8i16) 7770 Opc = X86ISD::PINSRW; 7771 else if (VT == MVT::v16i8) 7772 Opc = X86ISD::PINSRB; 7773 else 7774 Opc = X86ISD::PINSRB; 7775 7776 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its 7777 // second argument. 7778 if (N1.getValueType() != MVT::i32) 7779 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 7780 if (N2.getValueType() != MVT::i32) 7781 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 7782 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 7783 } 7784 7785 if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 7786 // Bits [7:6] of the constant are the source select. This will always be 7787 // zero here. The DAG Combiner may combine an extract_elt index into these 7788 // bits. For example (insert (extract, 3), 2) could be matched by putting 7789 // the '3' into bits [7:6] of X86ISD::INSERTPS. 7790 // Bits [5:4] of the constant are the destination select. This is the 7791 // value of the incoming immediate. 7792 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 7793 // combine either bitwise AND or insert of float 0.0 to set these bits. 7794 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 7795 // Create this as a scalar to vector. 7796 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 7797 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 7798 } 7799 7800 if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) { 7801 // PINSR* works with constant index.
7802 return Op; 7803 } 7804 return SDValue(); 7805} 7806 7807SDValue 7808X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 7809 MVT VT = Op.getSimpleValueType(); 7810 MVT EltVT = VT.getVectorElementType(); 7811 7812 SDLoc dl(Op); 7813 SDValue N0 = Op.getOperand(0); 7814 SDValue N1 = Op.getOperand(1); 7815 SDValue N2 = Op.getOperand(2); 7816 7817 // If this is a 256-bit vector result, first extract the 128-bit vector, 7818 // insert the element into the extracted half and then place it back. 7819 if (VT.is256BitVector() || VT.is512BitVector()) { 7820 if (!isa<ConstantSDNode>(N2)) 7821 return SDValue(); 7822 7823 // Get the desired 128-bit vector half. 7824 unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); 7825 SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); 7826 7827 // Insert the element into the desired half. 7828 unsigned NumEltsIn128 = 128/EltVT.getSizeInBits(); 7829 unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128; 7830 7831 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, 7832 DAG.getConstant(IdxIn128, MVT::i32)); 7833 7834 // Insert the changed part back into the 256-bit vector. 7835 return Insert128BitVector(N0, V, IdxVal, DAG, dl); 7836 } 7837 7838 if (Subtarget->hasSSE41()) 7839 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 7840 7841 if (EltVT == MVT::i8) 7842 return SDValue(); 7843 7844 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 7845 // Transform it so it matches pinsrw, which expects a 16-bit value in a 7846 // GR32 as its second argument. 7847 if (N1.getValueType() != MVT::i32) 7848 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 7849 if (N2.getValueType() != MVT::i32) 7850 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 7851 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 7852 } 7853 return SDValue(); 7854} 7855 7856static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 7857 SDLoc dl(Op); 7858 MVT OpVT = Op.getSimpleValueType(); 7859 7860 // If this is a 256-bit vector result, first insert into a 128-bit 7861 // vector and then insert into the 256-bit vector. 7862 if (!OpVT.is128BitVector()) { 7863 // Insert into a 128-bit vector. 7864 unsigned SizeFactor = OpVT.getSizeInBits()/128; 7865 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), 7866 OpVT.getVectorNumElements() / SizeFactor); 7867 7868 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 7869 7870 // Insert the 128-bit vector. 7871 return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); 7872 } 7873 7874 if (OpVT == MVT::v1i64 && 7875 Op.getOperand(0).getValueType() == MVT::i64) 7876 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 7877 7878 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 7879 assert(OpVT.is128BitVector() && "Expected an SSE type!"); 7880 return DAG.getNode(ISD::BITCAST, dl, OpVT, 7881 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 7882} 7883 7884// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 7885// a simple subregister reference or explicit instructions to grab 7886// the upper bits of a vector.
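// For example, on AVX, extracting the upper half of a v8i32 (IdxVal == 4)
// becomes 'vextractf128 $1, %ymm0, %xmm0'.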
7887static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, 7888 SelectionDAG &DAG) { 7889 SDLoc dl(Op); 7890 SDValue In = Op.getOperand(0); 7891 SDValue Idx = Op.getOperand(1); 7892 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 7893 MVT ResVT = Op.getSimpleValueType(); 7894 MVT InVT = In.getSimpleValueType(); 7895 7896 if (Subtarget->hasFp256()) { 7897 if (ResVT.is128BitVector() && 7898 (InVT.is256BitVector() || InVT.is512BitVector()) && 7899 isa<ConstantSDNode>(Idx)) { 7900 return Extract128BitVector(In, IdxVal, DAG, dl); 7901 } 7902 if (ResVT.is256BitVector() && InVT.is512BitVector() && 7903 isa<ConstantSDNode>(Idx)) { 7904 return Extract256BitVector(In, IdxVal, DAG, dl); 7905 } 7906 } 7907 return SDValue(); 7908} 7909 7910// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 7911// simple superregister reference or explicit instructions to insert 7912// the upper bits of a vector. 7913static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, 7914 SelectionDAG &DAG) { 7915 if (Subtarget->hasFp256()) { 7916 SDLoc dl(Op.getNode()); 7917 SDValue Vec = Op.getNode()->getOperand(0); 7918 SDValue SubVec = Op.getNode()->getOperand(1); 7919 SDValue Idx = Op.getNode()->getOperand(2); 7920 7921 if ((Op.getNode()->getSimpleValueType(0).is256BitVector() || 7922 Op.getNode()->getSimpleValueType(0).is512BitVector()) && 7923 SubVec.getNode()->getSimpleValueType(0).is128BitVector() && 7924 isa<ConstantSDNode>(Idx)) { 7925 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 7926 return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); 7927 } 7928 7929 if (Op.getNode()->getSimpleValueType(0).is512BitVector() && 7930 SubVec.getNode()->getSimpleValueType(0).is256BitVector() && 7931 isa<ConstantSDNode>(Idx)) { 7932 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 7933 return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); 7934 } 7935 } 7936 return SDValue(); 7937} 7938 7939// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 7940// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is 7941// one of the above mentioned nodes. It has to be wrapped because otherwise 7942// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 7943// be used to form an addressing mode. These wrapped nodes will be selected 7944// into MOV32ri. 7945SDValue 7946X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 7947 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 7948 7949 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7950 // global base reg. 7951 unsigned char OpFlag = 0; 7952 unsigned WrapperKind = X86ISD::Wrapper; 7953 CodeModel::Model M = getTargetMachine().getCodeModel(); 7954 7955 if (Subtarget->isPICStyleRIPRel() && 7956 (M == CodeModel::Small || M == CodeModel::Kernel)) 7957 WrapperKind = X86ISD::WrapperRIP; 7958 else if (Subtarget->isPICStyleGOT()) 7959 OpFlag = X86II::MO_GOTOFF; 7960 else if (Subtarget->isPICStyleStubPIC()) 7961 OpFlag = X86II::MO_PIC_BASE_OFFSET; 7962 7963 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 7964 CP->getAlignment(), 7965 CP->getOffset(), OpFlag); 7966 SDLoc DL(CP); 7967 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7968 // With PIC, the address is actually $g + Offset.
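// For example, on i386 ELF PIC this typically selects to something like
// 'leal .LCPI0_0@GOTOFF(%ebx), %eax' (label name illustrative), where %ebx
// holds the PIC base $g.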
7969 if (OpFlag) { 7970 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7971 DAG.getNode(X86ISD::GlobalBaseReg, 7972 SDLoc(), getPointerTy()), 7973 Result); 7974 } 7975 7976 return Result; 7977} 7978 7979SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 7980 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 7981 7982 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7983 // global base reg. 7984 unsigned char OpFlag = 0; 7985 unsigned WrapperKind = X86ISD::Wrapper; 7986 CodeModel::Model M = getTargetMachine().getCodeModel(); 7987 7988 if (Subtarget->isPICStyleRIPRel() && 7989 (M == CodeModel::Small || M == CodeModel::Kernel)) 7990 WrapperKind = X86ISD::WrapperRIP; 7991 else if (Subtarget->isPICStyleGOT()) 7992 OpFlag = X86II::MO_GOTOFF; 7993 else if (Subtarget->isPICStyleStubPIC()) 7994 OpFlag = X86II::MO_PIC_BASE_OFFSET; 7995 7996 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 7997 OpFlag); 7998 SDLoc DL(JT); 7999 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 8000 8001 // With PIC, the address is actually $g + Offset. 8002 if (OpFlag) 8003 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8004 DAG.getNode(X86ISD::GlobalBaseReg, 8005 SDLoc(), getPointerTy()), 8006 Result); 8007 8008 return Result; 8009} 8010 8011SDValue 8012X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 8013 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 8014 8015 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 8016 // global base reg. 8017 unsigned char OpFlag = 0; 8018 unsigned WrapperKind = X86ISD::Wrapper; 8019 CodeModel::Model M = getTargetMachine().getCodeModel(); 8020 8021 if (Subtarget->isPICStyleRIPRel() && 8022 (M == CodeModel::Small || M == CodeModel::Kernel)) { 8023 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) 8024 OpFlag = X86II::MO_GOTPCREL; 8025 WrapperKind = X86ISD::WrapperRIP; 8026 } else if (Subtarget->isPICStyleGOT()) { 8027 OpFlag = X86II::MO_GOT; 8028 } else if (Subtarget->isPICStyleStubPIC()) { 8029 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; 8030 } else if (Subtarget->isPICStyleStubNoDynamic()) { 8031 OpFlag = X86II::MO_DARWIN_NONLAZY; 8032 } 8033 8034 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 8035 8036 SDLoc DL(Op); 8037 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 8038 8039 // With PIC, the address is actually $g + Offset. 8040 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 8041 !Subtarget->is64Bit()) { 8042 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8043 DAG.getNode(X86ISD::GlobalBaseReg, 8044 SDLoc(), getPointerTy()), 8045 Result); 8046 } 8047 8048 // For symbols that require a load from a stub to get the address, emit the 8049 // load. 8050 if (isGlobalStubReference(OpFlag)) 8051 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, 8052 MachinePointerInfo::getGOT(), false, false, false, 0); 8053 8054 return Result; 8055} 8056 8057SDValue 8058X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 8059 // Create the TargetBlockAddress node.
8060 unsigned char OpFlags = 8061 Subtarget->ClassifyBlockAddressReference(); 8062 CodeModel::Model M = getTargetMachine().getCodeModel(); 8063 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 8064 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); 8065 SDLoc dl(Op); 8066 SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset, 8067 OpFlags); 8068 8069 if (Subtarget->isPICStyleRIPRel() && 8070 (M == CodeModel::Small || M == CodeModel::Kernel)) 8071 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 8072 else 8073 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 8074 8075 // With PIC, the address is actually $g + Offset. 8076 if (isGlobalRelativeToPICBase(OpFlags)) { 8077 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 8078 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 8079 Result); 8080 } 8081 8082 return Result; 8083} 8084 8085SDValue 8086X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, 8087 int64_t Offset, SelectionDAG &DAG) const { 8088 // Create the TargetGlobalAddress node, folding in the constant 8089 // offset if it is legal. 8090 unsigned char OpFlags = 8091 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 8092 CodeModel::Model M = getTargetMachine().getCodeModel(); 8093 SDValue Result; 8094 if (OpFlags == X86II::MO_NO_FLAG && 8095 X86::isOffsetSuitableForCodeModel(Offset, M)) { 8096 // A direct static reference to a global. 8097 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 8098 Offset = 0; 8099 } else { 8100 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 8101 } 8102 8103 if (Subtarget->isPICStyleRIPRel() && 8104 (M == CodeModel::Small || M == CodeModel::Kernel)) 8105 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 8106 else 8107 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 8108 8109 // With PIC, the address is actually $g + Offset. 8110 if (isGlobalRelativeToPICBase(OpFlags)) { 8111 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 8112 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 8113 Result); 8114 } 8115 8116 // For globals that require a load from a stub to get the address, emit the 8117 // load. 8118 if (isGlobalStubReference(OpFlags)) 8119 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 8120 MachinePointerInfo::getGOT(), false, false, false, 0); 8121 8122 // If there was a non-zero offset that we didn't fold, create an explicit 8123 // addition for it. 
8124 if (Offset != 0) 8125 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 8126 DAG.getConstant(Offset, getPointerTy())); 8127 8128 return Result; 8129} 8130 8131SDValue 8132X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 8133 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 8134 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 8135 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); 8136} 8137 8138static SDValue 8139GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 8140 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 8141 unsigned char OperandFlags, bool LocalDynamic = false) { 8142 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8143 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 8144 SDLoc dl(GA); 8145 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 8146 GA->getValueType(0), 8147 GA->getOffset(), 8148 OperandFlags); 8149 8150 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR 8151 : X86ISD::TLSADDR; 8152 8153 if (InFlag) { 8154 SDValue Ops[] = { Chain, TGA, *InFlag }; 8155 Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); 8156 } else { 8157 SDValue Ops[] = { Chain, TGA }; 8158 Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); 8159 } 8160 8161 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 8162 MFI->setAdjustsStack(true); 8163 8164 SDValue Flag = Chain.getValue(1); 8165 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 8166} 8167 8168// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 8169static SDValue 8170LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 8171 const EVT PtrVT) { 8172 SDValue InFlag; 8173 SDLoc dl(GA); // ? function entry point might be better 8174 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 8175 DAG.getNode(X86ISD::GlobalBaseReg, 8176 SDLoc(), PtrVT), InFlag); 8177 InFlag = Chain.getValue(1); 8178 8179 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 8180} 8181 8182// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 8183static SDValue 8184LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 8185 const EVT PtrVT) { 8186 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 8187 X86::RAX, X86II::MO_TLSGD); 8188} 8189 8190static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, 8191 SelectionDAG &DAG, 8192 const EVT PtrVT, 8193 bool is64Bit) { 8194 SDLoc dl(GA); 8195 8196 // Get the start address of the TLS block for this module. 8197 X86MachineFunctionInfo* MFI = DAG.getMachineFunction() 8198 .getInfo<X86MachineFunctionInfo>(); 8199 MFI->incNumLocalDynamicTLSAccesses(); 8200 8201 SDValue Base; 8202 if (is64Bit) { 8203 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX, 8204 X86II::MO_TLSLD, /*LocalDynamic=*/true); 8205 } else { 8206 SDValue InFlag; 8207 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 8208 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); 8209 InFlag = Chain.getValue(1); 8210 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, 8211 X86II::MO_TLSLDM, /*LocalDynamic=*/true); 8212 } 8213 8214 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations 8215 // of Base. 8216 8217 // Build x@dtpoff. 
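// For example, in the 32-bit local dynamic sequence the TLSLDM call above
// leaves the module's TLS base in %eax, and the variable is then reached
// with something like 'leal x@dtpoff(%eax), %edx'.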
8218 unsigned char OperandFlags = X86II::MO_DTPOFF; 8219 unsigned WrapperKind = X86ISD::Wrapper; 8220 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 8221 GA->getValueType(0), 8222 GA->getOffset(), OperandFlags); 8223 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 8224 8225 // Add x@dtpoff with the base. 8226 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); 8227} 8228 8229// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. 8230static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 8231 const EVT PtrVT, TLSModel::Model model, 8232 bool is64Bit, bool isPIC) { 8233 SDLoc dl(GA); 8234 8235 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 8236 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 8237 is64Bit ? 257 : 256)); 8238 8239 SDValue ThreadPointer = 8240 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0), 8241 MachinePointerInfo(Ptr), false, false, false, 0); 8242 8243 unsigned char OperandFlags = 0; 8244 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 8245 // initialexec. 8246 unsigned WrapperKind = X86ISD::Wrapper; 8247 if (model == TLSModel::LocalExec) { 8248 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 8249 } else if (model == TLSModel::InitialExec) { 8250 if (is64Bit) { 8251 OperandFlags = X86II::MO_GOTTPOFF; 8252 WrapperKind = X86ISD::WrapperRIP; 8253 } else { 8254 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; 8255 } 8256 } else { 8257 llvm_unreachable("Unexpected model"); 8258 } 8259 8260 // emit "addl x@ntpoff,%eax" (local exec) 8261 // or "addl x@indntpoff,%eax" (initial exec) 8262 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) 8263 SDValue TGA = 8264 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), 8265 GA->getOffset(), OperandFlags); 8266 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 8267 8268 if (model == TLSModel::InitialExec) { 8269 if (isPIC && !is64Bit) { 8270 Offset = DAG.getNode(ISD::ADD, dl, PtrVT, 8271 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), 8272 Offset); 8273 } 8274 8275 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 8276 MachinePointerInfo::getGOT(), false, false, false, 0); 8277 } 8278 8279 // The address of the thread local variable is the add of the thread 8280 // pointer with the offset of the variable. 
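// For example, the complete 32-bit local exec sequence is roughly
// 'movl %gs:0, %eax' followed by 'addl x@ntpoff, %eax', where the %gs:0
// load above produced ThreadPointer and the wrapped x@ntpoff is Offset.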
8281 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 8282} 8283 8284SDValue 8285X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 8286 8287 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 8288 const GlobalValue *GV = GA->getGlobal(); 8289 8290 if (Subtarget->isTargetELF()) { 8291 TLSModel::Model model = getTargetMachine().getTLSModel(GV); 8292 8293 switch (model) { 8294 case TLSModel::GeneralDynamic: 8295 if (Subtarget->is64Bit()) 8296 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 8297 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 8298 case TLSModel::LocalDynamic: 8299 return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(), 8300 Subtarget->is64Bit()); 8301 case TLSModel::InitialExec: 8302 case TLSModel::LocalExec: 8303 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 8304 Subtarget->is64Bit(), 8305 getTargetMachine().getRelocationModel() == Reloc::PIC_); 8306 } 8307 llvm_unreachable("Unknown TLS model."); 8308 } 8309 8310 if (Subtarget->isTargetDarwin()) { 8311 // Darwin only has one model of TLS. Lower to that. 8312 unsigned char OpFlag = 0; 8313 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 8314 X86ISD::WrapperRIP : X86ISD::Wrapper; 8315 8316 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 8317 // global base reg. 8318 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 8319 !Subtarget->is64Bit(); 8320 if (PIC32) 8321 OpFlag = X86II::MO_TLVP_PIC_BASE; 8322 else 8323 OpFlag = X86II::MO_TLVP; 8324 SDLoc DL(Op); 8325 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 8326 GA->getValueType(0), 8327 GA->getOffset(), OpFlag); 8328 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 8329 8330 // With PIC32, the address is actually $g + Offset. 8331 if (PIC32) 8332 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8333 DAG.getNode(X86ISD::GlobalBaseReg, 8334 SDLoc(), getPointerTy()), 8335 Offset); 8336 8337 // Lowering the machine isd will make sure everything is in the right 8338 // location. 8339 SDValue Chain = DAG.getEntryNode(); 8340 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 8341 SDValue Args[] = { Chain, Offset }; 8342 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 8343 8344 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 8345 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8346 MFI->setAdjustsStack(true); 8347 8348 // And our return value (tls address) is in the standard call return value 8349 // location. 8350 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 8351 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), 8352 Chain.getValue(1)); 8353 } 8354 8355 if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) { 8356 // Just use the implicit TLS architecture. 8357 // Need to generate something similar to: 8358 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage 8359 // ; from TEB 8360 // mov ecx, dword [rel _tls_index]: Load index (from C runtime) 8361 // mov rcx, qword [rdx+rcx*8] 8362 // mov eax, .tls$:tlsvar 8363 // [rax+rcx] contains the address 8364 // Windows 64bit: gs:0x58 8365 // Windows 32bit: fs:__tls_array 8366 8367 // If GV is an alias then use the aliasee for determining 8368 // thread-localness.
8369 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 8370 GV = GA->resolveAliasedGlobal(false); 8371 SDLoc dl(GA); 8372 SDValue Chain = DAG.getEntryNode(); 8373 8374 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or 8375 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly 8376 // use its literal value of 0x2C. 8377 Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() 8378 ? Type::getInt8PtrTy(*DAG.getContext(), 8379 256) 8380 : Type::getInt32PtrTy(*DAG.getContext(), 8381 257)); 8382 8383 SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) : 8384 (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) : 8385 DAG.getExternalSymbol("_tls_array", getPointerTy())); 8386 8387 SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray, 8388 MachinePointerInfo(Ptr), 8389 false, false, false, 0); 8390 8391 // Load the _tls_index variable 8392 SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); 8393 if (Subtarget->is64Bit()) 8394 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, 8395 IDX, MachinePointerInfo(), MVT::i32, 8396 false, false, 0); 8397 else 8398 IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), 8399 false, false, false, 0); 8400 8401 SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), 8402 getPointerTy()); 8403 IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); 8404 8405 SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); 8406 res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(), 8407 false, false, false, 0); 8408 8409 // Get the offset of start of .tls section 8410 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 8411 GA->getValueType(0), 8412 GA->getOffset(), X86II::MO_SECREL); 8413 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA); 8414 8415 // The address of the thread local variable is the add of the thread 8416 // pointer with the offset of the variable. 8417 return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset); 8418 } 8419 8420 llvm_unreachable("TLS not implemented for this target."); 8421} 8422 8423/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values 8424/// and take a 2 x i32 value to shift plus a shift amount. 8425SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ 8426 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 8427 EVT VT = Op.getValueType(); 8428 unsigned VTBits = VT.getSizeInBits(); 8429 SDLoc dl(Op); 8430 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 8431 SDValue ShOpLo = Op.getOperand(0); 8432 SDValue ShOpHi = Op.getOperand(1); 8433 SDValue ShAmt = Op.getOperand(2); 8434 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the 8435 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away 8436 // during isel. 8437 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 8438 DAG.getConstant(VTBits - 1, MVT::i8)); 8439 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 8440 DAG.getConstant(VTBits - 1, MVT::i8)) 8441 : DAG.getConstant(0, VT); 8442 8443 SDValue Tmp2, Tmp3; 8444 if (Op.getOpcode() == ISD::SHL_PARTS) { 8445 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 8446 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); 8447 } else { 8448 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 8449 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); 8450 } 8451 8452 // If the shift amount is greater than or equal to the width of a part, we 8453 // can't rely on the results of shld/shrd. Insert a test and select the 8454 // appropriate values for large shift amounts. 8455 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 8456 DAG.getConstant(VTBits, MVT::i8)); 8457 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 8458 AndNode, DAG.getConstant(0, MVT::i8)); 8459 8460 SDValue Hi, Lo; 8461 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8462 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 8463 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 8464 8465 if (Op.getOpcode() == ISD::SHL_PARTS) { 8466 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 8467 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 8468 } else { 8469 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 8470 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 8471 } 8472 8473 SDValue Ops[2] = { Lo, Hi }; 8474 return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); 8475} 8476 8477SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 8478 SelectionDAG &DAG) const { 8479 EVT SrcVT = Op.getOperand(0).getValueType(); 8480 8481 if (SrcVT.isVector()) 8482 return SDValue(); 8483 8484 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 8485 "Unknown SINT_TO_FP to lower!"); 8486 8487 // These are really Legal; return the operand so the caller accepts it as 8488 // Legal. 8489 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 8490 return Op; 8491 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 8492 Subtarget->is64Bit()) { 8493 return Op; 8494 } 8495 8496 SDLoc dl(Op); 8497 unsigned Size = SrcVT.getSizeInBits()/8; 8498 MachineFunction &MF = DAG.getMachineFunction(); 8499 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 8500 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8501 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 8502 StackSlot, 8503 MachinePointerInfo::getFixedStack(SSFI), 8504 false, false, 0); 8505 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 8506} 8507 8508SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 8509 SDValue StackSlot, 8510 SelectionDAG &DAG) const { 8511 // Build the FILD 8512 SDLoc DL(Op); 8513 SDVTList Tys; 8514 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 8515 if (useSSE) 8516 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 8517 else 8518 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 8519 8520 unsigned ByteSize = SrcVT.getSizeInBits()/8; 8521 8522 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 8523 MachineMemOperand *MMO; 8524 if (FI) { 8525 int SSFI = FI->getIndex(); 8526 MMO = 8527 DAG.getMachineFunction() 8528 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8529 MachineMemOperand::MOLoad, ByteSize, ByteSize); 8530 } else { 8531 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 8532 StackSlot = StackSlot.getOperand(1); 8533 } 8534 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 8535 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 8536 X86ISD::FILD, DL, 8537 Tys, Ops, array_lengthof(Ops), 8538 SrcVT, MMO); 8539 8540 if (useSSE) { 8541 Chain = Result.getValue(1); 8542 SDValue InFlag = Result.getValue(2); 8543 8544 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 8545 // shouldn't be necessary except that RFP cannot be live across 8546 // multiple blocks.
When the stackifier is fixed, they can be uncoupled. 8547 MachineFunction &MF = DAG.getMachineFunction(); 8548 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 8549 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 8550 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8551 Tys = DAG.getVTList(MVT::Other); 8552 SDValue Ops[] = { 8553 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 8554 }; 8555 MachineMemOperand *MMO = 8556 DAG.getMachineFunction() 8557 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8558 MachineMemOperand::MOStore, SSFISize, SSFISize); 8559 8560 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 8561 Ops, array_lengthof(Ops), 8562 Op.getValueType(), MMO); 8563 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 8564 MachinePointerInfo::getFixedStack(SSFI), 8565 false, false, false, 0); 8566 } 8567 8568 return Result; 8569} 8570 8571// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 8572SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 8573 SelectionDAG &DAG) const { 8574 // This algorithm is not obvious. Here is what we're trying to output: 8575 /* 8576 movq %rax, %xmm0 8577 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } 8578 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } 8579 #ifdef __SSE3__ 8580 haddpd %xmm0, %xmm0 8581 #else 8582 pshufd $0x4e, %xmm0, %xmm1 8583 addpd %xmm1, %xmm0 8584 #endif 8585 */ 8586 8587 SDLoc dl(Op); 8588 LLVMContext *Context = DAG.getContext(); 8589 8590 // Build some magic constants. 8591 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; 8592 Constant *C0 = ConstantDataVector::get(*Context, CV0); 8593 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 8594 8595 SmallVector<Constant*,2> CV1; 8596 CV1.push_back( 8597 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, 8598 APInt(64, 0x4330000000000000ULL)))); 8599 CV1.push_back( 8600 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, 8601 APInt(64, 0x4530000000000000ULL)))); 8602 Constant *C1 = ConstantVector::get(CV1); 8603 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 8604 8605 // Load the 64-bit value into an XMM register. 8606 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 8607 Op.getOperand(0)); 8608 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 8609 MachinePointerInfo::getConstantPool(), 8610 false, false, false, 16); 8611 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, 8612 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), 8613 CLod0); 8614 8615 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 8616 MachinePointerInfo::getConstantPool(), 8617 false, false, false, 16); 8618 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); 8619 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 8620 SDValue Result; 8621 8622 if (Subtarget->hasSSE3()) { 8623 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
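// haddpd with both operands equal to Sub computes Sub[0] + Sub[1] in both
// lanes, so lane 0 carries the final sum that the extract below returns.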
8624 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); 8625 } else { 8626 SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); 8627 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, 8628 S2F, 0x4E, DAG); 8629 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, 8630 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), 8631 Sub); 8632 } 8633 8634 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, 8635 DAG.getIntPtrConstant(0)); 8636} 8637 8638// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 8639SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 8640 SelectionDAG &DAG) const { 8641 SDLoc dl(Op); 8642 // FP constant to bias correct the final result. 8643 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 8644 MVT::f64); 8645 8646 // Load the 32-bit value into an XMM register. 8647 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 8648 Op.getOperand(0)); 8649 8650 // Zero out the upper parts of the register. 8651 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); 8652 8653 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 8654 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 8655 DAG.getIntPtrConstant(0)); 8656 8657 // Or the load with the bias. 8658 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 8659 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 8660 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 8661 MVT::v2f64, Load)), 8662 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 8663 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 8664 MVT::v2f64, Bias))); 8665 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 8666 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 8667 DAG.getIntPtrConstant(0)); 8668 8669 // Subtract the bias. 8670 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 8671 8672 // Handle final rounding. 8673 EVT DestVT = Op.getValueType(); 8674 8675 if (DestVT.bitsLT(MVT::f64)) 8676 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 8677 DAG.getIntPtrConstant(0)); 8678 if (DestVT.bitsGT(MVT::f64)) 8679 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 8680 8681 // Handle final rounding. 8682 return Sub; 8683} 8684 8685SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, 8686 SelectionDAG &DAG) const { 8687 SDValue N0 = Op.getOperand(0); 8688 EVT SVT = N0.getValueType(); 8689 SDLoc dl(Op); 8690 8691 assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || 8692 SVT == MVT::v8i8 || SVT == MVT::v8i16) && 8693 "Custom UINT_TO_FP is not supported!"); 8694 8695 EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 8696 SVT.getVectorNumElements()); 8697 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), 8698 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); 8699} 8700 8701SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 8702 SelectionDAG &DAG) const { 8703 SDValue N0 = Op.getOperand(0); 8704 SDLoc dl(Op); 8705 8706 if (Op.getValueType().isVector()) 8707 return lowerUINT_TO_FP_vec(Op, DAG); 8708 8709 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 8710 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 8711 // the optimization here. 
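// For example, uitofp (zext i16 %x to i32): the sign bit is known zero,
// so the equivalent (and cheaper to lower) sitofp is emitted instead.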
8712 if (DAG.SignBitIsZero(N0)) 8713 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 8714 8715 EVT SrcVT = N0.getValueType(); 8716 EVT DstVT = Op.getValueType(); 8717 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 8718 return LowerUINT_TO_FP_i64(Op, DAG); 8719 if (SrcVT == MVT::i32 && X86ScalarSSEf64) 8720 return LowerUINT_TO_FP_i32(Op, DAG); 8721 if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) 8722 return SDValue(); 8723 8724 // Make a 64-bit buffer, and use it to build an FILD. 8725 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 8726 if (SrcVT == MVT::i32) { 8727 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 8728 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 8729 getPointerTy(), StackSlot, WordOff); 8730 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 8731 StackSlot, MachinePointerInfo(), 8732 false, false, 0); 8733 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 8734 OffsetSlot, MachinePointerInfo(), 8735 false, false, 0); 8736 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 8737 return Fild; 8738 } 8739 8740 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 8741 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 8742 StackSlot, MachinePointerInfo(), 8743 false, false, 0); 8744 // For i64 source, we need to add the appropriate power of 2 if the input 8745 // was negative. This is the same as the optimization in 8746 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here, 8747 // we must be careful to do the computation in x87 extended precision, not 8748 // in SSE. (The generic code can't know it's OK to do this, or how to.) 8749 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 8750 MachineMemOperand *MMO = 8751 DAG.getMachineFunction() 8752 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8753 MachineMemOperand::MOLoad, 8, 8); 8754 8755 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 8756 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 8757 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 8758 array_lengthof(Ops), MVT::i64, MMO); 8759 8760 APInt FF(32, 0x5F800000ULL); 8761 8762 // Check whether the sign bit is set. 8763 SDValue SignSet = DAG.getSetCC(dl, 8764 getSetCCResultType(*DAG.getContext(), MVT::i64), 8765 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 8766 ISD::SETLT); 8767 8768 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 8769 SDValue FudgePtr = DAG.getConstantPool( 8770 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 8771 getPointerTy()); 8772 8773 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 8774 SDValue Zero = DAG.getIntPtrConstant(0); 8775 SDValue Four = DAG.getIntPtrConstant(4); 8776 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 8777 Zero, Four); 8778 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 8779 8780 // Load the value out, extending it from f32 to f80. 8781 // FIXME: Avoid the extend by constructing the right constant pool? 8782 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 8783 FudgePtr, MachinePointerInfo::getConstantPool(), 8784 MVT::f32, false, false, 4); 8785 // Extend everything to 80 bits to force it to be done on x87.
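// The fudge constant 0x5F800000 is 2^64 as an IEEE-754 float. For example,
// an input of 2^63 is reloaded by FILD as -2^63, since FILD interprets the
// bits as signed; adding the 2^64 fudge in f80 restores the intended 2^63.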
8786 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 8787 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 8788} 8789 8790std::pair<SDValue,SDValue> 8791X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, 8792 bool IsSigned, bool IsReplace) const { 8793 SDLoc DL(Op); 8794 8795 EVT DstTy = Op.getValueType(); 8796 8797 if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { 8798 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 8799 DstTy = MVT::i64; 8800 } 8801 8802 assert(DstTy.getSimpleVT() <= MVT::i64 && 8803 DstTy.getSimpleVT() >= MVT::i16 && 8804 "Unknown FP_TO_INT to lower!"); 8805 8806 // These are really Legal. 8807 if (DstTy == MVT::i32 && 8808 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 8809 return std::make_pair(SDValue(), SDValue()); 8810 if (Subtarget->is64Bit() && 8811 DstTy == MVT::i64 && 8812 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 8813 return std::make_pair(SDValue(), SDValue()); 8814 8815 // We lower FP->int64 either into FISTP64 followed by a load from a temporary 8816 // stack slot, or into the FTOL runtime function. 8817 MachineFunction &MF = DAG.getMachineFunction(); 8818 unsigned MemSize = DstTy.getSizeInBits()/8; 8819 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 8820 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8821 8822 unsigned Opc; 8823 if (!IsSigned && isIntegerTypeFTOL(DstTy)) 8824 Opc = X86ISD::WIN_FTOL; 8825 else 8826 switch (DstTy.getSimpleVT().SimpleTy) { 8827 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 8828 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 8829 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 8830 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 8831 } 8832 8833 SDValue Chain = DAG.getEntryNode(); 8834 SDValue Value = Op.getOperand(0); 8835 EVT TheVT = Op.getOperand(0).getValueType(); 8836 // FIXME This causes a redundant load/store if the SSE-class value is already 8837 // in memory, such as if it is on the callstack. 
8838 if (isScalarFPTypeInSSEReg(TheVT)) { 8839 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 8840 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 8841 MachinePointerInfo::getFixedStack(SSFI), 8842 false, false, 0); 8843 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 8844 SDValue Ops[] = { 8845 Chain, StackSlot, DAG.getValueType(TheVT) 8846 }; 8847 8848 MachineMemOperand *MMO = 8849 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8850 MachineMemOperand::MOLoad, MemSize, MemSize); 8851 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 8852 array_lengthof(Ops), DstTy, MMO); 8853 Chain = Value.getValue(1); 8854 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 8855 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8856 } 8857 8858 MachineMemOperand *MMO = 8859 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8860 MachineMemOperand::MOStore, MemSize, MemSize); 8861 8862 if (Opc != X86ISD::WIN_FTOL) { 8863 // Build the FP_TO_INT*_IN_MEM 8864 SDValue Ops[] = { Chain, Value, StackSlot }; 8865 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 8866 Ops, array_lengthof(Ops), DstTy, 8867 MMO); 8868 return std::make_pair(FIST, StackSlot); 8869 } else { 8870 SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, 8871 DAG.getVTList(MVT::Other, MVT::Glue), 8872 Chain, Value); 8873 SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, 8874 MVT::i32, ftol.getValue(1)); 8875 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, 8876 MVT::i32, eax.getValue(2)); 8877 SDValue Ops[] = { eax, edx }; 8878 SDValue pair = IsReplace 8879 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops)) 8880 : DAG.getMergeValues(Ops, array_lengthof(Ops), DL); 8881 return std::make_pair(pair, SDValue()); 8882 } 8883} 8884 8885static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, 8886 const X86Subtarget *Subtarget) { 8887 MVT VT = Op->getSimpleValueType(0); 8888 SDValue In = Op->getOperand(0); 8889 MVT InVT = In.getSimpleValueType(); 8890 SDLoc dl(Op); 8891 8892 // Optimize vectors in AVX mode: 8893 // 8894 // v8i16 -> v8i32 8895 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. 8896 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. 8897 // Concat upper and lower parts. 8898 // 8899 // v4i32 -> v4i64 8900 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. 8901 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. 8902 // Concat upper and lower parts. 8903 // 8904 8905 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && 8906 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && 8907 ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) 8908 return SDValue(); 8909 8910 if (Subtarget->hasInt256()) 8911 return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In); 8912 8913 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); 8914 SDValue Undef = DAG.getUNDEF(InVT); 8915 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; 8916 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); 8917 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? 
ZeroVec : Undef); 8918 8919 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), 8920 VT.getVectorNumElements()/2); 8921 8922 OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); 8923 OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); 8924 8925 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 8926} 8927 8928static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, 8929 SelectionDAG &DAG) { 8930 MVT VT = Op->getValueType(0).getSimpleVT(); 8931 SDValue In = Op->getOperand(0); 8932 MVT InVT = In.getValueType().getSimpleVT(); 8933 SDLoc DL(Op); 8934 unsigned int NumElts = VT.getVectorNumElements(); 8935 if (NumElts != 8 && NumElts != 16) 8936 return SDValue(); 8937 8938 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) 8939 return DAG.getNode(X86ISD::VZEXT, DL, VT, In); 8940 8941 EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32; 8942 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8943 // Now we have only mask extension 8944 assert(InVT.getVectorElementType() == MVT::i1); 8945 SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType()); 8946 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); 8947 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 8948 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 8949 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, 8950 MachinePointerInfo::getConstantPool(), 8951 false, false, false, Alignment); 8952 8953 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld); 8954 if (VT.is512BitVector()) 8955 return Brcst; 8956 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst); 8957} 8958 8959static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, 8960 SelectionDAG &DAG) { 8961 if (Subtarget->hasFp256()) { 8962 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); 8963 if (Res.getNode()) 8964 return Res; 8965 } 8966 8967 return SDValue(); 8968} 8969 8970static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, 8971 SelectionDAG &DAG) { 8972 SDLoc DL(Op); 8973 MVT VT = Op.getSimpleValueType(); 8974 SDValue In = Op.getOperand(0); 8975 MVT SVT = In.getSimpleValueType(); 8976 8977 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) 8978 return LowerZERO_EXTEND_AVX512(Op, DAG); 8979 8980 if (Subtarget->hasFp256()) { 8981 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); 8982 if (Res.getNode()) 8983 return Res; 8984 } 8985 8986 assert(!VT.is256BitVector() || !SVT.is128BitVector() || 8987 VT.getVectorNumElements() != SVT.getVectorNumElements()); 8988 return SDValue(); 8989} 8990 8991SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { 8992 SDLoc DL(Op); 8993 MVT VT = Op.getSimpleValueType(); 8994 SDValue In = Op.getOperand(0); 8995 MVT InVT = In.getSimpleValueType(); 8996 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && 8997 "Invalid TRUNCATE operation"); 8998 8999 if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { 9000 if (VT.getVectorElementType().getSizeInBits() >=8) 9001 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); 9002 9003 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); 9004 unsigned NumElts = InVT.getVectorNumElements(); 9005 assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); 9006 if (InVT.getSizeInBits() < 512) { 9007 MVT ExtVT = (NumElts == 16)? 
MVT::v16i32 : MVT::v8i64; 9008 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); 9009 InVT = ExtVT; 9010 } 9011 SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); 9012 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); 9013 SDValue CP = DAG.getConstantPool(C, getPointerTy()); 9014 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 9015 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, 9016 MachinePointerInfo::getConstantPool(), 9017 false, false, false, Alignment); 9018 SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld); 9019 SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); 9020 return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); 9021 } 9022 9023 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { 9024 // On AVX2, v4i64 -> v4i32 becomes VPERMD. 9025 if (Subtarget->hasInt256()) { 9026 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; 9027 In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In); 9028 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), 9029 ShufMask); 9030 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, 9031 DAG.getIntPtrConstant(0)); 9032 } 9033 9034 // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS. 9035 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 9036 DAG.getIntPtrConstant(0)); 9037 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 9038 DAG.getIntPtrConstant(2)); 9039 9040 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); 9041 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); 9042 9043 // The PSHUFD mask: 9044 static const int ShufMask1[] = {0, 2, 0, 0}; 9045 SDValue Undef = DAG.getUNDEF(VT); 9046 OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1); 9047 OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1); 9048 9049 // The MOVLHPS mask: 9050 static const int ShufMask2[] = {0, 1, 4, 5}; 9051 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2); 9052 } 9053 9054 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { 9055 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
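// Each 128-bit lane keeps bytes {0,1,4,5,8,9,12,13}, the low halves of its
// four dwords, while the 0x80 selectors zero the rest; the v4i64 shuffle
// that follows packs the two lane results into the low 128 bits.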
9056 if (Subtarget->hasInt256()) { 9057 In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In); 9058 9059 SmallVector<SDValue,32> pshufbMask; 9060 for (unsigned i = 0; i < 2; ++i) { 9061 pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); 9062 pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); 9063 pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); 9064 pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); 9065 pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); 9066 pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); 9067 pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); 9068 pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); 9069 for (unsigned j = 0; j < 8; ++j) 9070 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 9071 } 9072 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, 9073 &pshufbMask[0], 32); 9074 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); 9075 In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); 9076 9077 static const int ShufMask[] = {0, 2, -1, -1}; 9078 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), 9079 &ShufMask[0]); 9080 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 9081 DAG.getIntPtrConstant(0)); 9082 return DAG.getNode(ISD::BITCAST, DL, VT, In); 9083 } 9084 9085 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 9086 DAG.getIntPtrConstant(0)); 9087 9088 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 9089 DAG.getIntPtrConstant(4)); 9090 9091 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo); 9092 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi); 9093 9094 // The PSHUFB mask: 9095 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, 9096 -1, -1, -1, -1, -1, -1, -1, -1}; 9097 9098 SDValue Undef = DAG.getUNDEF(MVT::v16i8); 9099 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); 9100 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); 9101 9102 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); 9103 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); 9104 9105 // The MOVLHPS Mask: 9106 static const int ShufMask2[] = {0, 1, 4, 5}; 9107 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); 9108 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res); 9109 } 9110 9111 // Handle truncation of V256 to V128 using shuffles. 
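  // Illustrative example of this generic path (assuming an element count of
  // 4): view the 256-bit input as a vector with twice as many narrow
  // elements, pick every even-indexed element (the low half of each wide
  // element on a little-endian layout), and keep the low 128 bits:
  //   In as NVT:  <a0 a1 b0 b1 c0 c1 d0 d1>
  //   MaskVec:    {0, 2, 4, 6, -1, -1, -1, -1}
  //   result:     <a0 b0 c0 d0>  (low half of the shuffle)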
9112 if (!VT.is128BitVector() || !InVT.is256BitVector()) 9113 return SDValue(); 9114 9115 assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); 9116 9117 unsigned NumElems = VT.getVectorNumElements(); 9118 EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 9119 NumElems * 2); 9120 9121 SmallVector<int, 16> MaskVec(NumElems * 2, -1); 9122 // Prepare truncation shuffle mask 9123 for (unsigned i = 0; i != NumElems; ++i) 9124 MaskVec[i] = i * 2; 9125 SDValue V = DAG.getVectorShuffle(NVT, DL, 9126 DAG.getNode(ISD::BITCAST, DL, NVT, In), 9127 DAG.getUNDEF(NVT), &MaskVec[0]); 9128 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, 9129 DAG.getIntPtrConstant(0)); 9130} 9131 9132SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 9133 SelectionDAG &DAG) const { 9134 MVT VT = Op.getSimpleValueType(); 9135 if (VT.isVector()) { 9136 if (VT == MVT::v8i16) 9137 return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT, 9138 DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op), 9139 MVT::v8i32, Op.getOperand(0))); 9140 return SDValue(); 9141 } 9142 9143 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 9144 /*IsSigned=*/ true, /*IsReplace=*/ false); 9145 SDValue FIST = Vals.first, StackSlot = Vals.second; 9146 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 9147 if (FIST.getNode() == 0) return Op; 9148 9149 if (StackSlot.getNode()) 9150 // Load the result. 9151 return DAG.getLoad(Op.getValueType(), SDLoc(Op), 9152 FIST, StackSlot, MachinePointerInfo(), 9153 false, false, false, 0); 9154 9155 // The node is the result. 9156 return FIST; 9157} 9158 9159SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 9160 SelectionDAG &DAG) const { 9161 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 9162 /*IsSigned=*/ false, /*IsReplace=*/ false); 9163 SDValue FIST = Vals.first, StackSlot = Vals.second; 9164 assert(FIST.getNode() && "Unexpected failure"); 9165 9166 if (StackSlot.getNode()) 9167 // Load the result. 9168 return DAG.getLoad(Op.getValueType(), SDLoc(Op), 9169 FIST, StackSlot, MachinePointerInfo(), 9170 false, false, false, 0); 9171 9172 // The node is the result. 9173 return FIST; 9174} 9175 9176static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { 9177 SDLoc DL(Op); 9178 MVT VT = Op.getSimpleValueType(); 9179 SDValue In = Op.getOperand(0); 9180 MVT SVT = In.getSimpleValueType(); 9181 9182 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); 9183 9184 return DAG.getNode(X86ISD::VFPEXT, DL, VT, 9185 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, 9186 In, DAG.getUNDEF(SVT))); 9187} 9188 9189SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { 9190 LLVMContext *Context = DAG.getContext(); 9191 SDLoc dl(Op); 9192 MVT VT = Op.getSimpleValueType(); 9193 MVT EltVT = VT; 9194 unsigned NumElts = VT == MVT::f64 ? 
2 : 4; 9195 if (VT.isVector()) { 9196 EltVT = VT.getVectorElementType(); 9197 NumElts = VT.getVectorNumElements(); 9198 } 9199 Constant *C; 9200 if (EltVT == MVT::f64) 9201 C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, 9202 APInt(64, ~(1ULL << 63)))); 9203 else 9204 C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, 9205 APInt(32, ~(1U << 31)))); 9206 C = ConstantVector::getSplat(NumElts, C); 9207 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); 9208 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); 9209 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9210 MachinePointerInfo::getConstantPool(), 9211 false, false, false, Alignment); 9212 if (VT.isVector()) { 9213 MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; 9214 return DAG.getNode(ISD::BITCAST, dl, VT, 9215 DAG.getNode(ISD::AND, dl, ANDVT, 9216 DAG.getNode(ISD::BITCAST, dl, ANDVT, 9217 Op.getOperand(0)), 9218 DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask))); 9219 } 9220 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 9221} 9222 9223SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 9224 LLVMContext *Context = DAG.getContext(); 9225 SDLoc dl(Op); 9226 MVT VT = Op.getSimpleValueType(); 9227 MVT EltVT = VT; 9228 unsigned NumElts = VT == MVT::f64 ? 2 : 4; 9229 if (VT.isVector()) { 9230 EltVT = VT.getVectorElementType(); 9231 NumElts = VT.getVectorNumElements(); 9232 } 9233 Constant *C; 9234 if (EltVT == MVT::f64) 9235 C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, 9236 APInt(64, 1ULL << 63))); 9237 else 9238 C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, 9239 APInt(32, 1U << 31))); 9240 C = ConstantVector::getSplat(NumElts, C); 9241 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); 9242 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); 9243 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9244 MachinePointerInfo::getConstantPool(), 9245 false, false, false, Alignment); 9246 if (VT.isVector()) { 9247 MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64); 9248 return DAG.getNode(ISD::BITCAST, dl, VT, 9249 DAG.getNode(ISD::XOR, dl, XORVT, 9250 DAG.getNode(ISD::BITCAST, dl, XORVT, 9251 Op.getOperand(0)), 9252 DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); 9253 } 9254 9255 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 9256} 9257 9258SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 9259 LLVMContext *Context = DAG.getContext(); 9260 SDValue Op0 = Op.getOperand(0); 9261 SDValue Op1 = Op.getOperand(1); 9262 SDLoc dl(Op); 9263 MVT VT = Op.getSimpleValueType(); 9264 MVT SrcVT = Op1.getSimpleValueType(); 9265 9266 // If second operand is smaller, extend it first. 9267 if (SrcVT.bitsLT(VT)) { 9268 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 9269 SrcVT = VT; 9270 } 9271 // And if it is bigger, shrink it first. 9272 if (SrcVT.bitsGT(VT)) { 9273 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 9274 SrcVT = VT; 9275 } 9276 9277 // At this point the operands and the result should have the same 9278 // type, and that won't be f80 since that is not custom lowered. 9279 9280 // First get the sign bit of second operand. 
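  // In scalar terms, the whole lowering computes roughly the following
  // (illustrative sketch, not the code below verbatim):
  //   uint64_t SignMask = 1ULL << 63;                      // f64 case
  //   ResultBits = (Bits(Op0) & ~SignMask) | (Bits(Op1) & SignMask);
  // The constant-pool loads below provide SignMask and ~SignMask as FP-typed
  // vectors so the bit operations can be issued as FAND/FOR.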
9281 SmallVector<Constant*,4> CV; 9282 if (SrcVT == MVT::f64) { 9283 const fltSemantics &Sem = APFloat::IEEEdouble; 9284 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63)))); 9285 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); 9286 } else { 9287 const fltSemantics &Sem = APFloat::IEEEsingle; 9288 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31)))); 9289 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 9290 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 9291 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 9292 } 9293 Constant *C = ConstantVector::get(CV); 9294 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9295 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 9296 MachinePointerInfo::getConstantPool(), 9297 false, false, false, 16); 9298 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 9299 9300 // Shift sign bit right or left if the two operands have different types. 9301 if (SrcVT.bitsGT(VT)) { 9302 // Op0 is MVT::f32, Op1 is MVT::f64. 9303 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 9304 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 9305 DAG.getConstant(32, MVT::i32)); 9306 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 9307 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 9308 DAG.getIntPtrConstant(0)); 9309 } 9310 9311 // Clear first operand sign bit. 9312 CV.clear(); 9313 if (VT == MVT::f64) { 9314 const fltSemantics &Sem = APFloat::IEEEdouble; 9315 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, 9316 APInt(64, ~(1ULL << 63))))); 9317 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); 9318 } else { 9319 const fltSemantics &Sem = APFloat::IEEEsingle; 9320 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, 9321 APInt(32, ~(1U << 31))))); 9322 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 9323 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 9324 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 9325 } 9326 C = ConstantVector::get(CV); 9327 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9328 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9329 MachinePointerInfo::getConstantPool(), 9330 false, false, false, 16); 9331 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 9332 9333 // Or the value with the sign bit. 9334 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 9335} 9336 9337static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { 9338 SDValue N0 = Op.getOperand(0); 9339 SDLoc dl(Op); 9340 MVT VT = Op.getSimpleValueType(); 9341 9342 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 9343 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 9344 DAG.getConstant(1, VT)); 9345 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 9346} 9347 9348// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. 
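// A typical input (illustrative example, not from the original source) is a
// test of a wide integer for zero after a bitcast, e.g.
//   %b = bitcast <2 x i64> %v to i128
//   %c = icmp eq i128 %b, 0
// which legalization expands into an OR tree over EXTRACT_VECTOR_ELTs; when
// every element of each source vector is consumed exactly once, the whole
// tree can be replaced by a single PTEST of the (OR'd) source vectors.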
9349// 9350static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, 9351 SelectionDAG &DAG) { 9352 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); 9353 9354 if (!Subtarget->hasSSE41()) 9355 return SDValue(); 9356 9357 if (!Op->hasOneUse()) 9358 return SDValue(); 9359 9360 SDNode *N = Op.getNode(); 9361 SDLoc DL(N); 9362 9363 SmallVector<SDValue, 8> Opnds; 9364 DenseMap<SDValue, unsigned> VecInMap; 9365 EVT VT = MVT::Other; 9366 9367 // Recognize a special case where a vector is casted into wide integer to 9368 // test all 0s. 9369 Opnds.push_back(N->getOperand(0)); 9370 Opnds.push_back(N->getOperand(1)); 9371 9372 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { 9373 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; 9374 // BFS traverse all OR'd operands. 9375 if (I->getOpcode() == ISD::OR) { 9376 Opnds.push_back(I->getOperand(0)); 9377 Opnds.push_back(I->getOperand(1)); 9378 // Re-evaluate the number of nodes to be traversed. 9379 e += 2; // 2 more nodes (LHS and RHS) are pushed. 9380 continue; 9381 } 9382 9383 // Quit if a non-EXTRACT_VECTOR_ELT 9384 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 9385 return SDValue(); 9386 9387 // Quit if without a constant index. 9388 SDValue Idx = I->getOperand(1); 9389 if (!isa<ConstantSDNode>(Idx)) 9390 return SDValue(); 9391 9392 SDValue ExtractedFromVec = I->getOperand(0); 9393 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); 9394 if (M == VecInMap.end()) { 9395 VT = ExtractedFromVec.getValueType(); 9396 // Quit if not 128/256-bit vector. 9397 if (!VT.is128BitVector() && !VT.is256BitVector()) 9398 return SDValue(); 9399 // Quit if not the same type. 9400 if (VecInMap.begin() != VecInMap.end() && 9401 VT != VecInMap.begin()->first.getValueType()) 9402 return SDValue(); 9403 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; 9404 } 9405 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); 9406 } 9407 9408 assert((VT.is128BitVector() || VT.is256BitVector()) && 9409 "Not extracted from 128-/256-bit vector."); 9410 9411 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; 9412 SmallVector<SDValue, 8> VecIns; 9413 9414 for (DenseMap<SDValue, unsigned>::const_iterator 9415 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { 9416 // Quit if not all elements are used. 9417 if (I->second != FullMask) 9418 return SDValue(); 9419 VecIns.push_back(I->first); 9420 } 9421 9422 EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; 9423 9424 // Cast all vectors into TestVT for PTEST. 9425 for (unsigned i = 0, e = VecIns.size(); i < e; ++i) 9426 VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]); 9427 9428 // If more than one full vectors are evaluated, OR them first before PTEST. 9429 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { 9430 // Each iteration will OR 2 nodes and append the result until there is only 9431 // 1 node left, i.e. the final OR'd value of all vectors. 9432 SDValue LHS = VecIns[Slot]; 9433 SDValue RHS = VecIns[Slot + 1]; 9434 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); 9435 } 9436 9437 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, 9438 VecIns.back(), VecIns.back()); 9439} 9440 9441/// Emit nodes that will be selected as "test Op0,Op0", or something 9442/// equivalent. 9443SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 9444 SelectionDAG &DAG) const { 9445 SDLoc dl(Op); 9446 9447 // CF and OF aren't always set the way we want. 
Determine which 9448 // of these we need. 9449 bool NeedCF = false; 9450 bool NeedOF = false; 9451 switch (X86CC) { 9452 default: break; 9453 case X86::COND_A: case X86::COND_AE: 9454 case X86::COND_B: case X86::COND_BE: 9455 NeedCF = true; 9456 break; 9457 case X86::COND_G: case X86::COND_GE: 9458 case X86::COND_L: case X86::COND_LE: 9459 case X86::COND_O: case X86::COND_NO: 9460 NeedOF = true; 9461 break; 9462 } 9463 9464 // See if we can use the EFLAGS value from the operand instead of 9465 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 9466 // we prove that the arithmetic won't overflow, we can't use OF or CF. 9467 if (Op.getResNo() != 0 || NeedOF || NeedCF) 9468 // Emit a CMP with 0, which is the TEST pattern. 9469 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 9470 DAG.getConstant(0, Op.getValueType())); 9471 9472 unsigned Opcode = 0; 9473 unsigned NumOperands = 0; 9474 9475 // Truncate operations may prevent the merge of the SETCC instruction 9476 // and the arithmetic instruction before it. Attempt to truncate the operands 9477 // of the arithmetic instruction and use a reduced bit-width instruction. 9478 bool NeedTruncation = false; 9479 SDValue ArithOp = Op; 9480 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { 9481 SDValue Arith = Op->getOperand(0); 9482 // Both the trunc and the arithmetic op need to have one user each. 9483 if (Arith->hasOneUse()) 9484 switch (Arith.getOpcode()) { 9485 default: break; 9486 case ISD::ADD: 9487 case ISD::SUB: 9488 case ISD::AND: 9489 case ISD::OR: 9490 case ISD::XOR: { 9491 NeedTruncation = true; 9492 ArithOp = Arith; 9493 } 9494 } 9495 } 9496 9497 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation 9498 // which may be the result of a CAST. We use the variable 'Op', which is the 9499 // non-casted variable when we check for possible users. 9500 switch (ArithOp.getOpcode()) { 9501 case ISD::ADD: 9502 // Due to an isel shortcoming, be conservative if this add is likely to be 9503 // selected as part of a load-modify-store instruction. When the root node 9504 // in a match is a store, isel doesn't know how to remap non-chain non-flag 9505 // uses of other nodes in the match, such as the ADD in this case. This 9506 // leads to the ADD being left around and reselected, with the result being 9507 // two adds in the output. Alas, even if none our users are stores, that 9508 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 9509 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 9510 // climbing the DAG back to the root, and it doesn't seem to be worth the 9511 // effort. 9512 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 9513 UE = Op.getNode()->use_end(); UI != UE; ++UI) 9514 if (UI->getOpcode() != ISD::CopyToReg && 9515 UI->getOpcode() != ISD::SETCC && 9516 UI->getOpcode() != ISD::STORE) 9517 goto default_case; 9518 9519 if (ConstantSDNode *C = 9520 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { 9521 // An add of one will be selected as an INC. 9522 if (C->getAPIntValue() == 1) { 9523 Opcode = X86ISD::INC; 9524 NumOperands = 1; 9525 break; 9526 } 9527 9528 // An add of negative one (subtract of one) will be selected as a DEC. 9529 if (C->getAPIntValue().isAllOnesValue()) { 9530 Opcode = X86ISD::DEC; 9531 NumOperands = 1; 9532 break; 9533 } 9534 } 9535 9536 // Otherwise use a regular EFLAGS-setting add. 
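    // The INC/DEC cases above let code such as "if (--x != 0)" select to
    // "dec %reg; jne" with no separate compare (illustrative assembly, not
    // emitted here verbatim); this fallback covers the remaining adds, whose
    // EFLAGS results are reused the same way.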
9537     Opcode = X86ISD::ADD;
9538     NumOperands = 2;
9539     break;
9540   case ISD::AND: {
9541     // If the primary result of the 'and' isn't used, don't bother using
9542     // X86ISD::AND, because a TEST instruction will be better.
9543     bool NonFlagUse = false;
9544     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9545            UE = Op.getNode()->use_end(); UI != UE; ++UI) {
9546       SDNode *User = *UI;
9547       unsigned UOpNo = UI.getOperandNo();
9548       if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
9549         // Look past the truncate.
9550         UOpNo = User->use_begin().getOperandNo();
9551         User = *User->use_begin();
9552       }
9553
9554       if (User->getOpcode() != ISD::BRCOND &&
9555           User->getOpcode() != ISD::SETCC &&
9556           !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
9557         NonFlagUse = true;
9558         break;
9559       }
9560     }
9561
9562     if (!NonFlagUse)
9563       break;
9564   }
9565     // FALL THROUGH
9566   case ISD::SUB:
9567   case ISD::OR:
9568   case ISD::XOR:
9569     // Due to the ISEL shortcoming noted above, be conservative if this op is
9570     // likely to be selected as part of a load-modify-store instruction.
9571     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9572            UE = Op.getNode()->use_end(); UI != UE; ++UI)
9573       if (UI->getOpcode() == ISD::STORE)
9574         goto default_case;
9575
9576     // Otherwise use a regular EFLAGS-setting instruction.
9577     switch (ArithOp.getOpcode()) {
9578     default: llvm_unreachable("unexpected operator!");
9579     case ISD::SUB: Opcode = X86ISD::SUB; break;
9580     case ISD::XOR: Opcode = X86ISD::XOR; break;
9581     case ISD::AND: Opcode = X86ISD::AND; break;
9582     case ISD::OR: {
9583       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
9584         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
9585         if (EFLAGS.getNode())
9586           return EFLAGS;
9587       }
9588       Opcode = X86ISD::OR;
9589       break;
9590     }
9591     }
9592
9593     NumOperands = 2;
9594     break;
9595   case X86ISD::ADD:
9596   case X86ISD::SUB:
9597   case X86ISD::INC:
9598   case X86ISD::DEC:
9599   case X86ISD::OR:
9600   case X86ISD::XOR:
9601   case X86ISD::AND:
9602     return SDValue(Op.getNode(), 1);
9603   default:
9604   default_case:
9605     break;
9606   }
9607
9608   // If we found that truncation is beneficial, perform the truncation and
9609   // update 'Op'.
9610   if (NeedTruncation) {
9611     EVT VT = Op.getValueType();
9612     SDValue WideVal = Op->getOperand(0);
9613     EVT WideVT = WideVal.getValueType();
9614     unsigned ConvertedOp = 0;
9615     // Use a target machine opcode to prevent further DAGCombine
9616     // optimizations that may separate the arithmetic operations
9617     // from the setcc node.
9618     switch (WideVal.getOpcode()) {
9619     default: break;
9620     case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
9621     case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
9622     case ISD::AND: ConvertedOp = X86ISD::AND; break;
9623     case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
9624     case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
9625     }
9626
9627     if (ConvertedOp) {
9628       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9629       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
9630         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
9631         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
9632         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
9633       }
9634     }
9635   }
9636
9637   if (Opcode == 0)
9638     // Emit a CMP with 0, which is the TEST pattern.
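    // (Illustrative note: instruction selection canonicalizes this
    // compare-with-zero to a TEST, e.g. "testl %eax, %eax" rather than
    // "cmpl $0, %eax", which sets the same flags in a shorter encoding.)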
9639 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 9640 DAG.getConstant(0, Op.getValueType())); 9641 9642 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 9643 SmallVector<SDValue, 4> Ops; 9644 for (unsigned i = 0; i != NumOperands; ++i) 9645 Ops.push_back(Op.getOperand(i)); 9646 9647 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 9648 DAG.ReplaceAllUsesWith(Op, New); 9649 return SDValue(New.getNode(), 1); 9650} 9651 9652/// Emit nodes that will be selected as "cmp Op0,Op1", or something 9653/// equivalent. 9654SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 9655 SelectionDAG &DAG) const { 9656 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 9657 if (C->getAPIntValue() == 0) 9658 return EmitTest(Op0, X86CC, DAG); 9659 9660 SDLoc dl(Op0); 9661 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || 9662 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { 9663 // Use SUB instead of CMP to enable CSE between SUB and CMP. 9664 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); 9665 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, 9666 Op0, Op1); 9667 return SDValue(Sub.getNode(), 1); 9668 } 9669 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 9670} 9671 9672/// Convert a comparison if required by the subtarget. 9673SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, 9674 SelectionDAG &DAG) const { 9675 // If the subtarget does not support the FUCOMI instruction, floating-point 9676 // comparisons have to be converted. 9677 if (Subtarget->hasCMov() || 9678 Cmp.getOpcode() != X86ISD::CMP || 9679 !Cmp.getOperand(0).getValueType().isFloatingPoint() || 9680 !Cmp.getOperand(1).getValueType().isFloatingPoint()) 9681 return Cmp; 9682 9683 // The instruction selector will select an FUCOM instruction instead of 9684 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence 9685 // build an SDNode sequence that transfers the result from FPSW into EFLAGS: 9686 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) 9687 SDLoc dl(Cmp); 9688 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); 9689 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); 9690 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, 9691 DAG.getConstant(8, MVT::i8)); 9692 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); 9693 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); 9694} 9695 9696static bool isAllOnes(SDValue V) { 9697 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 9698 return C && C->isAllOnesValue(); 9699} 9700 9701/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 9702/// if it's possible. 9703SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 9704 SDLoc dl, SelectionDAG &DAG) const { 9705 SDValue Op0 = And.getOperand(0); 9706 SDValue Op1 = And.getOperand(1); 9707 if (Op0.getOpcode() == ISD::TRUNCATE) 9708 Op0 = Op0.getOperand(0); 9709 if (Op1.getOpcode() == ISD::TRUNCATE) 9710 Op1 = Op1.getOperand(0); 9711 9712 SDValue LHS, RHS; 9713 if (Op1.getOpcode() == ISD::SHL) 9714 std::swap(Op0, Op1); 9715 if (Op0.getOpcode() == ISD::SHL) { 9716 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 9717 if (And00C->getZExtValue() == 1) { 9718 // If we looked past a truncate, check that it's only truncating away 9719 // known zeros. 
9720         unsigned BitWidth = Op0.getValueSizeInBits();
9721         unsigned AndBitWidth = And.getValueSizeInBits();
9722         if (BitWidth > AndBitWidth) {
9723           APInt Zeros, Ones;
9724           DAG.ComputeMaskedBits(Op0, Zeros, Ones);
9725           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
9726             return SDValue();
9727         }
9728         LHS = Op1;
9729         RHS = Op0.getOperand(1);
9730       }
9731   } else if (Op1.getOpcode() == ISD::Constant) {
9732     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
9733     uint64_t AndRHSVal = AndRHS->getZExtValue();
9734     SDValue AndLHS = Op0;
9735
9736     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
9737       LHS = AndLHS.getOperand(0);
9738       RHS = AndLHS.getOperand(1);
9739     }
9740
9741     // Use BT if the immediate can't be encoded in a TEST instruction.
9742     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
9743       LHS = AndLHS;
9744       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
9745     }
9746   }
9747
9748   if (LHS.getNode()) {
9749     // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
9750     // instruction. Since the shift amount is in-range-or-undefined, we know
9751     // that doing a bittest on the i32 value is ok. We extend to i32 because
9752     // the encoding for the i16 version is larger than the i32 version.
9753     // Also promote i16 to i32 for performance / code size reasons.
9754     if (LHS.getValueType() == MVT::i8 ||
9755         LHS.getValueType() == MVT::i16)
9756       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
9757
9758     // If the operand types disagree, extend the shift amount to match. Since
9759     // BT ignores high bits (like shifts) we can use anyextend.
9760     if (LHS.getValueType() != RHS.getValueType())
9761       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
9762
9763     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
9764     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
9765     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9766                        DAG.getConstant(Cond, MVT::i8), BT);
9767   }
9768
9769   return SDValue();
9770 }
9771
9772 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
9773 /// mask CMPs.
9774 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
9775                               SDValue &Op1) {
9776   unsigned SSECC;
9777   bool Swap = false;
9778
9779   // SSE Condition code mapping:
9780   //  0 - EQ
9781   //  1 - LT
9782   //  2 - LE
9783   //  3 - UNORD
9784   //  4 - NEQ
9785   //  5 - NLT
9786   //  6 - NLE
9787   //  7 - ORD
9788   switch (SetCCOpcode) {
9789   default: llvm_unreachable("Unexpected SETCC condition");
9790   case ISD::SETOEQ:
9791   case ISD::SETEQ:  SSECC = 0; break;
9792   case ISD::SETOGT:
9793   case ISD::SETGT:  Swap = true; // Fallthrough
9794   case ISD::SETLT:
9795   case ISD::SETOLT: SSECC = 1; break;
9796   case ISD::SETOGE:
9797   case ISD::SETGE:  Swap = true; // Fallthrough
9798   case ISD::SETLE:
9799   case ISD::SETOLE: SSECC = 2; break;
9800   case ISD::SETUO:  SSECC = 3; break;
9801   case ISD::SETUNE:
9802   case ISD::SETNE:  SSECC = 4; break;
9803   case ISD::SETULE: Swap = true; // Fallthrough
9804   case ISD::SETUGE: SSECC = 5; break;
9805   case ISD::SETULT: Swap = true; // Fallthrough
9806   case ISD::SETUGT: SSECC = 6; break;
9807   case ISD::SETO:   SSECC = 7; break;
9808   case ISD::SETUEQ:
9809   case ISD::SETONE: SSECC = 8; break;
9810   }
9811   if (Swap)
9812     std::swap(Op0, Op1);
9813
9814   return SSECC;
9815 }
9816
9817 // Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
9818 // ones, and then concatenate the result back.
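// For example (illustrative): a <32 x i8> SETCC on a target without AVX2
// integer operations is lowered as two <16 x i8> SETCCs on the extracted
// 128-bit halves of each operand, joined back with CONCAT_VECTORS.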
9819static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { 9820 MVT VT = Op.getSimpleValueType(); 9821 9822 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && 9823 "Unsupported value type for operation"); 9824 9825 unsigned NumElems = VT.getVectorNumElements(); 9826 SDLoc dl(Op); 9827 SDValue CC = Op.getOperand(2); 9828 9829 // Extract the LHS vectors 9830 SDValue LHS = Op.getOperand(0); 9831 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 9832 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 9833 9834 // Extract the RHS vectors 9835 SDValue RHS = Op.getOperand(1); 9836 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 9837 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 9838 9839 // Issue the operation on the smaller types and concatenate the result back 9840 MVT EltVT = VT.getVectorElementType(); 9841 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 9842 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 9843 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), 9844 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); 9845} 9846 9847static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { 9848 SDValue Op0 = Op.getOperand(0); 9849 SDValue Op1 = Op.getOperand(1); 9850 SDValue CC = Op.getOperand(2); 9851 MVT VT = Op.getSimpleValueType(); 9852 9853 assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 && 9854 Op.getValueType().getScalarType() == MVT::i1 && 9855 "Cannot set masked compare for this operation"); 9856 9857 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 9858 SDLoc dl(Op); 9859 9860 bool Unsigned = false; 9861 unsigned SSECC; 9862 switch (SetCCOpcode) { 9863 default: llvm_unreachable("Unexpected SETCC condition"); 9864 case ISD::SETNE: SSECC = 4; break; 9865 case ISD::SETEQ: SSECC = 0; break; 9866 case ISD::SETUGT: Unsigned = true; 9867 case ISD::SETGT: SSECC = 6; break; // NLE 9868 case ISD::SETULT: Unsigned = true; 9869 case ISD::SETLT: SSECC = 1; break; 9870 case ISD::SETUGE: Unsigned = true; 9871 case ISD::SETGE: SSECC = 5; break; // NLT 9872 case ISD::SETULE: Unsigned = true; 9873 case ISD::SETLE: SSECC = 2; break; 9874 } 9875 unsigned Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; 9876 return DAG.getNode(Opc, dl, VT, Op0, Op1, 9877 DAG.getConstant(SSECC, MVT::i8)); 9878 9879} 9880 9881static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, 9882 SelectionDAG &DAG) { 9883 SDValue Op0 = Op.getOperand(0); 9884 SDValue Op1 = Op.getOperand(1); 9885 SDValue CC = Op.getOperand(2); 9886 MVT VT = Op.getSimpleValueType(); 9887 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 9888 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); 9889 SDLoc dl(Op); 9890 9891 if (isFP) { 9892#ifndef NDEBUG 9893 MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); 9894 assert(EltVT == MVT::f32 || EltVT == MVT::f64); 9895#endif 9896 9897 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); 9898 unsigned Opc = X86ISD::CMPP; 9899 if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) { 9900 assert(VT.getVectorNumElements() <= 16); 9901 Opc = X86ISD::CMPM; 9902 } 9903 // In the two special cases we can't handle, emit two comparisons. 
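    // Concretely (illustrative): SSE/AVX have no single predicate for SETUEQ
    // or SETONE, so they decompose as
    //   ueq(a,b) = unord(a,b) | eq(a,b)    // CMPP imm 3, CMPP imm 0, then OR
    //   one(a,b) = ord(a,b) & neq(a,b)     // CMPP imm 7, CMPP imm 4, then AND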
9904 if (SSECC == 8) { 9905 unsigned CC0, CC1; 9906 unsigned CombineOpc; 9907 if (SetCCOpcode == ISD::SETUEQ) { 9908 CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; 9909 } else { 9910 assert(SetCCOpcode == ISD::SETONE); 9911 CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; 9912 } 9913 9914 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, 9915 DAG.getConstant(CC0, MVT::i8)); 9916 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, 9917 DAG.getConstant(CC1, MVT::i8)); 9918 return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); 9919 } 9920 // Handle all other FP comparisons here. 9921 return DAG.getNode(Opc, dl, VT, Op0, Op1, 9922 DAG.getConstant(SSECC, MVT::i8)); 9923 } 9924 9925 // Break 256-bit integer vector compare into smaller ones. 9926 if (VT.is256BitVector() && !Subtarget->hasInt256()) 9927 return Lower256IntVSETCC(Op, DAG); 9928 9929 bool MaskResult = (VT.getVectorElementType() == MVT::i1); 9930 EVT OpVT = Op1.getValueType(); 9931 if (Subtarget->hasAVX512()) { 9932 if (Op1.getValueType().is512BitVector() || 9933 (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) 9934 return LowerIntVSETCC_AVX512(Op, DAG); 9935 9936 // In AVX-512 architecture setcc returns mask with i1 elements, 9937 // But there is no compare instruction for i8 and i16 elements. 9938 // We are not talking about 512-bit operands in this case, these 9939 // types are illegal. 9940 if (MaskResult && 9941 (OpVT.getVectorElementType().getSizeInBits() < 32 && 9942 OpVT.getVectorElementType().getSizeInBits() >= 8)) 9943 return DAG.getNode(ISD::TRUNCATE, dl, VT, 9944 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); 9945 } 9946 9947 // We are handling one of the integer comparisons here. Since SSE only has 9948 // GT and EQ comparisons for integer, swapping operands and multiple 9949 // operations may be required for some comparisons. 9950 unsigned Opc; 9951 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; 9952 9953 switch (SetCCOpcode) { 9954 default: llvm_unreachable("Unexpected SETCC condition"); 9955 case ISD::SETNE: Invert = true; 9956 case ISD::SETEQ: Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break; 9957 case ISD::SETLT: Swap = true; 9958 case ISD::SETGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break; 9959 case ISD::SETGE: Swap = true; 9960 case ISD::SETLE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; 9961 Invert = true; break; 9962 case ISD::SETULT: Swap = true; 9963 case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; 9964 FlipSigns = true; break; 9965 case ISD::SETUGE: Swap = true; 9966 case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; 9967 FlipSigns = true; Invert = true; break; 9968 } 9969 9970 // Special case: Use min/max operations for SETULE/SETUGE 9971 MVT VET = VT.getVectorElementType(); 9972 bool hasMinMax = 9973 (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) 9974 || (Subtarget->hasSSE2() && (VET == MVT::i8)); 9975 9976 if (hasMinMax) { 9977 switch (SetCCOpcode) { 9978 default: break; 9979 case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break; 9980 case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break; 9981 } 9982 9983 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } 9984 } 9985 9986 if (Swap) 9987 std::swap(Op0, Op1); 9988 9989 // Check that the operation in question is available (most are plain SSE2, 9990 // but PCMPGTQ and PCMPEQQ have different requirements). 
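  // The FlipSigns handling below relies on the scalar identity
  // (illustrative):
  //   (uint32_t)a < (uint32_t)b
  //     <=>  (int32_t)(a ^ 0x80000000) < (int32_t)(b ^ 0x80000000)
  // i.e. XORing the sign bits turns the unsigned ordering into the signed
  // ordering that PCMPGT implements.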
9991 if (VT == MVT::v2i64) { 9992 if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { 9993 assert(Subtarget->hasSSE2() && "Don't know how to lower!"); 9994 9995 // First cast everything to the right type. 9996 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); 9997 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); 9998 9999 // Since SSE has no unsigned integer comparisons, we need to flip the sign 10000 // bits of the inputs before performing those operations. The lower 10001 // compare is always unsigned. 10002 SDValue SB; 10003 if (FlipSigns) { 10004 SB = DAG.getConstant(0x80000000U, MVT::v4i32); 10005 } else { 10006 SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32); 10007 SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32); 10008 SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, 10009 Sign, Zero, Sign, Zero); 10010 } 10011 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); 10012 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); 10013 10014 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) 10015 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); 10016 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); 10017 10018 // Create masks for only the low parts/high parts of the 64 bit integers. 10019 static const int MaskHi[] = { 1, 1, 3, 3 }; 10020 static const int MaskLo[] = { 0, 0, 2, 2 }; 10021 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); 10022 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); 10023 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); 10024 10025 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); 10026 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); 10027 10028 if (Invert) 10029 Result = DAG.getNOT(dl, Result, MVT::v4i32); 10030 10031 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 10032 } 10033 10034 if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { 10035 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with 10036 // pcmpeqd + pshufd + pand. 10037 assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); 10038 10039 // First cast everything to the right type. 10040 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); 10041 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); 10042 10043 // Do the compare. 10044 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); 10045 10046 // Make sure the lower and upper halves are both all-ones. 10047 static const int Mask[] = { 1, 0, 3, 2 }; 10048 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); 10049 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); 10050 10051 if (Invert) 10052 Result = DAG.getNOT(dl, Result, MVT::v4i32); 10053 10054 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 10055 } 10056 } 10057 10058 // Since SSE has no unsigned integer comparisons, we need to flip the sign 10059 // bits of the inputs before performing those operations. 10060 if (FlipSigns) { 10061 EVT EltVT = VT.getVectorElementType(); 10062 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT); 10063 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); 10064 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); 10065 } 10066 10067 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 10068 10069 // If the logical-not of the result is required, perform that now. 
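  // E.g. (illustrative) SETLE was mapped above to PCMPGT plus Invert, using
  //   a <= b  <=>  !(a > b)
  // where the NOT is materialized as an XOR with an all-ones vector.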
10070 if (Invert) 10071 Result = DAG.getNOT(dl, Result, VT); 10072 10073 if (MinMax) 10074 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); 10075 10076 return Result; 10077} 10078 10079SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 10080 10081 MVT VT = Op.getSimpleValueType(); 10082 10083 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); 10084 10085 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); 10086 SDValue Op0 = Op.getOperand(0); 10087 SDValue Op1 = Op.getOperand(1); 10088 SDLoc dl(Op); 10089 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 10090 10091 // Optimize to BT if possible. 10092 // Lower (X & (1 << N)) == 0 to BT(X, N). 10093 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 10094 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 10095 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 10096 Op1.getOpcode() == ISD::Constant && 10097 cast<ConstantSDNode>(Op1)->isNullValue() && 10098 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 10099 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 10100 if (NewSetCC.getNode()) 10101 return NewSetCC; 10102 } 10103 10104 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 10105 // these. 10106 if (Op1.getOpcode() == ISD::Constant && 10107 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 10108 cast<ConstantSDNode>(Op1)->isNullValue()) && 10109 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 10110 10111 // If the input is a setcc, then reuse the input setcc or use a new one with 10112 // the inverted condition. 10113 if (Op0.getOpcode() == X86ISD::SETCC) { 10114 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 10115 bool Invert = (CC == ISD::SETNE) ^ 10116 cast<ConstantSDNode>(Op1)->isNullValue(); 10117 if (!Invert) return Op0; 10118 10119 CCode = X86::GetOppositeBranchCondition(CCode); 10120 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 10121 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 10122 } 10123 } 10124 10125 bool isFP = Op1.getSimpleValueType().isFloatingPoint(); 10126 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 10127 if (X86CC == X86::COND_INVALID) 10128 return SDValue(); 10129 10130 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 10131 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); 10132 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 10133 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 10134} 10135 10136// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
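// (Illustrative note: X86 arithmetic nodes such as X86ISD::ADD and
// X86ISD::SUB produce two results, (value, EFLAGS); the getResNo() checks
// below accept the EFLAGS result of such a node as a comparison, so the
// flags of an existing SUB can feed a conditional move or branch directly.)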
10137static bool isX86LogicalCmp(SDValue Op) { 10138 unsigned Opc = Op.getNode()->getOpcode(); 10139 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || 10140 Opc == X86ISD::SAHF) 10141 return true; 10142 if (Op.getResNo() == 1 && 10143 (Opc == X86ISD::ADD || 10144 Opc == X86ISD::SUB || 10145 Opc == X86ISD::ADC || 10146 Opc == X86ISD::SBB || 10147 Opc == X86ISD::SMUL || 10148 Opc == X86ISD::UMUL || 10149 Opc == X86ISD::INC || 10150 Opc == X86ISD::DEC || 10151 Opc == X86ISD::OR || 10152 Opc == X86ISD::XOR || 10153 Opc == X86ISD::AND)) 10154 return true; 10155 10156 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 10157 return true; 10158 10159 return false; 10160} 10161 10162static bool isZero(SDValue V) { 10163 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 10164 return C && C->isNullValue(); 10165} 10166 10167static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { 10168 if (V.getOpcode() != ISD::TRUNCATE) 10169 return false; 10170 10171 SDValue VOp0 = V.getOperand(0); 10172 unsigned InBits = VOp0.getValueSizeInBits(); 10173 unsigned Bits = V.getValueSizeInBits(); 10174 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); 10175} 10176 10177SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 10178 bool addTest = true; 10179 SDValue Cond = Op.getOperand(0); 10180 SDValue Op1 = Op.getOperand(1); 10181 SDValue Op2 = Op.getOperand(2); 10182 SDLoc DL(Op); 10183 EVT VT = Op1.getValueType(); 10184 SDValue CC; 10185 10186 // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops 10187 // are available. Otherwise fp cmovs get lowered into a less efficient branch 10188 // sequence later on. 10189 if (Cond.getOpcode() == ISD::SETCC && 10190 ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || 10191 (Subtarget->hasSSE1() && VT == MVT::f32)) && 10192 VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) { 10193 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); 10194 int SSECC = translateX86FSETCC( 10195 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); 10196 10197 if (SSECC != 8) { 10198 unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd; 10199 SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1, 10200 DAG.getConstant(SSECC, MVT::i8)); 10201 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); 10202 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); 10203 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); 10204 } 10205 } 10206 10207 if (Cond.getOpcode() == ISD::SETCC) { 10208 SDValue NewCond = LowerSETCC(Cond, DAG); 10209 if (NewCond.getNode()) 10210 Cond = NewCond; 10211 } 10212 10213 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 10214 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 10215 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 10216 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 10217 if (Cond.getOpcode() == X86ISD::SETCC && 10218 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 10219 isZero(Cond.getOperand(1).getOperand(1))) { 10220 SDValue Cmp = Cond.getOperand(1); 10221 10222 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 10223 10224 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 10225 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 10226 SDValue Y = isAllOnes(Op2) ? 
Op1 : Op2; 10227 10228 SDValue CmpOp0 = Cmp.getOperand(0); 10229 // Apply further optimizations for special cases 10230 // (select (x != 0), -1, 0) -> neg & sbb 10231 // (select (x == 0), 0, -1) -> neg & sbb 10232 if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) 10233 if (YC->isNullValue() && 10234 (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { 10235 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); 10236 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, 10237 DAG.getConstant(0, CmpOp0.getValueType()), 10238 CmpOp0); 10239 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 10240 DAG.getConstant(X86::COND_B, MVT::i8), 10241 SDValue(Neg.getNode(), 1)); 10242 return Res; 10243 } 10244 10245 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 10246 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 10247 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 10248 10249 SDValue Res = // Res = 0 or -1. 10250 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 10251 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 10252 10253 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 10254 Res = DAG.getNOT(DL, Res, Res.getValueType()); 10255 10256 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 10257 if (N2C == 0 || !N2C->isNullValue()) 10258 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 10259 return Res; 10260 } 10261 } 10262 10263 // Look past (and (setcc_carry (cmp ...)), 1). 10264 if (Cond.getOpcode() == ISD::AND && 10265 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 10266 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 10267 if (C && C->getAPIntValue() == 1) 10268 Cond = Cond.getOperand(0); 10269 } 10270 10271 // If condition flag is set by a X86ISD::CMP, then use it as the condition 10272 // setting operand in place of the X86ISD::SETCC. 10273 unsigned CondOpcode = Cond.getOpcode(); 10274 if (CondOpcode == X86ISD::SETCC || 10275 CondOpcode == X86ISD::SETCC_CARRY) { 10276 CC = Cond.getOperand(0); 10277 10278 SDValue Cmp = Cond.getOperand(1); 10279 unsigned Opc = Cmp.getOpcode(); 10280 MVT VT = Op.getSimpleValueType(); 10281 10282 bool IllegalFPCMov = false; 10283 if (VT.isFloatingPoint() && !VT.isVector() && 10284 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
10285 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 10286 10287 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 10288 Opc == X86ISD::BT) { // FIXME 10289 Cond = Cmp; 10290 addTest = false; 10291 } 10292 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 10293 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 10294 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 10295 Cond.getOperand(0).getValueType() != MVT::i8)) { 10296 SDValue LHS = Cond.getOperand(0); 10297 SDValue RHS = Cond.getOperand(1); 10298 unsigned X86Opcode; 10299 unsigned X86Cond; 10300 SDVTList VTs; 10301 switch (CondOpcode) { 10302 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 10303 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 10304 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 10305 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 10306 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 10307 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 10308 default: llvm_unreachable("unexpected overflowing operator"); 10309 } 10310 if (CondOpcode == ISD::UMULO) 10311 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 10312 MVT::i32); 10313 else 10314 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 10315 10316 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); 10317 10318 if (CondOpcode == ISD::UMULO) 10319 Cond = X86Op.getValue(2); 10320 else 10321 Cond = X86Op.getValue(1); 10322 10323 CC = DAG.getConstant(X86Cond, MVT::i8); 10324 addTest = false; 10325 } 10326 10327 if (addTest) { 10328 // Look pass the truncate if the high bits are known zero. 10329 if (isTruncWithZeroHighBitsInput(Cond, DAG)) 10330 Cond = Cond.getOperand(0); 10331 10332 // We know the result of AND is compared against zero. Try to match 10333 // it to BT. 10334 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 10335 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 10336 if (NewSetCC.getNode()) { 10337 CC = NewSetCC.getOperand(0); 10338 Cond = NewSetCC.getOperand(1); 10339 addTest = false; 10340 } 10341 } 10342 } 10343 10344 if (addTest) { 10345 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 10346 Cond = EmitTest(Cond, X86::COND_NE, DAG); 10347 } 10348 10349 // a < b ? -1 : 0 -> RES = ~setcc_carry 10350 // a < b ? 0 : -1 -> RES = setcc_carry 10351 // a >= b ? -1 : 0 -> RES = setcc_carry 10352 // a >= b ? 0 : -1 -> RES = ~setcc_carry 10353 if (Cond.getOpcode() == X86ISD::SUB) { 10354 Cond = ConvertCmpIfNecessary(Cond, DAG); 10355 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 10356 10357 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 10358 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 10359 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 10360 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 10361 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 10362 return DAG.getNOT(DL, Res, Res.getValueType()); 10363 return Res; 10364 } 10365 } 10366 10367 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate 10368 // widen the cmov and push the truncate through. This avoids introducing a new 10369 // branch during isel and doesn't add any extensions. 
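  // E.g. (illustrative):
  //   (select cc, (trunc i32 %a to i8), (trunc i32 %b to i8))
  // is rewritten as
  //   (trunc (X86ISD::CMOV %b, %a, cc, flags) to i8)
  // so isel sees a legal 32-bit CMOV instead of an i8 select.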
10370 if (Op.getValueType() == MVT::i8 && 10371 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { 10372 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); 10373 if (T1.getValueType() == T2.getValueType() && 10374 // Blacklist CopyFromReg to avoid partial register stalls. 10375 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ 10376 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); 10377 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); 10378 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); 10379 } 10380 } 10381 10382 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 10383 // condition is true. 10384 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 10385 SDValue Ops[] = { Op2, Op1, CC, Cond }; 10386 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 10387} 10388 10389static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { 10390 MVT VT = Op->getSimpleValueType(0); 10391 SDValue In = Op->getOperand(0); 10392 MVT InVT = In.getSimpleValueType(); 10393 SDLoc dl(Op); 10394 10395 unsigned int NumElts = VT.getVectorNumElements(); 10396 if (NumElts != 8 && NumElts != 16) 10397 return SDValue(); 10398 10399 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) 10400 return DAG.getNode(X86ISD::VSEXT, dl, VT, In); 10401 10402 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10403 assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); 10404 10405 MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32; 10406 Constant *C = ConstantInt::get(*DAG.getContext(), 10407 APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits())); 10408 10409 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 10410 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 10411 SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP, 10412 MachinePointerInfo::getConstantPool(), 10413 false, false, false, Alignment); 10414 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld); 10415 if (VT.is512BitVector()) 10416 return Brcst; 10417 return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst); 10418} 10419 10420static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, 10421 SelectionDAG &DAG) { 10422 MVT VT = Op->getSimpleValueType(0); 10423 SDValue In = Op->getOperand(0); 10424 MVT InVT = In.getSimpleValueType(); 10425 SDLoc dl(Op); 10426 10427 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) 10428 return LowerSIGN_EXTEND_AVX512(Op, DAG); 10429 10430 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && 10431 (VT != MVT::v8i32 || InVT != MVT::v8i16) && 10432 (VT != MVT::v16i16 || InVT != MVT::v16i8)) 10433 return SDValue(); 10434 10435 if (Subtarget->hasInt256()) 10436 return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In); 10437 10438 // Optimize vectors in AVX mode 10439 // Sign extend v8i16 to v8i32 and 10440 // v4i32 to v4i64 10441 // 10442 // Divide input vector into two parts 10443 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} 10444 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 10445 // concat the vectors to original VT 10446 10447 unsigned NumElems = InVT.getVectorNumElements(); 10448 SDValue Undef = DAG.getUNDEF(InVT); 10449 10450 SmallVector<int,8> ShufMask1(NumElems, -1); 10451 for (unsigned i = 0; i != NumElems/2; ++i) 10452 ShufMask1[i] = i; 10453 10454 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, 
&ShufMask1[0]); 10455 10456 SmallVector<int,8> ShufMask2(NumElems, -1); 10457 for (unsigned i = 0; i != NumElems/2; ++i) 10458 ShufMask2[i] = i + NumElems/2; 10459 10460 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); 10461 10462 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), 10463 VT.getVectorNumElements()/2); 10464 10465 OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); 10466 OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); 10467 10468 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 10469} 10470 10471// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 10472// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 10473// from the AND / OR. 10474static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 10475 Opc = Op.getOpcode(); 10476 if (Opc != ISD::OR && Opc != ISD::AND) 10477 return false; 10478 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 10479 Op.getOperand(0).hasOneUse() && 10480 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 10481 Op.getOperand(1).hasOneUse()); 10482} 10483 10484// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 10485// 1 and that the SETCC node has a single use. 10486static bool isXor1OfSetCC(SDValue Op) { 10487 if (Op.getOpcode() != ISD::XOR) 10488 return false; 10489 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 10490 if (N1C && N1C->getAPIntValue() == 1) { 10491 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 10492 Op.getOperand(0).hasOneUse(); 10493 } 10494 return false; 10495} 10496 10497SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 10498 bool addTest = true; 10499 SDValue Chain = Op.getOperand(0); 10500 SDValue Cond = Op.getOperand(1); 10501 SDValue Dest = Op.getOperand(2); 10502 SDLoc dl(Op); 10503 SDValue CC; 10504 bool Inverted = false; 10505 10506 if (Cond.getOpcode() == ISD::SETCC) { 10507 // Check for setcc([su]{add,sub,mul}o == 0). 10508 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && 10509 isa<ConstantSDNode>(Cond.getOperand(1)) && 10510 cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && 10511 Cond.getOperand(0).getResNo() == 1 && 10512 (Cond.getOperand(0).getOpcode() == ISD::SADDO || 10513 Cond.getOperand(0).getOpcode() == ISD::UADDO || 10514 Cond.getOperand(0).getOpcode() == ISD::SSUBO || 10515 Cond.getOperand(0).getOpcode() == ISD::USUBO || 10516 Cond.getOperand(0).getOpcode() == ISD::SMULO || 10517 Cond.getOperand(0).getOpcode() == ISD::UMULO)) { 10518 Inverted = true; 10519 Cond = Cond.getOperand(0); 10520 } else { 10521 SDValue NewCond = LowerSETCC(Cond, DAG); 10522 if (NewCond.getNode()) 10523 Cond = NewCond; 10524 } 10525 } 10526#if 0 10527 // FIXME: LowerXALUO doesn't handle these!! 10528 else if (Cond.getOpcode() == X86ISD::ADD || 10529 Cond.getOpcode() == X86ISD::SUB || 10530 Cond.getOpcode() == X86ISD::SMUL || 10531 Cond.getOpcode() == X86ISD::UMUL) 10532 Cond = LowerXALUO(Cond, DAG); 10533#endif 10534 10535 // Look pass (and (setcc_carry (cmp ...)), 1). 10536 if (Cond.getOpcode() == ISD::AND && 10537 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 10538 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 10539 if (C && C->getAPIntValue() == 1) 10540 Cond = Cond.getOperand(0); 10541 } 10542 10543 // If condition flag is set by a X86ISD::CMP, then use it as the condition 10544 // setting operand in place of the X86ISD::SETCC. 
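  // E.g. (illustrative) for "brcond (setcc eq, %x, %y)" the X86ISD::CMP that
  // feeds the SETCC already defines EFLAGS, so the branch can be emitted as
  // a JE on those flags instead of testing the materialized i8 SETCC value.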
10545 unsigned CondOpcode = Cond.getOpcode(); 10546 if (CondOpcode == X86ISD::SETCC || 10547 CondOpcode == X86ISD::SETCC_CARRY) { 10548 CC = Cond.getOperand(0); 10549 10550 SDValue Cmp = Cond.getOperand(1); 10551 unsigned Opc = Cmp.getOpcode(); 10552 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 10553 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 10554 Cond = Cmp; 10555 addTest = false; 10556 } else { 10557 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 10558 default: break; 10559 case X86::COND_O: 10560 case X86::COND_B: 10561 // These can only come from an arithmetic instruction with overflow, 10562 // e.g. SADDO, UADDO. 10563 Cond = Cond.getNode()->getOperand(1); 10564 addTest = false; 10565 break; 10566 } 10567 } 10568 } 10569 CondOpcode = Cond.getOpcode(); 10570 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 10571 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 10572 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 10573 Cond.getOperand(0).getValueType() != MVT::i8)) { 10574 SDValue LHS = Cond.getOperand(0); 10575 SDValue RHS = Cond.getOperand(1); 10576 unsigned X86Opcode; 10577 unsigned X86Cond; 10578 SDVTList VTs; 10579 switch (CondOpcode) { 10580 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 10581 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 10582 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 10583 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 10584 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 10585 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 10586 default: llvm_unreachable("unexpected overflowing operator"); 10587 } 10588 if (Inverted) 10589 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); 10590 if (CondOpcode == ISD::UMULO) 10591 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 10592 MVT::i32); 10593 else 10594 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 10595 10596 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); 10597 10598 if (CondOpcode == ISD::UMULO) 10599 Cond = X86Op.getValue(2); 10600 else 10601 Cond = X86Op.getValue(1); 10602 10603 CC = DAG.getConstant(X86Cond, MVT::i8); 10604 addTest = false; 10605 } else { 10606 unsigned CondOpc; 10607 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 10608 SDValue Cmp = Cond.getOperand(0).getOperand(1); 10609 if (CondOpc == ISD::OR) { 10610 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 10611 // two branches instead of an explicit OR instruction with a 10612 // separate test. 10613 if (Cmp == Cond.getOperand(1).getOperand(1) && 10614 isX86LogicalCmp(Cmp)) { 10615 CC = Cond.getOperand(0).getOperand(0); 10616 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 10617 Chain, Dest, CC, Cmp); 10618 CC = Cond.getOperand(1).getOperand(0); 10619 Cond = Cmp; 10620 addTest = false; 10621 } 10622 } else { // ISD::AND 10623 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 10624 // two branches instead of an explicit AND instruction with a 10625 // separate test. However, we only do this if this block doesn't 10626 // have a fall-through edge, because this requires an explicit 10627 // jmp when the condition is false. 
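// Roughly, assuming both setccs test the same X86cmp (pseudo-asm sketch):
//   brcond (and (setcc cc1), (setcc cc2)), %true ; br %false
//     -->  j!cc1 %false
//          j!cc2 %false
//          jmp   %true
// The unconditional branch is retargeted to %true below, which is why the
// block must not rely on a fall-through edge.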
10628 if (Cmp == Cond.getOperand(1).getOperand(1) && 10629 isX86LogicalCmp(Cmp) && 10630 Op.getNode()->hasOneUse()) { 10631 X86::CondCode CCode = 10632 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 10633 CCode = X86::GetOppositeBranchCondition(CCode); 10634 CC = DAG.getConstant(CCode, MVT::i8); 10635 SDNode *User = *Op.getNode()->use_begin(); 10636 // Look for an unconditional branch following this conditional branch. 10637 // We need this because we need to reverse the successors in order 10638 // to implement FCMP_OEQ. 10639 if (User->getOpcode() == ISD::BR) { 10640 SDValue FalseBB = User->getOperand(1); 10641 SDNode *NewBR = 10642 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 10643 assert(NewBR == User); 10644 (void)NewBR; 10645 Dest = FalseBB; 10646 10647 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 10648 Chain, Dest, CC, Cmp); 10649 X86::CondCode CCode = 10650 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 10651 CCode = X86::GetOppositeBranchCondition(CCode); 10652 CC = DAG.getConstant(CCode, MVT::i8); 10653 Cond = Cmp; 10654 addTest = false; 10655 } 10656 } 10657 } 10658 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 10659 // Recognize the (xor (setcc), 1) pattern; the xor inverts the condition. 10660 // It should be transformed during DAG combine, except when the condition 10661 // is set by an arithmetic-with-overflow node. 10662 X86::CondCode CCode = 10663 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 10664 CCode = X86::GetOppositeBranchCondition(CCode); 10665 CC = DAG.getConstant(CCode, MVT::i8); 10666 Cond = Cond.getOperand(0).getOperand(1); 10667 addTest = false; 10668 } else if (Cond.getOpcode() == ISD::SETCC && 10669 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { 10670 // For FCMP_OEQ, we can emit 10671 // two branches instead of an explicit AND instruction with a 10672 // separate test. However, we only do this if this block doesn't 10673 // have a fall-through edge, because this requires an explicit 10674 // jmp when the condition is false. 10675 if (Op.getNode()->hasOneUse()) { 10676 SDNode *User = *Op.getNode()->use_begin(); 10677 // Look for an unconditional branch following this conditional branch. 10678 // We need this because we need to reverse the successors in order 10679 // to implement FCMP_OEQ. 10680 if (User->getOpcode() == ISD::BR) { 10681 SDValue FalseBB = User->getOperand(1); 10682 SDNode *NewBR = 10683 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 10684 assert(NewBR == User); 10685 (void)NewBR; 10686 Dest = FalseBB; 10687 10688 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 10689 Cond.getOperand(0), Cond.getOperand(1)); 10690 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 10691 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 10692 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 10693 Chain, Dest, CC, Cmp); 10694 CC = DAG.getConstant(X86::COND_P, MVT::i8); 10695 Cond = Cmp; 10696 addTest = false; 10697 } 10698 } 10699 } else if (Cond.getOpcode() == ISD::SETCC && 10700 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { 10701 // For FCMP_UNE, we can emit 10702 // two branches instead of an explicit OR instruction with a 10703 // separate test. However, we only do this if this block doesn't 10704 // have a fall-through edge, because this requires an explicit 10705 // jmp when the condition is false.
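// A pseudo-asm sketch of the FCMP_UNE lowering below (assuming a
// ucomiss-style compare): for "br (une x, y), %true, %false" we emit
//   ucomiss y, x
//   jne %true     ; ZF==0: operands differ
//   jnp %false    ; PF==0: ordered and equal
//   jmp %true     ; PF==1: unordered (NaN), which counts as "not equal"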
10706 if (Op.getNode()->hasOneUse()) { 10707 SDNode *User = *Op.getNode()->use_begin(); 10708 // Look for an unconditional branch following this conditional branch. 10709 // We need this because we need to reverse the successors in order 10710 // to implement FCMP_UNE. 10711 if (User->getOpcode() == ISD::BR) { 10712 SDValue FalseBB = User->getOperand(1); 10713 SDNode *NewBR = 10714 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 10715 assert(NewBR == User); 10716 (void)NewBR; 10717 10718 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 10719 Cond.getOperand(0), Cond.getOperand(1)); 10720 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 10721 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 10722 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 10723 Chain, Dest, CC, Cmp); 10724 CC = DAG.getConstant(X86::COND_NP, MVT::i8); 10725 Cond = Cmp; 10726 addTest = false; 10727 Dest = FalseBB; 10728 } 10729 } 10730 } 10731 } 10732 10733 if (addTest) { 10734 // Look past the truncate if the high bits are known zero. 10735 if (isTruncWithZeroHighBitsInput(Cond, DAG)) 10736 Cond = Cond.getOperand(0); 10737 10738 // We know the result of AND is compared against zero. Try to match 10739 // it to BT. 10740 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 10741 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 10742 if (NewSetCC.getNode()) { 10743 CC = NewSetCC.getOperand(0); 10744 Cond = NewSetCC.getOperand(1); 10745 addTest = false; 10746 } 10747 } 10748 } 10749 10750 if (addTest) { 10751 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 10752 Cond = EmitTest(Cond, X86::COND_NE, DAG); 10753 } 10754 Cond = ConvertCmpIfNecessary(Cond, DAG); 10755 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 10756 Chain, Dest, CC, Cond); 10757} 10758 10759 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 10760 // Calls to _alloca are needed to probe the stack when allocating more than 4k 10761 // bytes in one go. Touching the stack at 4K increments is necessary to ensure 10762 // that the guard pages used by the OS virtual memory manager are allocated in 10763 // correct sequence. 10764 SDValue 10765 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 10766 SelectionDAG &DAG) const { 10767 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || 10768 getTargetMachine().Options.EnableSegmentedStacks) && 10769 "This should be used only on Windows targets or when segmented stacks " 10770 "are being used"); 10771 assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); 10772 SDLoc dl(Op); 10773 10774 // Get the inputs. 10775 SDValue Chain = Op.getOperand(0); 10776 SDValue Size = Op.getOperand(1); 10777 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 10778 EVT VT = Op.getNode()->getValueType(0); 10779 10780 bool Is64Bit = Subtarget->is64Bit(); 10781 EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; 10782 10783 if (getTargetMachine().Options.EnableSegmentedStacks) { 10784 MachineFunction &MF = DAG.getMachineFunction(); 10785 MachineRegisterInfo &MRI = MF.getRegInfo(); 10786 10787 if (Is64Bit) { 10788 // The 64-bit implementation of segmented stacks needs to clobber both r10 and 10789 // r11. This makes it impossible to use it along with nested parameters.
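// (On x86-64 the 'nest' parameter is passed in R10, as specified in
// X86CallingConv.td and used by LowerINIT_TRAMPOLINE further down, which
// is why functions with 'nest' arguments are rejected here.)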
10790 const Function *F = MF.getFunction(); 10791 10792 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); 10793 I != E; ++I) 10794 if (I->hasNestAttr()) 10795 report_fatal_error("Cannot use segmented stacks with functions that " 10796 "have nested arguments."); 10797 } 10798 10799 const TargetRegisterClass *AddrRegClass = 10800 getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32); 10801 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); 10802 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); 10803 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, 10804 DAG.getRegister(Vreg, SPTy)); 10805 SDValue Ops1[2] = { Value, Chain }; 10806 return DAG.getMergeValues(Ops1, 2, dl); 10807 } else { 10808 SDValue Flag; 10809 unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); 10810 10811 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 10812 Flag = Chain.getValue(1); 10813 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 10814 10815 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 10816 10817 const X86RegisterInfo *RegInfo = 10818 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 10819 unsigned SPReg = RegInfo->getStackRegister(); 10820 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); 10821 Chain = SP.getValue(1); 10822 10823 if (Align) { 10824 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 10825 DAG.getConstant(-(uint64_t)Align, VT)); 10826 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); 10827 } 10828 10829 SDValue Ops1[2] = { SP, Chain }; 10830 return DAG.getMergeValues(Ops1, 2, dl); 10831 } 10832} 10833 10834SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 10835 MachineFunction &MF = DAG.getMachineFunction(); 10836 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 10837 10838 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 10839 SDLoc DL(Op); 10840 10841 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 10842 // vastart just stores the address of the VarArgsFrameIndex slot into the 10843 // memory location argument. 10844 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 10845 getPointerTy()); 10846 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 10847 MachinePointerInfo(SV), false, false, 0); 10848 } 10849 10850 // __va_list_tag: 10851 // gp_offset (0 - 6 * 8) 10852 // fp_offset (48 - 48 + 8 * 16) 10853 // overflow_arg_area (point to parameters coming in memory). 
10854 // reg_save_area 10855 SmallVector<SDValue, 8> MemOps; 10856 SDValue FIN = Op.getOperand(1); 10857 // Store gp_offset 10858 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 10859 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 10860 MVT::i32), 10861 FIN, MachinePointerInfo(SV), false, false, 0); 10862 MemOps.push_back(Store); 10863 10864 // Store fp_offset 10865 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 10866 FIN, DAG.getIntPtrConstant(4)); 10867 Store = DAG.getStore(Op.getOperand(0), DL, 10868 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 10869 MVT::i32), 10870 FIN, MachinePointerInfo(SV, 4), false, false, 0); 10871 MemOps.push_back(Store); 10872 10873 // Store ptr to overflow_arg_area 10874 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 10875 FIN, DAG.getIntPtrConstant(4)); 10876 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 10877 getPointerTy()); 10878 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 10879 MachinePointerInfo(SV, 8), 10880 false, false, 0); 10881 MemOps.push_back(Store); 10882 10883 // Store ptr to reg_save_area. 10884 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 10885 FIN, DAG.getIntPtrConstant(8)); 10886 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 10887 getPointerTy()); 10888 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 10889 MachinePointerInfo(SV, 16), false, false, 0); 10890 MemOps.push_back(Store); 10891 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 10892 &MemOps[0], MemOps.size()); 10893} 10894 10895SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 10896 assert(Subtarget->is64Bit() && 10897 "LowerVAARG only handles 64-bit va_arg!"); 10898 assert((Subtarget->isTargetLinux() || 10899 Subtarget->isTargetDarwin()) && 10900 "Unhandled target in LowerVAARG"); 10901 assert(Op.getNode()->getNumOperands() == 4); 10902 SDValue Chain = Op.getOperand(0); 10903 SDValue SrcPtr = Op.getOperand(1); 10904 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 10905 unsigned Align = Op.getConstantOperandVal(3); 10906 SDLoc dl(Op); 10907 10908 EVT ArgVT = Op.getNode()->getValueType(0); 10909 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 10910 uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); 10911 uint8_t ArgMode; 10912 10913 // Decide which area this value should be read from. 10914 // TODO: Implement the AMD64 ABI in its entirety. This simple 10915 // selection mechanism works only for the basic types. 10916 if (ArgVT == MVT::f80) { 10917 llvm_unreachable("va_arg for f80 not yet implemented"); 10918 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 10919 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 10920 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 10921 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 10922 } else { 10923 llvm_unreachable("Unhandled argument type in LowerVAARG"); 10924 } 10925 10926 if (ArgMode == 2) { 10927 // Sanity Check: Make sure using fp_offset makes sense. 
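// fp_offset indexes into the XMM register save area, so it is only
// meaningful when the function may actually use SSE registers: SSE1 must
// be available and neither soft-float nor noimplicitfloat may be in
// effect. The assert below checks exactly that.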
10928 assert(!getTargetMachine().Options.UseSoftFloat && 10929 !(DAG.getMachineFunction() 10930 .getFunction()->getAttributes() 10931 .hasAttribute(AttributeSet::FunctionIndex, 10932 Attribute::NoImplicitFloat)) && 10933 Subtarget->hasSSE1()); 10934 } 10935 10936 // Insert VAARG_64 node into the DAG 10937 // VAARG_64 returns two values: Variable Argument Address, Chain 10938 SmallVector<SDValue, 11> InstOps; 10939 InstOps.push_back(Chain); 10940 InstOps.push_back(SrcPtr); 10941 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 10942 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 10943 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 10944 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 10945 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 10946 VTs, &InstOps[0], InstOps.size(), 10947 MVT::i64, 10948 MachinePointerInfo(SV), 10949 /*Align=*/0, 10950 /*Volatile=*/false, 10951 /*ReadMem=*/true, 10952 /*WriteMem=*/true); 10953 Chain = VAARG.getValue(1); 10954 10955 // Load the next argument and return it 10956 return DAG.getLoad(ArgVT, dl, 10957 Chain, 10958 VAARG, 10959 MachinePointerInfo(), 10960 false, false, false, 0); 10961} 10962 10963static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, 10964 SelectionDAG &DAG) { 10965 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 10966 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 10967 SDValue Chain = Op.getOperand(0); 10968 SDValue DstPtr = Op.getOperand(1); 10969 SDValue SrcPtr = Op.getOperand(2); 10970 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 10971 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 10972 SDLoc DL(Op); 10973 10974 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 10975 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 10976 false, 10977 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 10978} 10979 10980// getTargetVShiftByConstNode - Handle vector element shifts where the shift 10981// amount is a constant. Takes immediate version of shift as input. 10982static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT, 10983 SDValue SrcOp, uint64_t ShiftAmt, 10984 SelectionDAG &DAG) { 10985 10986 // Check for ShiftAmt >= element width 10987 if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) { 10988 if (Opc == X86ISD::VSRAI) 10989 ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1; 10990 else 10991 return DAG.getConstant(0, VT); 10992 } 10993 10994 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) 10995 && "Unknown target vector shift-by-constant node"); 10996 10997 return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); 10998} 10999 11000// getTargetVShiftNode - Handle vector element shifts where the shift amount 11001// may or may not be a constant. Takes immediate version of shift as input. 11002static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT, 11003 SDValue SrcOp, SDValue ShAmt, 11004 SelectionDAG &DAG) { 11005 assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); 11006 11007 // Catch shift-by-constant. 
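// E.g. a constant shift amount can be encoded directly in the immediate
// form of the instruction (psllw $3, %xmm0), avoiding the need to build a
// shift-amount vector first.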
11008 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) 11009 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, 11010 CShAmt->getZExtValue(), DAG); 11011 11012 // Change opcode to non-immediate version 11013 switch (Opc) { 11014 default: llvm_unreachable("Unknown target vector shift node"); 11015 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; 11016 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; 11017 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; 11018 } 11019 11020 // Need to build a vector containing shift amount 11021 // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 11022 SDValue ShOps[4]; 11023 ShOps[0] = ShAmt; 11024 ShOps[1] = DAG.getConstant(0, MVT::i32); 11025 ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); 11026 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); 11027 11028 // The return type has to be a 128-bit type with the same element 11029 // type as the input type. 11030 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 11031 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); 11032 11033 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); 11034 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); 11035} 11036 11037static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 11038 SDLoc dl(Op); 11039 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 11040 switch (IntNo) { 11041 default: return SDValue(); // Don't custom lower most intrinsics. 11042 // Comparison intrinsics. 11043 case Intrinsic::x86_sse_comieq_ss: 11044 case Intrinsic::x86_sse_comilt_ss: 11045 case Intrinsic::x86_sse_comile_ss: 11046 case Intrinsic::x86_sse_comigt_ss: 11047 case Intrinsic::x86_sse_comige_ss: 11048 case Intrinsic::x86_sse_comineq_ss: 11049 case Intrinsic::x86_sse_ucomieq_ss: 11050 case Intrinsic::x86_sse_ucomilt_ss: 11051 case Intrinsic::x86_sse_ucomile_ss: 11052 case Intrinsic::x86_sse_ucomigt_ss: 11053 case Intrinsic::x86_sse_ucomige_ss: 11054 case Intrinsic::x86_sse_ucomineq_ss: 11055 case Intrinsic::x86_sse2_comieq_sd: 11056 case Intrinsic::x86_sse2_comilt_sd: 11057 case Intrinsic::x86_sse2_comile_sd: 11058 case Intrinsic::x86_sse2_comigt_sd: 11059 case Intrinsic::x86_sse2_comige_sd: 11060 case Intrinsic::x86_sse2_comineq_sd: 11061 case Intrinsic::x86_sse2_ucomieq_sd: 11062 case Intrinsic::x86_sse2_ucomilt_sd: 11063 case Intrinsic::x86_sse2_ucomile_sd: 11064 case Intrinsic::x86_sse2_ucomigt_sd: 11065 case Intrinsic::x86_sse2_ucomige_sd: 11066 case Intrinsic::x86_sse2_ucomineq_sd: { 11067 unsigned Opc; 11068 ISD::CondCode CC; 11069 switch (IntNo) { 11070 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
11071 case Intrinsic::x86_sse_comieq_ss: 11072 case Intrinsic::x86_sse2_comieq_sd: 11073 Opc = X86ISD::COMI; 11074 CC = ISD::SETEQ; 11075 break; 11076 case Intrinsic::x86_sse_comilt_ss: 11077 case Intrinsic::x86_sse2_comilt_sd: 11078 Opc = X86ISD::COMI; 11079 CC = ISD::SETLT; 11080 break; 11081 case Intrinsic::x86_sse_comile_ss: 11082 case Intrinsic::x86_sse2_comile_sd: 11083 Opc = X86ISD::COMI; 11084 CC = ISD::SETLE; 11085 break; 11086 case Intrinsic::x86_sse_comigt_ss: 11087 case Intrinsic::x86_sse2_comigt_sd: 11088 Opc = X86ISD::COMI; 11089 CC = ISD::SETGT; 11090 break; 11091 case Intrinsic::x86_sse_comige_ss: 11092 case Intrinsic::x86_sse2_comige_sd: 11093 Opc = X86ISD::COMI; 11094 CC = ISD::SETGE; 11095 break; 11096 case Intrinsic::x86_sse_comineq_ss: 11097 case Intrinsic::x86_sse2_comineq_sd: 11098 Opc = X86ISD::COMI; 11099 CC = ISD::SETNE; 11100 break; 11101 case Intrinsic::x86_sse_ucomieq_ss: 11102 case Intrinsic::x86_sse2_ucomieq_sd: 11103 Opc = X86ISD::UCOMI; 11104 CC = ISD::SETEQ; 11105 break; 11106 case Intrinsic::x86_sse_ucomilt_ss: 11107 case Intrinsic::x86_sse2_ucomilt_sd: 11108 Opc = X86ISD::UCOMI; 11109 CC = ISD::SETLT; 11110 break; 11111 case Intrinsic::x86_sse_ucomile_ss: 11112 case Intrinsic::x86_sse2_ucomile_sd: 11113 Opc = X86ISD::UCOMI; 11114 CC = ISD::SETLE; 11115 break; 11116 case Intrinsic::x86_sse_ucomigt_ss: 11117 case Intrinsic::x86_sse2_ucomigt_sd: 11118 Opc = X86ISD::UCOMI; 11119 CC = ISD::SETGT; 11120 break; 11121 case Intrinsic::x86_sse_ucomige_ss: 11122 case Intrinsic::x86_sse2_ucomige_sd: 11123 Opc = X86ISD::UCOMI; 11124 CC = ISD::SETGE; 11125 break; 11126 case Intrinsic::x86_sse_ucomineq_ss: 11127 case Intrinsic::x86_sse2_ucomineq_sd: 11128 Opc = X86ISD::UCOMI; 11129 CC = ISD::SETNE; 11130 break; 11131 } 11132 11133 SDValue LHS = Op.getOperand(1); 11134 SDValue RHS = Op.getOperand(2); 11135 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 11136 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 11137 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 11138 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 11139 DAG.getConstant(X86CC, MVT::i8), Cond); 11140 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 11141 } 11142 11143 // Arithmetic intrinsics. 
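// Note that PMULUDQ multiplies the even-numbered 32-bit lanes into full
// 64-bit products, which is why it is modeled as its own node rather than
// a plain ISD::MUL.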
11144 case Intrinsic::x86_sse2_pmulu_dq: 11145 case Intrinsic::x86_avx2_pmulu_dq: 11146 return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), 11147 Op.getOperand(1), Op.getOperand(2)); 11148 11149 // SSE2/AVX2 sub with unsigned saturation intrinsics 11150 case Intrinsic::x86_sse2_psubus_b: 11151 case Intrinsic::x86_sse2_psubus_w: 11152 case Intrinsic::x86_avx2_psubus_b: 11153 case Intrinsic::x86_avx2_psubus_w: 11154 return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(), 11155 Op.getOperand(1), Op.getOperand(2)); 11156 11157 // SSE3/AVX horizontal add/sub intrinsics 11158 case Intrinsic::x86_sse3_hadd_ps: 11159 case Intrinsic::x86_sse3_hadd_pd: 11160 case Intrinsic::x86_avx_hadd_ps_256: 11161 case Intrinsic::x86_avx_hadd_pd_256: 11162 case Intrinsic::x86_sse3_hsub_ps: 11163 case Intrinsic::x86_sse3_hsub_pd: 11164 case Intrinsic::x86_avx_hsub_ps_256: 11165 case Intrinsic::x86_avx_hsub_pd_256: 11166 case Intrinsic::x86_ssse3_phadd_w_128: 11167 case Intrinsic::x86_ssse3_phadd_d_128: 11168 case Intrinsic::x86_avx2_phadd_w: 11169 case Intrinsic::x86_avx2_phadd_d: 11170 case Intrinsic::x86_ssse3_phsub_w_128: 11171 case Intrinsic::x86_ssse3_phsub_d_128: 11172 case Intrinsic::x86_avx2_phsub_w: 11173 case Intrinsic::x86_avx2_phsub_d: { 11174 unsigned Opcode; 11175 switch (IntNo) { 11176 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 11177 case Intrinsic::x86_sse3_hadd_ps: 11178 case Intrinsic::x86_sse3_hadd_pd: 11179 case Intrinsic::x86_avx_hadd_ps_256: 11180 case Intrinsic::x86_avx_hadd_pd_256: 11181 Opcode = X86ISD::FHADD; 11182 break; 11183 case Intrinsic::x86_sse3_hsub_ps: 11184 case Intrinsic::x86_sse3_hsub_pd: 11185 case Intrinsic::x86_avx_hsub_ps_256: 11186 case Intrinsic::x86_avx_hsub_pd_256: 11187 Opcode = X86ISD::FHSUB; 11188 break; 11189 case Intrinsic::x86_ssse3_phadd_w_128: 11190 case Intrinsic::x86_ssse3_phadd_d_128: 11191 case Intrinsic::x86_avx2_phadd_w: 11192 case Intrinsic::x86_avx2_phadd_d: 11193 Opcode = X86ISD::HADD; 11194 break; 11195 case Intrinsic::x86_ssse3_phsub_w_128: 11196 case Intrinsic::x86_ssse3_phsub_d_128: 11197 case Intrinsic::x86_avx2_phsub_w: 11198 case Intrinsic::x86_avx2_phsub_d: 11199 Opcode = X86ISD::HSUB; 11200 break; 11201 } 11202 return DAG.getNode(Opcode, dl, Op.getValueType(), 11203 Op.getOperand(1), Op.getOperand(2)); 11204 } 11205 11206 // SSE2/SSE41/AVX2 integer max/min intrinsics. 
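// All of the following collapse onto the generic X86ISD::{U,S}{MAX,MIN}
// nodes; the switch below only selects the signedness and direction.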
11207 case Intrinsic::x86_sse2_pmaxu_b: 11208 case Intrinsic::x86_sse41_pmaxuw: 11209 case Intrinsic::x86_sse41_pmaxud: 11210 case Intrinsic::x86_avx2_pmaxu_b: 11211 case Intrinsic::x86_avx2_pmaxu_w: 11212 case Intrinsic::x86_avx2_pmaxu_d: 11213 case Intrinsic::x86_avx512_pmaxu_d: 11214 case Intrinsic::x86_avx512_pmaxu_q: 11215 case Intrinsic::x86_sse2_pminu_b: 11216 case Intrinsic::x86_sse41_pminuw: 11217 case Intrinsic::x86_sse41_pminud: 11218 case Intrinsic::x86_avx2_pminu_b: 11219 case Intrinsic::x86_avx2_pminu_w: 11220 case Intrinsic::x86_avx2_pminu_d: 11221 case Intrinsic::x86_avx512_pminu_d: 11222 case Intrinsic::x86_avx512_pminu_q: 11223 case Intrinsic::x86_sse41_pmaxsb: 11224 case Intrinsic::x86_sse2_pmaxs_w: 11225 case Intrinsic::x86_sse41_pmaxsd: 11226 case Intrinsic::x86_avx2_pmaxs_b: 11227 case Intrinsic::x86_avx2_pmaxs_w: 11228 case Intrinsic::x86_avx2_pmaxs_d: 11229 case Intrinsic::x86_avx512_pmaxs_d: 11230 case Intrinsic::x86_avx512_pmaxs_q: 11231 case Intrinsic::x86_sse41_pminsb: 11232 case Intrinsic::x86_sse2_pmins_w: 11233 case Intrinsic::x86_sse41_pminsd: 11234 case Intrinsic::x86_avx2_pmins_b: 11235 case Intrinsic::x86_avx2_pmins_w: 11236 case Intrinsic::x86_avx2_pmins_d: 11237 case Intrinsic::x86_avx512_pmins_d: 11238 case Intrinsic::x86_avx512_pmins_q: { 11239 unsigned Opcode; 11240 switch (IntNo) { 11241 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 11242 case Intrinsic::x86_sse2_pmaxu_b: 11243 case Intrinsic::x86_sse41_pmaxuw: 11244 case Intrinsic::x86_sse41_pmaxud: 11245 case Intrinsic::x86_avx2_pmaxu_b: 11246 case Intrinsic::x86_avx2_pmaxu_w: 11247 case Intrinsic::x86_avx2_pmaxu_d: 11248 case Intrinsic::x86_avx512_pmaxu_d: 11249 case Intrinsic::x86_avx512_pmaxu_q: 11250 Opcode = X86ISD::UMAX; 11251 break; 11252 case Intrinsic::x86_sse2_pminu_b: 11253 case Intrinsic::x86_sse41_pminuw: 11254 case Intrinsic::x86_sse41_pminud: 11255 case Intrinsic::x86_avx2_pminu_b: 11256 case Intrinsic::x86_avx2_pminu_w: 11257 case Intrinsic::x86_avx2_pminu_d: 11258 case Intrinsic::x86_avx512_pminu_d: 11259 case Intrinsic::x86_avx512_pminu_q: 11260 Opcode = X86ISD::UMIN; 11261 break; 11262 case Intrinsic::x86_sse41_pmaxsb: 11263 case Intrinsic::x86_sse2_pmaxs_w: 11264 case Intrinsic::x86_sse41_pmaxsd: 11265 case Intrinsic::x86_avx2_pmaxs_b: 11266 case Intrinsic::x86_avx2_pmaxs_w: 11267 case Intrinsic::x86_avx2_pmaxs_d: 11268 case Intrinsic::x86_avx512_pmaxs_d: 11269 case Intrinsic::x86_avx512_pmaxs_q: 11270 Opcode = X86ISD::SMAX; 11271 break; 11272 case Intrinsic::x86_sse41_pminsb: 11273 case Intrinsic::x86_sse2_pmins_w: 11274 case Intrinsic::x86_sse41_pminsd: 11275 case Intrinsic::x86_avx2_pmins_b: 11276 case Intrinsic::x86_avx2_pmins_w: 11277 case Intrinsic::x86_avx2_pmins_d: 11278 case Intrinsic::x86_avx512_pmins_d: 11279 case Intrinsic::x86_avx512_pmins_q: 11280 Opcode = X86ISD::SMIN; 11281 break; 11282 } 11283 return DAG.getNode(Opcode, dl, Op.getValueType(), 11284 Op.getOperand(1), Op.getOperand(2)); 11285 } 11286 11287 // SSE/SSE2/AVX floating point max/min intrinsics. 
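// Note: X86ISD::FMAX/FMIN model the non-commutative maxps/minps semantics
// (the second operand is returned when an input is NaN), rather than IEEE
// maxnum/minnum.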
11288 case Intrinsic::x86_sse_max_ps: 11289 case Intrinsic::x86_sse2_max_pd: 11290 case Intrinsic::x86_avx_max_ps_256: 11291 case Intrinsic::x86_avx_max_pd_256: 11292 case Intrinsic::x86_avx512_max_ps_512: 11293 case Intrinsic::x86_avx512_max_pd_512: 11294 case Intrinsic::x86_sse_min_ps: 11295 case Intrinsic::x86_sse2_min_pd: 11296 case Intrinsic::x86_avx_min_ps_256: 11297 case Intrinsic::x86_avx_min_pd_256: 11298 case Intrinsic::x86_avx512_min_ps_512: 11299 case Intrinsic::x86_avx512_min_pd_512: { 11300 unsigned Opcode; 11301 switch (IntNo) { 11302 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 11303 case Intrinsic::x86_sse_max_ps: 11304 case Intrinsic::x86_sse2_max_pd: 11305 case Intrinsic::x86_avx_max_ps_256: 11306 case Intrinsic::x86_avx_max_pd_256: 11307 case Intrinsic::x86_avx512_max_ps_512: 11308 case Intrinsic::x86_avx512_max_pd_512: 11309 Opcode = X86ISD::FMAX; 11310 break; 11311 case Intrinsic::x86_sse_min_ps: 11312 case Intrinsic::x86_sse2_min_pd: 11313 case Intrinsic::x86_avx_min_ps_256: 11314 case Intrinsic::x86_avx_min_pd_256: 11315 case Intrinsic::x86_avx512_min_ps_512: 11316 case Intrinsic::x86_avx512_min_pd_512: 11317 Opcode = X86ISD::FMIN; 11318 break; 11319 } 11320 return DAG.getNode(Opcode, dl, Op.getValueType(), 11321 Op.getOperand(1), Op.getOperand(2)); 11322 } 11323 11324 // AVX2 variable shift intrinsics 11325 case Intrinsic::x86_avx2_psllv_d: 11326 case Intrinsic::x86_avx2_psllv_q: 11327 case Intrinsic::x86_avx2_psllv_d_256: 11328 case Intrinsic::x86_avx2_psllv_q_256: 11329 case Intrinsic::x86_avx2_psrlv_d: 11330 case Intrinsic::x86_avx2_psrlv_q: 11331 case Intrinsic::x86_avx2_psrlv_d_256: 11332 case Intrinsic::x86_avx2_psrlv_q_256: 11333 case Intrinsic::x86_avx2_psrav_d: 11334 case Intrinsic::x86_avx2_psrav_d_256: { 11335 unsigned Opcode; 11336 switch (IntNo) { 11337 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
11338 case Intrinsic::x86_avx2_psllv_d: 11339 case Intrinsic::x86_avx2_psllv_q: 11340 case Intrinsic::x86_avx2_psllv_d_256: 11341 case Intrinsic::x86_avx2_psllv_q_256: 11342 Opcode = ISD::SHL; 11343 break; 11344 case Intrinsic::x86_avx2_psrlv_d: 11345 case Intrinsic::x86_avx2_psrlv_q: 11346 case Intrinsic::x86_avx2_psrlv_d_256: 11347 case Intrinsic::x86_avx2_psrlv_q_256: 11348 Opcode = ISD::SRL; 11349 break; 11350 case Intrinsic::x86_avx2_psrav_d: 11351 case Intrinsic::x86_avx2_psrav_d_256: 11352 Opcode = ISD::SRA; 11353 break; 11354 } 11355 return DAG.getNode(Opcode, dl, Op.getValueType(), 11356 Op.getOperand(1), Op.getOperand(2)); 11357 } 11358 11359 case Intrinsic::x86_ssse3_pshuf_b_128: 11360 case Intrinsic::x86_avx2_pshuf_b: 11361 return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), 11362 Op.getOperand(1), Op.getOperand(2)); 11363 11364 case Intrinsic::x86_ssse3_psign_b_128: 11365 case Intrinsic::x86_ssse3_psign_w_128: 11366 case Intrinsic::x86_ssse3_psign_d_128: 11367 case Intrinsic::x86_avx2_psign_b: 11368 case Intrinsic::x86_avx2_psign_w: 11369 case Intrinsic::x86_avx2_psign_d: 11370 return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), 11371 Op.getOperand(1), Op.getOperand(2)); 11372 11373 case Intrinsic::x86_sse41_insertps: 11374 return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), 11375 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 11376 11377 case Intrinsic::x86_avx_vperm2f128_ps_256: 11378 case Intrinsic::x86_avx_vperm2f128_pd_256: 11379 case Intrinsic::x86_avx_vperm2f128_si_256: 11380 case Intrinsic::x86_avx2_vperm2i128: 11381 return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), 11382 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 11383 11384 case Intrinsic::x86_avx2_permd: 11385 case Intrinsic::x86_avx2_permps: 11386 // Operands intentionally swapped. Mask is last operand to intrinsic, 11387 // but second operand for node/instruction. 11388 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), 11389 Op.getOperand(2), Op.getOperand(1)); 11390 11391 case Intrinsic::x86_sse_sqrt_ps: 11392 case Intrinsic::x86_sse2_sqrt_pd: 11393 case Intrinsic::x86_avx_sqrt_ps_256: 11394 case Intrinsic::x86_avx_sqrt_pd_256: 11395 return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1)); 11396 11397 // ptest and testp intrinsics. The intrinsics these come from are designed to 11398 // return an integer value, not just an instruction, so lower them to the ptest 11399 // or testp pattern and a setcc for the result.
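// For reference: PTEST sets ZF if (src1 & src2) == 0 and CF if
// (~src1 & src2) == 0, so each variant below simply materializes the
// corresponding flag with an X86ISD::SETCC.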
11400 case Intrinsic::x86_sse41_ptestz: 11401 case Intrinsic::x86_sse41_ptestc: 11402 case Intrinsic::x86_sse41_ptestnzc: 11403 case Intrinsic::x86_avx_ptestz_256: 11404 case Intrinsic::x86_avx_ptestc_256: 11405 case Intrinsic::x86_avx_ptestnzc_256: 11406 case Intrinsic::x86_avx_vtestz_ps: 11407 case Intrinsic::x86_avx_vtestc_ps: 11408 case Intrinsic::x86_avx_vtestnzc_ps: 11409 case Intrinsic::x86_avx_vtestz_pd: 11410 case Intrinsic::x86_avx_vtestc_pd: 11411 case Intrinsic::x86_avx_vtestnzc_pd: 11412 case Intrinsic::x86_avx_vtestz_ps_256: 11413 case Intrinsic::x86_avx_vtestc_ps_256: 11414 case Intrinsic::x86_avx_vtestnzc_ps_256: 11415 case Intrinsic::x86_avx_vtestz_pd_256: 11416 case Intrinsic::x86_avx_vtestc_pd_256: 11417 case Intrinsic::x86_avx_vtestnzc_pd_256: { 11418 bool IsTestPacked = false; 11419 unsigned X86CC; 11420 switch (IntNo) { 11421 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 11422 case Intrinsic::x86_avx_vtestz_ps: 11423 case Intrinsic::x86_avx_vtestz_pd: 11424 case Intrinsic::x86_avx_vtestz_ps_256: 11425 case Intrinsic::x86_avx_vtestz_pd_256: 11426 IsTestPacked = true; // Fallthrough 11427 case Intrinsic::x86_sse41_ptestz: 11428 case Intrinsic::x86_avx_ptestz_256: 11429 // ZF = 1 11430 X86CC = X86::COND_E; 11431 break; 11432 case Intrinsic::x86_avx_vtestc_ps: 11433 case Intrinsic::x86_avx_vtestc_pd: 11434 case Intrinsic::x86_avx_vtestc_ps_256: 11435 case Intrinsic::x86_avx_vtestc_pd_256: 11436 IsTestPacked = true; // Fallthrough 11437 case Intrinsic::x86_sse41_ptestc: 11438 case Intrinsic::x86_avx_ptestc_256: 11439 // CF = 1 11440 X86CC = X86::COND_B; 11441 break; 11442 case Intrinsic::x86_avx_vtestnzc_ps: 11443 case Intrinsic::x86_avx_vtestnzc_pd: 11444 case Intrinsic::x86_avx_vtestnzc_ps_256: 11445 case Intrinsic::x86_avx_vtestnzc_pd_256: 11446 IsTestPacked = true; // Fallthrough 11447 case Intrinsic::x86_sse41_ptestnzc: 11448 case Intrinsic::x86_avx_ptestnzc_256: 11449 // ZF and CF = 0 11450 X86CC = X86::COND_A; 11451 break; 11452 } 11453 11454 SDValue LHS = Op.getOperand(1); 11455 SDValue RHS = Op.getOperand(2); 11456 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 11457 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 11458 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 11459 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 11460 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 11461 } 11462 case Intrinsic::x86_avx512_kortestz: 11463 case Intrinsic::x86_avx512_kortestc: { 11464 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? 
X86::COND_E: X86::COND_B; 11465 SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1)); 11466 SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2)); 11467 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 11468 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); 11469 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 11470 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 11471 } 11472 11473 // SSE/AVX shift intrinsics 11474 case Intrinsic::x86_sse2_psll_w: 11475 case Intrinsic::x86_sse2_psll_d: 11476 case Intrinsic::x86_sse2_psll_q: 11477 case Intrinsic::x86_avx2_psll_w: 11478 case Intrinsic::x86_avx2_psll_d: 11479 case Intrinsic::x86_avx2_psll_q: 11480 case Intrinsic::x86_sse2_psrl_w: 11481 case Intrinsic::x86_sse2_psrl_d: 11482 case Intrinsic::x86_sse2_psrl_q: 11483 case Intrinsic::x86_avx2_psrl_w: 11484 case Intrinsic::x86_avx2_psrl_d: 11485 case Intrinsic::x86_avx2_psrl_q: 11486 case Intrinsic::x86_sse2_psra_w: 11487 case Intrinsic::x86_sse2_psra_d: 11488 case Intrinsic::x86_avx2_psra_w: 11489 case Intrinsic::x86_avx2_psra_d: { 11490 unsigned Opcode; 11491 switch (IntNo) { 11492 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 11493 case Intrinsic::x86_sse2_psll_w: 11494 case Intrinsic::x86_sse2_psll_d: 11495 case Intrinsic::x86_sse2_psll_q: 11496 case Intrinsic::x86_avx2_psll_w: 11497 case Intrinsic::x86_avx2_psll_d: 11498 case Intrinsic::x86_avx2_psll_q: 11499 Opcode = X86ISD::VSHL; 11500 break; 11501 case Intrinsic::x86_sse2_psrl_w: 11502 case Intrinsic::x86_sse2_psrl_d: 11503 case Intrinsic::x86_sse2_psrl_q: 11504 case Intrinsic::x86_avx2_psrl_w: 11505 case Intrinsic::x86_avx2_psrl_d: 11506 case Intrinsic::x86_avx2_psrl_q: 11507 Opcode = X86ISD::VSRL; 11508 break; 11509 case Intrinsic::x86_sse2_psra_w: 11510 case Intrinsic::x86_sse2_psra_d: 11511 case Intrinsic::x86_avx2_psra_w: 11512 case Intrinsic::x86_avx2_psra_d: 11513 Opcode = X86ISD::VSRA; 11514 break; 11515 } 11516 return DAG.getNode(Opcode, dl, Op.getValueType(), 11517 Op.getOperand(1), Op.getOperand(2)); 11518 } 11519 11520 // SSE/AVX immediate shift intrinsics 11521 case Intrinsic::x86_sse2_pslli_w: 11522 case Intrinsic::x86_sse2_pslli_d: 11523 case Intrinsic::x86_sse2_pslli_q: 11524 case Intrinsic::x86_avx2_pslli_w: 11525 case Intrinsic::x86_avx2_pslli_d: 11526 case Intrinsic::x86_avx2_pslli_q: 11527 case Intrinsic::x86_sse2_psrli_w: 11528 case Intrinsic::x86_sse2_psrli_d: 11529 case Intrinsic::x86_sse2_psrli_q: 11530 case Intrinsic::x86_avx2_psrli_w: 11531 case Intrinsic::x86_avx2_psrli_d: 11532 case Intrinsic::x86_avx2_psrli_q: 11533 case Intrinsic::x86_sse2_psrai_w: 11534 case Intrinsic::x86_sse2_psrai_d: 11535 case Intrinsic::x86_avx2_psrai_w: 11536 case Intrinsic::x86_avx2_psrai_d: { 11537 unsigned Opcode; 11538 switch (IntNo) { 11539 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
11540 case Intrinsic::x86_sse2_pslli_w: 11541 case Intrinsic::x86_sse2_pslli_d: 11542 case Intrinsic::x86_sse2_pslli_q: 11543 case Intrinsic::x86_avx2_pslli_w: 11544 case Intrinsic::x86_avx2_pslli_d: 11545 case Intrinsic::x86_avx2_pslli_q: 11546 Opcode = X86ISD::VSHLI; 11547 break; 11548 case Intrinsic::x86_sse2_psrli_w: 11549 case Intrinsic::x86_sse2_psrli_d: 11550 case Intrinsic::x86_sse2_psrli_q: 11551 case Intrinsic::x86_avx2_psrli_w: 11552 case Intrinsic::x86_avx2_psrli_d: 11553 case Intrinsic::x86_avx2_psrli_q: 11554 Opcode = X86ISD::VSRLI; 11555 break; 11556 case Intrinsic::x86_sse2_psrai_w: 11557 case Intrinsic::x86_sse2_psrai_d: 11558 case Intrinsic::x86_avx2_psrai_w: 11559 case Intrinsic::x86_avx2_psrai_d: 11560 Opcode = X86ISD::VSRAI; 11561 break; 11562 } 11563 return getTargetVShiftNode(Opcode, dl, Op.getValueType(), 11564 Op.getOperand(1), Op.getOperand(2), DAG); 11565 } 11566 11567 case Intrinsic::x86_sse42_pcmpistria128: 11568 case Intrinsic::x86_sse42_pcmpestria128: 11569 case Intrinsic::x86_sse42_pcmpistric128: 11570 case Intrinsic::x86_sse42_pcmpestric128: 11571 case Intrinsic::x86_sse42_pcmpistrio128: 11572 case Intrinsic::x86_sse42_pcmpestrio128: 11573 case Intrinsic::x86_sse42_pcmpistris128: 11574 case Intrinsic::x86_sse42_pcmpestris128: 11575 case Intrinsic::x86_sse42_pcmpistriz128: 11576 case Intrinsic::x86_sse42_pcmpestriz128: { 11577 unsigned Opcode; 11578 unsigned X86CC; 11579 switch (IntNo) { 11580 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 11581 case Intrinsic::x86_sse42_pcmpistria128: 11582 Opcode = X86ISD::PCMPISTRI; 11583 X86CC = X86::COND_A; 11584 break; 11585 case Intrinsic::x86_sse42_pcmpestria128: 11586 Opcode = X86ISD::PCMPESTRI; 11587 X86CC = X86::COND_A; 11588 break; 11589 case Intrinsic::x86_sse42_pcmpistric128: 11590 Opcode = X86ISD::PCMPISTRI; 11591 X86CC = X86::COND_B; 11592 break; 11593 case Intrinsic::x86_sse42_pcmpestric128: 11594 Opcode = X86ISD::PCMPESTRI; 11595 X86CC = X86::COND_B; 11596 break; 11597 case Intrinsic::x86_sse42_pcmpistrio128: 11598 Opcode = X86ISD::PCMPISTRI; 11599 X86CC = X86::COND_O; 11600 break; 11601 case Intrinsic::x86_sse42_pcmpestrio128: 11602 Opcode = X86ISD::PCMPESTRI; 11603 X86CC = X86::COND_O; 11604 break; 11605 case Intrinsic::x86_sse42_pcmpistris128: 11606 Opcode = X86ISD::PCMPISTRI; 11607 X86CC = X86::COND_S; 11608 break; 11609 case Intrinsic::x86_sse42_pcmpestris128: 11610 Opcode = X86ISD::PCMPESTRI; 11611 X86CC = X86::COND_S; 11612 break; 11613 case Intrinsic::x86_sse42_pcmpistriz128: 11614 Opcode = X86ISD::PCMPISTRI; 11615 X86CC = X86::COND_E; 11616 break; 11617 case Intrinsic::x86_sse42_pcmpestriz128: 11618 Opcode = X86ISD::PCMPESTRI; 11619 X86CC = X86::COND_E; 11620 break; 11621 } 11622 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 11623 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 11624 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); 11625 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 11626 DAG.getConstant(X86CC, MVT::i8), 11627 SDValue(PCMP.getNode(), 1)); 11628 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 11629 } 11630 11631 case Intrinsic::x86_sse42_pcmpistri128: 11632 case Intrinsic::x86_sse42_pcmpestri128: { 11633 unsigned Opcode; 11634 if (IntNo == Intrinsic::x86_sse42_pcmpistri128) 11635 Opcode = X86ISD::PCMPISTRI; 11636 else 11637 Opcode = X86ISD::PCMPESTRI; 11638 11639 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 11640 SDVTList VTs = DAG.getVTList(Op.getValueType(), 
MVT::i32); 11641 return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); 11642 } 11643 case Intrinsic::x86_fma_vfmadd_ps: 11644 case Intrinsic::x86_fma_vfmadd_pd: 11645 case Intrinsic::x86_fma_vfmsub_ps: 11646 case Intrinsic::x86_fma_vfmsub_pd: 11647 case Intrinsic::x86_fma_vfnmadd_ps: 11648 case Intrinsic::x86_fma_vfnmadd_pd: 11649 case Intrinsic::x86_fma_vfnmsub_ps: 11650 case Intrinsic::x86_fma_vfnmsub_pd: 11651 case Intrinsic::x86_fma_vfmaddsub_ps: 11652 case Intrinsic::x86_fma_vfmaddsub_pd: 11653 case Intrinsic::x86_fma_vfmsubadd_ps: 11654 case Intrinsic::x86_fma_vfmsubadd_pd: 11655 case Intrinsic::x86_fma_vfmadd_ps_256: 11656 case Intrinsic::x86_fma_vfmadd_pd_256: 11657 case Intrinsic::x86_fma_vfmsub_ps_256: 11658 case Intrinsic::x86_fma_vfmsub_pd_256: 11659 case Intrinsic::x86_fma_vfnmadd_ps_256: 11660 case Intrinsic::x86_fma_vfnmadd_pd_256: 11661 case Intrinsic::x86_fma_vfnmsub_ps_256: 11662 case Intrinsic::x86_fma_vfnmsub_pd_256: 11663 case Intrinsic::x86_fma_vfmaddsub_ps_256: 11664 case Intrinsic::x86_fma_vfmaddsub_pd_256: 11665 case Intrinsic::x86_fma_vfmsubadd_ps_256: 11666 case Intrinsic::x86_fma_vfmsubadd_pd_256: 11667 case Intrinsic::x86_fma_vfmadd_ps_512: 11668 case Intrinsic::x86_fma_vfmadd_pd_512: 11669 case Intrinsic::x86_fma_vfmsub_ps_512: 11670 case Intrinsic::x86_fma_vfmsub_pd_512: 11671 case Intrinsic::x86_fma_vfnmadd_ps_512: 11672 case Intrinsic::x86_fma_vfnmadd_pd_512: 11673 case Intrinsic::x86_fma_vfnmsub_ps_512: 11674 case Intrinsic::x86_fma_vfnmsub_pd_512: 11675 case Intrinsic::x86_fma_vfmaddsub_ps_512: 11676 case Intrinsic::x86_fma_vfmaddsub_pd_512: 11677 case Intrinsic::x86_fma_vfmsubadd_ps_512: 11678 case Intrinsic::x86_fma_vfmsubadd_pd_512: { 11679 unsigned Opc; 11680 switch (IntNo) { 11681 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
11682 case Intrinsic::x86_fma_vfmadd_ps: 11683 case Intrinsic::x86_fma_vfmadd_pd: 11684 case Intrinsic::x86_fma_vfmadd_ps_256: 11685 case Intrinsic::x86_fma_vfmadd_pd_256: 11686 case Intrinsic::x86_fma_vfmadd_ps_512: 11687 case Intrinsic::x86_fma_vfmadd_pd_512: 11688 Opc = X86ISD::FMADD; 11689 break; 11690 case Intrinsic::x86_fma_vfmsub_ps: 11691 case Intrinsic::x86_fma_vfmsub_pd: 11692 case Intrinsic::x86_fma_vfmsub_ps_256: 11693 case Intrinsic::x86_fma_vfmsub_pd_256: 11694 case Intrinsic::x86_fma_vfmsub_ps_512: 11695 case Intrinsic::x86_fma_vfmsub_pd_512: 11696 Opc = X86ISD::FMSUB; 11697 break; 11698 case Intrinsic::x86_fma_vfnmadd_ps: 11699 case Intrinsic::x86_fma_vfnmadd_pd: 11700 case Intrinsic::x86_fma_vfnmadd_ps_256: 11701 case Intrinsic::x86_fma_vfnmadd_pd_256: 11702 case Intrinsic::x86_fma_vfnmadd_ps_512: 11703 case Intrinsic::x86_fma_vfnmadd_pd_512: 11704 Opc = X86ISD::FNMADD; 11705 break; 11706 case Intrinsic::x86_fma_vfnmsub_ps: 11707 case Intrinsic::x86_fma_vfnmsub_pd: 11708 case Intrinsic::x86_fma_vfnmsub_ps_256: 11709 case Intrinsic::x86_fma_vfnmsub_pd_256: 11710 case Intrinsic::x86_fma_vfnmsub_ps_512: 11711 case Intrinsic::x86_fma_vfnmsub_pd_512: 11712 Opc = X86ISD::FNMSUB; 11713 break; 11714 case Intrinsic::x86_fma_vfmaddsub_ps: 11715 case Intrinsic::x86_fma_vfmaddsub_pd: 11716 case Intrinsic::x86_fma_vfmaddsub_ps_256: 11717 case Intrinsic::x86_fma_vfmaddsub_pd_256: 11718 case Intrinsic::x86_fma_vfmaddsub_ps_512: 11719 case Intrinsic::x86_fma_vfmaddsub_pd_512: 11720 Opc = X86ISD::FMADDSUB; 11721 break; 11722 case Intrinsic::x86_fma_vfmsubadd_ps: 11723 case Intrinsic::x86_fma_vfmsubadd_pd: 11724 case Intrinsic::x86_fma_vfmsubadd_ps_256: 11725 case Intrinsic::x86_fma_vfmsubadd_pd_256: 11726 case Intrinsic::x86_fma_vfmsubadd_ps_512: 11727 case Intrinsic::x86_fma_vfmsubadd_pd_512: 11728 Opc = X86ISD::FMSUBADD; 11729 break; 11730 } 11731 11732 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), 11733 Op.getOperand(2), Op.getOperand(3)); 11734 } 11735 } 11736} 11737 11738static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11739 SDValue Base, SDValue Index, 11740 SDValue ScaleOp, SDValue Chain, 11741 const X86Subtarget * Subtarget) { 11742 SDLoc dl(Op); 11743 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11744 assert(C && "Invalid scale type"); 11745 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11746 SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 11747 EVT MaskVT = MVT::getVectorVT(MVT::i1, 11748 Index.getValueType().getVectorNumElements()); 11749 SDValue MaskInReg = DAG.getConstant(~0, MaskVT); 11750 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); 11751 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11752 SDValue Segment = DAG.getRegister(0, MVT::i32); 11753 SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 11754 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 11755 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; 11756 return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); 11757} 11758 11759static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11760 SDValue Src, SDValue Mask, SDValue Base, 11761 SDValue Index, SDValue ScaleOp, SDValue Chain, 11762 const X86Subtarget * Subtarget) { 11763 SDLoc dl(Op); 11764 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11765 assert(C && "Invalid scale type"); 11766 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11767 EVT MaskVT = 
MVT::getVectorVT(MVT::i1, 11768 Index.getValueType().getVectorNumElements()); 11769 SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 11770 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); 11771 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11772 SDValue Segment = DAG.getRegister(0, MVT::i32); 11773 if (Src.getOpcode() == ISD::UNDEF) 11774 Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 11775 SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 11776 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 11777 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; 11778 return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); 11779} 11780 11781static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11782 SDValue Src, SDValue Base, SDValue Index, 11783 SDValue ScaleOp, SDValue Chain) { 11784 SDLoc dl(Op); 11785 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11786 assert(C && "Invalid scale type"); 11787 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11788 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11789 SDValue Segment = DAG.getRegister(0, MVT::i32); 11790 EVT MaskVT = MVT::getVectorVT(MVT::i1, 11791 Index.getValueType().getVectorNumElements()); 11792 SDValue MaskInReg = DAG.getConstant(~0, MaskVT); 11793 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); 11794 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; 11795 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 11796 return SDValue(Res, 1); 11797} 11798 11799static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11800 SDValue Src, SDValue Mask, SDValue Base, 11801 SDValue Index, SDValue ScaleOp, SDValue Chain) { 11802 SDLoc dl(Op); 11803 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11804 assert(C && "Invalid scale type"); 11805 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11806 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11807 SDValue Segment = DAG.getRegister(0, MVT::i32); 11808 EVT MaskVT = MVT::getVectorVT(MVT::i1, 11809 Index.getValueType().getVectorNumElements()); 11810 SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 11811 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); 11812 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; 11813 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 11814 return SDValue(Res, 1); 11815} 11816 11817static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, 11818 SelectionDAG &DAG) { 11819 SDLoc dl(Op); 11820 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 11821 switch (IntNo) { 11822 default: return SDValue(); // Don't custom lower most intrinsics. 11823 11824 // RDRAND/RDSEED intrinsics. 11825 case Intrinsic::x86_rdrand_16: 11826 case Intrinsic::x86_rdrand_32: 11827 case Intrinsic::x86_rdrand_64: 11828 case Intrinsic::x86_rdseed_16: 11829 case Intrinsic::x86_rdseed_32: 11830 case Intrinsic::x86_rdseed_64: { 11831 unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 || 11832 IntNo == Intrinsic::x86_rdseed_32 || 11833 IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED : 11834 X86ISD::RDRAND; 11835 // Emit the node with the right value type. 11836 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); 11837 SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0)); 11838 11839 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. 
11840 // Otherwise return the value from RDRAND/RDSEED, which is always 0, cast to i32. 11841 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), 11842 DAG.getConstant(1, Op->getValueType(1)), 11843 DAG.getConstant(X86::COND_B, MVT::i32), 11844 SDValue(Result.getNode(), 1) }; 11845 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, 11846 DAG.getVTList(Op->getValueType(1), MVT::Glue), 11847 Ops, array_lengthof(Ops)); 11848 11849 // Return { result, isValid, chain }. 11850 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, 11851 SDValue(Result.getNode(), 2)); 11852 } 11853 // int_gather(index, base, scale); 11854 case Intrinsic::x86_avx512_gather_qpd_512: 11855 case Intrinsic::x86_avx512_gather_qps_512: 11856 case Intrinsic::x86_avx512_gather_dpd_512: 11857 case Intrinsic::x86_avx512_gather_qpi_512: 11858 case Intrinsic::x86_avx512_gather_qpq_512: 11859 case Intrinsic::x86_avx512_gather_dpq_512: 11860 case Intrinsic::x86_avx512_gather_dps_512: 11861 case Intrinsic::x86_avx512_gather_dpi_512: { 11862 unsigned Opc; 11863 switch (IntNo) { 11864 default: llvm_unreachable("Unexpected intrinsic!"); 11865 case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break; 11866 case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break; 11867 case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break; 11868 case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break; 11869 case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break; 11870 case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break; 11871 case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break; 11872 case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break; 11873 } 11874 SDValue Chain = Op.getOperand(0); 11875 SDValue Index = Op.getOperand(2); 11876 SDValue Base = Op.getOperand(3); 11877 SDValue Scale = Op.getOperand(4); 11878 return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget); 11879 } 11880 // int_gather_mask(v1, mask, index, base, scale); 11881 case Intrinsic::x86_avx512_gather_qps_mask_512: 11882 case Intrinsic::x86_avx512_gather_qpd_mask_512: 11883 case Intrinsic::x86_avx512_gather_dpd_mask_512: 11884 case Intrinsic::x86_avx512_gather_dps_mask_512: 11885 case Intrinsic::x86_avx512_gather_qpi_mask_512: 11886 case Intrinsic::x86_avx512_gather_qpq_mask_512: 11887 case Intrinsic::x86_avx512_gather_dpi_mask_512: 11888 case Intrinsic::x86_avx512_gather_dpq_mask_512: { 11889 unsigned Opc; 11890 switch (IntNo) { 11891 default: llvm_unreachable("Unexpected intrinsic!"); 11892 case Intrinsic::x86_avx512_gather_qps_mask_512: 11893 Opc = X86::VGATHERQPSZrm; break; 11894 case Intrinsic::x86_avx512_gather_qpd_mask_512: 11895 Opc = X86::VGATHERQPDZrm; break; 11896 case Intrinsic::x86_avx512_gather_dpd_mask_512: 11897 Opc = X86::VGATHERDPDZrm; break; 11898 case Intrinsic::x86_avx512_gather_dps_mask_512: 11899 Opc = X86::VGATHERDPSZrm; break; 11900 case Intrinsic::x86_avx512_gather_qpi_mask_512: 11901 Opc = X86::VPGATHERQDZrm; break; 11902 case Intrinsic::x86_avx512_gather_qpq_mask_512: 11903 Opc = X86::VPGATHERQQZrm; break; 11904 case Intrinsic::x86_avx512_gather_dpi_mask_512: 11905 Opc = X86::VPGATHERDDZrm; break; 11906 case Intrinsic::x86_avx512_gather_dpq_mask_512: 11907 Opc = X86::VPGATHERDQZrm; break; 11908 } 11909 SDValue Chain = Op.getOperand(0); 11910 SDValue Src = Op.getOperand(2); 11911 SDValue Mask = Op.getOperand(3); 11912 SDValue Index = Op.getOperand(4);
11913 SDValue Base = Op.getOperand(5); 11914 SDValue Scale = Op.getOperand(6); 11915 return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain, 11916 Subtarget); 11917 } 11918 //int_scatter(base, index, v1, scale); 11919 case Intrinsic::x86_avx512_scatter_qpd_512: 11920 case Intrinsic::x86_avx512_scatter_qps_512: 11921 case Intrinsic::x86_avx512_scatter_dpd_512: 11922 case Intrinsic::x86_avx512_scatter_qpi_512: 11923 case Intrinsic::x86_avx512_scatter_qpq_512: 11924 case Intrinsic::x86_avx512_scatter_dpq_512: 11925 case Intrinsic::x86_avx512_scatter_dps_512: 11926 case Intrinsic::x86_avx512_scatter_dpi_512: { 11927 unsigned Opc; 11928 switch (IntNo) { 11929 default: llvm_unreachable("Unexpected intrinsic!"); 11930 case Intrinsic::x86_avx512_scatter_qpd_512: 11931 Opc = X86::VSCATTERQPDZmr; break; 11932 case Intrinsic::x86_avx512_scatter_qps_512: 11933 Opc = X86::VSCATTERQPSZmr; break; 11934 case Intrinsic::x86_avx512_scatter_dpd_512: 11935 Opc = X86::VSCATTERDPDZmr; break; 11936 case Intrinsic::x86_avx512_scatter_dps_512: 11937 Opc = X86::VSCATTERDPSZmr; break; 11938 case Intrinsic::x86_avx512_scatter_qpi_512: 11939 Opc = X86::VPSCATTERQDZmr; break; 11940 case Intrinsic::x86_avx512_scatter_qpq_512: 11941 Opc = X86::VPSCATTERQQZmr; break; 11942 case Intrinsic::x86_avx512_scatter_dpq_512: 11943 Opc = X86::VPSCATTERDQZmr; break; 11944 case Intrinsic::x86_avx512_scatter_dpi_512: 11945 Opc = X86::VPSCATTERDDZmr; break; 11946 } 11947 SDValue Chain = Op.getOperand(0); 11948 SDValue Base = Op.getOperand(2); 11949 SDValue Index = Op.getOperand(3); 11950 SDValue Src = Op.getOperand(4); 11951 SDValue Scale = Op.getOperand(5); 11952 return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain); 11953 } 11954 //int_scatter_mask(base, mask, index, v1, scale); 11955 case Intrinsic::x86_avx512_scatter_qps_mask_512: 11956 case Intrinsic::x86_avx512_scatter_qpd_mask_512: 11957 case Intrinsic::x86_avx512_scatter_dpd_mask_512: 11958 case Intrinsic::x86_avx512_scatter_dps_mask_512: 11959 case Intrinsic::x86_avx512_scatter_qpi_mask_512: 11960 case Intrinsic::x86_avx512_scatter_qpq_mask_512: 11961 case Intrinsic::x86_avx512_scatter_dpi_mask_512: 11962 case Intrinsic::x86_avx512_scatter_dpq_mask_512: { 11963 unsigned Opc; 11964 switch (IntNo) { 11965 default: llvm_unreachable("Unexpected intrinsic!"); 11966 case Intrinsic::x86_avx512_scatter_qpd_mask_512: 11967 Opc = X86::VSCATTERQPDZmr; break; 11968 case Intrinsic::x86_avx512_scatter_qps_mask_512: 11969 Opc = X86::VSCATTERQPSZmr; break; 11970 case Intrinsic::x86_avx512_scatter_dpd_mask_512: 11971 Opc = X86::VSCATTERDPDZmr; break; 11972 case Intrinsic::x86_avx512_scatter_dps_mask_512: 11973 Opc = X86::VSCATTERDPSZmr; break; 11974 case Intrinsic::x86_avx512_scatter_qpi_mask_512: 11975 Opc = X86::VPSCATTERQDZmr; break; 11976 case Intrinsic::x86_avx512_scatter_qpq_mask_512: 11977 Opc = X86::VPSCATTERQQZmr; break; 11978 case Intrinsic::x86_avx512_scatter_dpq_mask_512: 11979 Opc = X86::VPSCATTERDQZmr; break; 11980 case Intrinsic::x86_avx512_scatter_dpi_mask_512: 11981 Opc = X86::VPSCATTERDDZmr; break; 11982 } 11983 SDValue Chain = Op.getOperand(0); 11984 SDValue Base = Op.getOperand(2); 11985 SDValue Mask = Op.getOperand(3); 11986 SDValue Index = Op.getOperand(4); 11987 SDValue Src = Op.getOperand(5); 11988 SDValue Scale = Op.getOperand(6); 11989 return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain); 11990 } 11991 // XTEST intrinsics. 
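// XTEST clears ZF when executed inside an RTM/HLE transaction, so the
// lowering below returns setcc(NE) on ZF, i.e. 1 iff transactional
// execution is active.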
11992 case Intrinsic::x86_xtest: { 11993 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); 11994 SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0)); 11995 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 11996 DAG.getConstant(X86::COND_NE, MVT::i8), 11997 InTrans); 11998 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); 11999 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), 12000 Ret, SDValue(InTrans.getNode(), 1)); 12001 } 12002 } 12003} 12004 12005SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 12006 SelectionDAG &DAG) const { 12007 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 12008 MFI->setReturnAddressIsTaken(true); 12009 12010 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 12011 SDLoc dl(Op); 12012 EVT PtrVT = getPointerTy(); 12013 12014 if (Depth > 0) { 12015 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 12016 const X86RegisterInfo *RegInfo = 12017 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 12018 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); 12019 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 12020 DAG.getNode(ISD::ADD, dl, PtrVT, 12021 FrameAddr, Offset), 12022 MachinePointerInfo(), false, false, false, 0); 12023 } 12024 12025 // Just load the return address. 12026 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 12027 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 12028 RetAddrFI, MachinePointerInfo(), false, false, false, 0); 12029} 12030 12031SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 12032 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 12033 MFI->setFrameAddressIsTaken(true); 12034 12035 EVT VT = Op.getValueType(); 12036 SDLoc dl(Op); // FIXME probably not meaningful 12037 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 12038 const X86RegisterInfo *RegInfo = 12039 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 12040 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); 12041 assert(((FrameReg == X86::RBP && VT == MVT::i64) || 12042 (FrameReg == X86::EBP && VT == MVT::i32)) && 12043 "Invalid Frame Register!"); 12044 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 12045 while (Depth--) 12046 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 12047 MachinePointerInfo(), 12048 false, false, false, 0); 12049 return FrameAddr; 12050} 12051 12052SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 12053 SelectionDAG &DAG) const { 12054 const X86RegisterInfo *RegInfo = 12055 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 12056 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); 12057} 12058 12059SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 12060 SDValue Chain = Op.getOperand(0); 12061 SDValue Offset = Op.getOperand(1); 12062 SDValue Handler = Op.getOperand(2); 12063 SDLoc dl (Op); 12064 12065 EVT PtrVT = getPointerTy(); 12066 const X86RegisterInfo *RegInfo = 12067 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 12068 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); 12069 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || 12070 (FrameReg == X86::EBP && PtrVT == MVT::i32)) && 12071 "Invalid Frame Register!"); 12072 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); 12073 
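  // What follows computes the address of the caller's return-address slot:
  // one slot above the saved frame pointer, plus the offset supplied by the
  // landing pad, then stores the handler there and hands the slot's address
  // to EH_RETURN in RCX/ECX. An illustrative 64-bit frame layout:
  //   [Frame + 8 + Offset]  <- Handler is stored here
  //   [Frame + 8]           <- return address slot
  //   [Frame + 0]           <- saved RBP (Frame points here)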
unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; 12074 12075 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, 12076 DAG.getIntPtrConstant(RegInfo->getSlotSize())); 12077 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); 12078 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 12079 false, false, 0); 12080 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 12081 12082 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, 12083 DAG.getRegister(StoreAddrReg, PtrVT)); 12084} 12085 12086SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, 12087 SelectionDAG &DAG) const { 12088 SDLoc DL(Op); 12089 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, 12090 DAG.getVTList(MVT::i32, MVT::Other), 12091 Op.getOperand(0), Op.getOperand(1)); 12092} 12093 12094SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, 12095 SelectionDAG &DAG) const { 12096 SDLoc DL(Op); 12097 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, 12098 Op.getOperand(0), Op.getOperand(1)); 12099} 12100 12101static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { 12102 return Op.getOperand(0); 12103} 12104 12105SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 12106 SelectionDAG &DAG) const { 12107 SDValue Root = Op.getOperand(0); 12108 SDValue Trmp = Op.getOperand(1); // trampoline 12109 SDValue FPtr = Op.getOperand(2); // nested function 12110 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 12111 SDLoc dl (Op); 12112 12113 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 12114 const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); 12115 12116 if (Subtarget->is64Bit()) { 12117 SDValue OutChains[6]; 12118 12119 // Large code-model. 12120 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 12121 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 12122 12123 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; 12124 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; 12125 12126 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 12127 12128 // Load the pointer to the nested function into R11. 12129 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 12130 SDValue Addr = Trmp; 12131 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 12132 Addr, MachinePointerInfo(TrmpAddr), 12133 false, false, 0); 12134 12135 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 12136 DAG.getConstant(2, MVT::i64)); 12137 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 12138 MachinePointerInfo(TrmpAddr, 2), 12139 false, false, 2); 12140 12141 // Load the 'nest' parameter value into R10. 12142 // R10 is specified in X86CallingConv.td 12143 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 12144 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 12145 DAG.getConstant(10, MVT::i64)); 12146 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 12147 Addr, MachinePointerInfo(TrmpAddr, 10), 12148 false, false, 0); 12149 12150 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 12151 DAG.getConstant(12, MVT::i64)); 12152 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 12153 MachinePointerInfo(TrmpAddr, 12), 12154 false, false, 2); 12155 12156 // Jump to the nested function. 12157 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
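    // Taken together, the six stores assemble this 23-byte sequence at Trmp
    // (shown as it lands in memory; each little-endian i16 opcode store puts
    // the REX prefix byte first):
    //    0: 49 BB <FPtr:8>   movabsq $FPtr, %r11
    //   10: 49 BA <Nest:8>   movabsq $Nest, %r10
    //   20: 49 FF E3         jmpq   *%r11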
12158 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 12159 DAG.getConstant(20, MVT::i64)); 12160 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 12161 Addr, MachinePointerInfo(TrmpAddr, 20), 12162 false, false, 0); 12163 12164 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 12165 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 12166 DAG.getConstant(22, MVT::i64)); 12167 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 12168 MachinePointerInfo(TrmpAddr, 22), 12169 false, false, 0); 12170 12171 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); 12172 } else { 12173 const Function *Func = 12174 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 12175 CallingConv::ID CC = Func->getCallingConv(); 12176 unsigned NestReg; 12177 12178 switch (CC) { 12179 default: 12180 llvm_unreachable("Unsupported calling convention"); 12181 case CallingConv::C: 12182 case CallingConv::X86_StdCall: { 12183 // Pass 'nest' parameter in ECX. 12184 // Must be kept in sync with X86CallingConv.td 12185 NestReg = X86::ECX; 12186 12187 // Check that ECX wasn't needed by an 'inreg' parameter. 12188 FunctionType *FTy = Func->getFunctionType(); 12189 const AttributeSet &Attrs = Func->getAttributes(); 12190 12191 if (!Attrs.isEmpty() && !Func->isVarArg()) { 12192 unsigned InRegCount = 0; 12193 unsigned Idx = 1; 12194 12195 for (FunctionType::param_iterator I = FTy->param_begin(), 12196 E = FTy->param_end(); I != E; ++I, ++Idx) 12197 if (Attrs.hasAttribute(Idx, Attribute::InReg)) 12198 // FIXME: should only count parameters that are lowered to integers. 12199 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 12200 12201 if (InRegCount > 2) { 12202 report_fatal_error("Nest register in use - reduce number of inreg" 12203 " parameters!"); 12204 } 12205 } 12206 break; 12207 } 12208 case CallingConv::X86_FastCall: 12209 case CallingConv::X86_ThisCall: 12210 case CallingConv::Fast: 12211 // Pass 'nest' parameter in EAX. 12212 // Must be kept in sync with X86CallingConv.td 12213 NestReg = X86::EAX; 12214 break; 12215 } 12216 12217 SDValue OutChains[4]; 12218 SDValue Addr, Disp; 12219 12220 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 12221 DAG.getConstant(10, MVT::i32)); 12222 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 12223 12224 // This is storing the opcode for MOV32ri. 12225 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 12226 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; 12227 OutChains[0] = DAG.getStore(Root, dl, 12228 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 12229 Trmp, MachinePointerInfo(TrmpAddr), 12230 false, false, 0); 12231 12232 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 12233 DAG.getConstant(1, MVT::i32)); 12234 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 12235 MachinePointerInfo(TrmpAddr, 1), 12236 false, false, 1); 12237 12238 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
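    // The resulting 10-byte trampoline (shown for NestReg == ECX; EAX would
    // use opcode B8 instead of B9):
    //   0: B9 <Nest:4>   movl $Nest, %ecx
    //   5: E9 <Disp:4>   jmp  FPtr        ; Disp == FPtr - (Trmp + 10)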
12239 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 12240 DAG.getConstant(5, MVT::i32)); 12241 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 12242 MachinePointerInfo(TrmpAddr, 5), 12243 false, false, 1); 12244 12245 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 12246 DAG.getConstant(6, MVT::i32)); 12247 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 12248 MachinePointerInfo(TrmpAddr, 6), 12249 false, false, 1); 12250 12251 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); 12252 } 12253} 12254 12255SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 12256 SelectionDAG &DAG) const { 12257 /* 12258 The rounding mode is in bits 11:10 of the FPCW (x87 FP control word), and has the following 12259 settings: 12260 00 Round to nearest 12261 01 Round to -inf 12262 10 Round to +inf 12263 11 Round to 0 12264 12265 FLT_ROUNDS, on the other hand, expects the following: 12266 -1 Undefined 12267 0 Round to 0 12268 1 Round to nearest 12269 2 Round to +inf 12270 3 Round to -inf 12271 12272 To perform the conversion, we do: 12273 (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3) 12274 */ 12275 12276 MachineFunction &MF = DAG.getMachineFunction(); 12277 const TargetMachine &TM = MF.getTarget(); 12278 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 12279 unsigned StackAlignment = TFI.getStackAlignment(); 12280 EVT VT = Op.getValueType(); 12281 SDLoc DL(Op); 12282 12283 // Save FP Control Word to stack slot 12284 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 12285 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 12286 12287 MachineMemOperand *MMO = 12288 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 12289 MachineMemOperand::MOStore, 2, 2); 12290 12291 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 12292 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 12293 DAG.getVTList(MVT::Other), 12294 Ops, array_lengthof(Ops), MVT::i16, 12295 MMO); 12296 12297 // Load FP Control Word from stack slot 12298 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 12299 MachinePointerInfo(), false, false, false, 0); 12300 12301 // Transform as necessary 12302 SDValue CWD1 = 12303 DAG.getNode(ISD::SRL, DL, MVT::i16, 12304 DAG.getNode(ISD::AND, DL, MVT::i16, 12305 CWD, DAG.getConstant(0x800, MVT::i16)), 12306 DAG.getConstant(11, MVT::i8)); 12307 SDValue CWD2 = 12308 DAG.getNode(ISD::SRL, DL, MVT::i16, 12309 DAG.getNode(ISD::AND, DL, MVT::i16, 12310 CWD, DAG.getConstant(0x400, MVT::i16)), 12311 DAG.getConstant(9, MVT::i8)); 12312 12313 SDValue RetVal = 12314 DAG.getNode(ISD::AND, DL, MVT::i16, 12315 DAG.getNode(ISD::ADD, DL, MVT::i16, 12316 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 12317 DAG.getConstant(1, MVT::i16)), 12318 DAG.getConstant(3, MVT::i16)); 12319 12320 return DAG.getNode((VT.getSizeInBits() < 16 ? 12321 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 12322} 12323 12324static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 12325 EVT VT = Op.getValueType(); 12326 EVT OpVT = VT; 12327 unsigned NumBits = VT.getSizeInBits(); 12328 SDLoc dl(Op); 12329 12330 Op = Op.getOperand(0); 12331 if (VT == MVT::i8) { 12332 // Zero extend to i32 since there is not an i8 bsr. 12333 OpVT = MVT::i32; 12334 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 12335 } 12336 12337 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 12338 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 12339 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 12340 12341 // If src is zero (i.e.
bsr sets ZF), returns NumBits. 12342 SDValue Ops[] = { 12343 Op, 12344 DAG.getConstant(NumBits+NumBits-1, OpVT), 12345 DAG.getConstant(X86::COND_E, MVT::i8), 12346 Op.getValue(1) 12347 }; 12348 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 12349 12350 // Finally xor with NumBits-1. 12351 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 12352 12353 if (VT == MVT::i8) 12354 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 12355 return Op; 12356} 12357 12358static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { 12359 EVT VT = Op.getValueType(); 12360 EVT OpVT = VT; 12361 unsigned NumBits = VT.getSizeInBits(); 12362 SDLoc dl(Op); 12363 12364 Op = Op.getOperand(0); 12365 if (VT == MVT::i8) { 12366 // Zero extend to i32 since there is not an i8 bsr. 12367 OpVT = MVT::i32; 12368 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 12369 } 12370 12371 // Issue a bsr (scan bits in reverse). 12372 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 12373 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 12374 12375 // And xor with NumBits-1. 12376 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 12377 12378 if (VT == MVT::i8) 12379 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 12380 return Op; 12381} 12382 12383static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 12384 EVT VT = Op.getValueType(); 12385 unsigned NumBits = VT.getSizeInBits(); 12386 SDLoc dl(Op); 12387 Op = Op.getOperand(0); 12388 12389 // Issue a bsf (scan bits forward) which also sets EFLAGS. 12390 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 12391 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 12392 12393 // If src is zero (i.e. bsf sets ZF), returns NumBits. 12394 SDValue Ops[] = { 12395 Op, 12396 DAG.getConstant(NumBits, VT), 12397 DAG.getConstant(X86::COND_E, MVT::i8), 12398 Op.getValue(1) 12399 }; 12400 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); 12401} 12402 12403// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit 12404// ones, and then concatenate the result back. 
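// For example (illustrative), a v8i32 add becomes:
//   lo  = add v4i32 (extract_subvector LHS, 0), (extract_subvector RHS, 0)
//   hi  = add v4i32 (extract_subvector LHS, 4), (extract_subvector RHS, 4)
//   res = concat_vectors lo, hi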
12405static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 12406 EVT VT = Op.getValueType(); 12407 12408 assert(VT.is256BitVector() && VT.isInteger() && 12409 "Unsupported value type for operation"); 12410 12411 unsigned NumElems = VT.getVectorNumElements(); 12412 SDLoc dl(Op); 12413 12414 // Extract the LHS vectors 12415 SDValue LHS = Op.getOperand(0); 12416 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 12417 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 12418 12419 // Extract the RHS vectors 12420 SDValue RHS = Op.getOperand(1); 12421 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 12422 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 12423 12424 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 12425 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 12426 12427 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 12428 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 12429 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 12430} 12431 12432static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { 12433 assert(Op.getValueType().is256BitVector() && 12434 Op.getValueType().isInteger() && 12435 "Only handle AVX 256-bit vector integer operation"); 12436 return Lower256IntArith(Op, DAG); 12437} 12438 12439static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { 12440 assert(Op.getValueType().is256BitVector() && 12441 Op.getValueType().isInteger() && 12442 "Only handle AVX 256-bit vector integer operation"); 12443 return Lower256IntArith(Op, DAG); 12444} 12445 12446static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, 12447 SelectionDAG &DAG) { 12448 SDLoc dl(Op); 12449 EVT VT = Op.getValueType(); 12450 12451 // Decompose 256-bit ops into smaller 128-bit ops. 12452 if (VT.is256BitVector() && !Subtarget->hasInt256()) 12453 return Lower256IntArith(Op, DAG); 12454 12455 SDValue A = Op.getOperand(0); 12456 SDValue B = Op.getOperand(1); 12457 12458 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. 12459 if (VT == MVT::v4i32) { 12460 assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && 12461 "Should not custom lower when pmuldq is available!"); 12462 12463 // Extract the odd parts. 12464 static const int UnpackMask[] = { 1, -1, 3, -1 }; 12465 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); 12466 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); 12467 12468 // Multiply the even parts. 12469 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); 12470 // Now multiply odd parts. 12471 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); 12472 12473 Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens); 12474 Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds); 12475 12476 // Merge the two vectors back together with a shuffle. This expands into 2 12477 // shuffles. 
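    // Illustrative lane view: with A = <a0,a1,a2,a3> and B = <b0,b1,b2,b3>,
    // the two PMULUDQ results, reinterpreted as v4i32, are
    //   Evens = <lo(a0*b0), hi(a0*b0), lo(a2*b2), hi(a2*b2)>
    //   Odds  = <lo(a1*b1), hi(a1*b1), lo(a3*b3), hi(a3*b3)>
    // so selecting lanes <0,4,2,6> yields the four low halves in order.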
12478 static const int ShufMask[] = { 0, 4, 2, 6 }; 12479 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); 12480 } 12481 12482 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && 12483 "Only know how to lower V2I64/V4I64/V8I64 multiply"); 12484 12485 // Ahi = psrlqi(a, 32); 12486 // Bhi = psrlqi(b, 32); 12487 // 12488 // AloBlo = pmuludq(a, b); 12489 // AloBhi = pmuludq(a, Bhi); 12490 // AhiBlo = pmuludq(Ahi, b); 12491 12492 // AloBhi = psllqi(AloBhi, 32); 12493 // AhiBlo = psllqi(AhiBlo, 32); 12494 // return AloBlo + AloBhi + AhiBlo; 12495 12496 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); 12497 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); 12498 12499 // Bit cast to 32-bit vectors for MULUDQ 12500 EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : 12501 (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; 12502 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); 12503 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); 12504 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); 12505 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); 12506 12507 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); 12508 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); 12509 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); 12510 12511 AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); 12512 AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); 12513 12514 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 12515 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 12516} 12517 12518static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { 12519 EVT VT = Op.getValueType(); 12520 EVT EltTy = VT.getVectorElementType(); 12521 unsigned NumElts = VT.getVectorNumElements(); 12522 SDValue N0 = Op.getOperand(0); 12523 SDLoc dl(Op); 12524 12525 // Lower sdiv X, pow2-const. 12526 BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1)); 12527 if (!C) 12528 return SDValue(); 12529 12530 APInt SplatValue, SplatUndef; 12531 unsigned SplatBitSize; 12532 bool HasAnyUndefs; 12533 if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, 12534 HasAnyUndefs) || 12535 EltTy.getSizeInBits() < SplatBitSize) 12536 return SDValue(); 12537 12538 if ((SplatValue != 0) && 12539 (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { 12540 unsigned Lg2 = SplatValue.countTrailingZeros(); 12541 // Splat the sign bit. 12542 SmallVector<SDValue, 16> Sz(NumElts, 12543 DAG.getConstant(EltTy.getSizeInBits() - 1, 12544 EltTy)); 12545 SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0, 12546 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0], 12547 NumElts)); 12548 // Add (N0 < 0) ? abs2 - 1 : 0; 12549 SmallVector<SDValue, 16> Amt(NumElts, 12550 DAG.getConstant(EltTy.getSizeInBits() - Lg2, 12551 EltTy)); 12552 SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN, 12553 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0], 12554 NumElts)); 12555 SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); 12556 SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy)); 12557 SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD, 12558 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0], 12559 NumElts)); 12560 12561 // If we're dividing by a positive value, we're done. Otherwise, we must 12562 // negate the result. 
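    // To recap the sequence just built as a worked example (illustrative),
    // for i32 lanes and a splat divisor of 8 (Lg2 == 3):
    //   SGN = x >> 31          ; all-ones if x < 0, else 0
    //   SRL = SGN u>> 29       ; adds abs2 - 1 == 7 only when x < 0
    //   SRA = (x + SRL) >> 3   ; truncates toward zero, as sdiv requires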
12563 if (SplatValue.isNonNegative()) 12564 return SRA; 12565 12566 SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy)); 12567 SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts); 12568 return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA); 12569 } 12570 return SDValue(); 12571} 12572 12573static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, 12574 const X86Subtarget *Subtarget) { 12575 EVT VT = Op.getValueType(); 12576 SDLoc dl(Op); 12577 SDValue R = Op.getOperand(0); 12578 SDValue Amt = Op.getOperand(1); 12579 12580 // Optimize shl/srl/sra with constant shift amount. 12581 if (isSplatVector(Amt.getNode())) { 12582 SDValue SclrAmt = Amt->getOperand(0); 12583 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 12584 uint64_t ShiftAmt = C->getZExtValue(); 12585 12586 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 12587 (Subtarget->hasInt256() && 12588 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) || 12589 (Subtarget->hasAVX512() && 12590 (VT == MVT::v8i64 || VT == MVT::v16i32))) { 12591 if (Op.getOpcode() == ISD::SHL) 12592 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, 12593 DAG); 12594 if (Op.getOpcode() == ISD::SRL) 12595 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, 12596 DAG); 12597 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) 12598 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, 12599 DAG); 12600 } 12601 12602 if (VT == MVT::v16i8) { 12603 if (Op.getOpcode() == ISD::SHL) { 12604 // Make a large shift. 12605 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, 12606 MVT::v8i16, R, ShiftAmt, 12607 DAG); 12608 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 12609 // Zero out the rightmost bits. 12610 SmallVector<SDValue, 16> V(16, 12611 DAG.getConstant(uint8_t(-1U << ShiftAmt), 12612 MVT::i8)); 12613 return DAG.getNode(ISD::AND, dl, VT, SHL, 12614 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 12615 } 12616 if (Op.getOpcode() == ISD::SRL) { 12617 // Make a large shift. 12618 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, 12619 MVT::v8i16, R, ShiftAmt, 12620 DAG); 12621 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 12622 // Zero out the leftmost bits. 12623 SmallVector<SDValue, 16> V(16, 12624 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 12625 MVT::i8)); 12626 return DAG.getNode(ISD::AND, dl, VT, SRL, 12627 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 12628 } 12629 if (Op.getOpcode() == ISD::SRA) { 12630 if (ShiftAmt == 7) { 12631 // R s>> 7 === R s< 0 12632 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 12633 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 12634 } 12635 12636 // R s>> a === ((R u>> a) ^ m) - m 12637 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 12638 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, 12639 MVT::i8)); 12640 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); 12641 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 12642 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 12643 return Res; 12644 } 12645 llvm_unreachable("Unknown shift opcode."); 12646 } 12647 12648 if (Subtarget->hasInt256() && VT == MVT::v32i8) { 12649 if (Op.getOpcode() == ISD::SHL) { 12650 // Make a large shift. 12651 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, 12652 MVT::v16i16, R, ShiftAmt, 12653 DAG); 12654 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 12655 // Zero out the rightmost bits. 
12656 SmallVector<SDValue, 32> V(32, 12657 DAG.getConstant(uint8_t(-1U << ShiftAmt), 12658 MVT::i8)); 12659 return DAG.getNode(ISD::AND, dl, VT, SHL, 12660 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 12661 } 12662 if (Op.getOpcode() == ISD::SRL) { 12663 // Make a large shift. 12664 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, 12665 MVT::v16i16, R, ShiftAmt, 12666 DAG); 12667 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 12668 // Zero out the leftmost bits. 12669 SmallVector<SDValue, 32> V(32, 12670 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 12671 MVT::i8)); 12672 return DAG.getNode(ISD::AND, dl, VT, SRL, 12673 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 12674 } 12675 if (Op.getOpcode() == ISD::SRA) { 12676 if (ShiftAmt == 7) { 12677 // R s>> 7 === R s< 0 12678 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 12679 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 12680 } 12681 12682 // R s>> a === ((R u>> a) ^ m) - m 12683 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 12684 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, 12685 MVT::i8)); 12686 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); 12687 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 12688 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 12689 return Res; 12690 } 12691 llvm_unreachable("Unknown shift opcode."); 12692 } 12693 } 12694 } 12695 12696 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 12697 if (!Subtarget->is64Bit() && 12698 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && 12699 Amt.getOpcode() == ISD::BITCAST && 12700 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 12701 Amt = Amt.getOperand(0); 12702 unsigned Ratio = Amt.getValueType().getVectorNumElements() / 12703 VT.getVectorNumElements(); 12704 unsigned RatioInLog2 = Log2_32_Ceil(Ratio); 12705 uint64_t ShiftAmt = 0; 12706 for (unsigned i = 0; i != Ratio; ++i) { 12707 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); 12708 if (C == 0) 12709 return SDValue(); 12710 // 6 == Log2(64) 12711 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); 12712 } 12713 // Check remaining shift amounts. 
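    // Every group of Ratio narrow elements must encode the same 64-bit
    // amount, because the VSHLI/VSRLI/VSRAI nodes built below shift all
    // lanes by a single splat immediate.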
12714 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 12715 uint64_t ShAmt = 0; 12716 for (unsigned j = 0; j != Ratio; ++j) { 12717 ConstantSDNode *C = 12718 dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); 12719 if (C == 0) 12720 return SDValue(); 12721 // 6 == Log2(64) 12722 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); 12723 } 12724 if (ShAmt != ShiftAmt) 12725 return SDValue(); 12726 } 12727 switch (Op.getOpcode()) { 12728 default: 12729 llvm_unreachable("Unknown shift opcode!"); 12730 case ISD::SHL: 12731 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, 12732 DAG); 12733 case ISD::SRL: 12734 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, 12735 DAG); 12736 case ISD::SRA: 12737 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, 12738 DAG); 12739 } 12740 } 12741 12742 return SDValue(); 12743} 12744 12745static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, 12746 const X86Subtarget* Subtarget) { 12747 EVT VT = Op.getValueType(); 12748 SDLoc dl(Op); 12749 SDValue R = Op.getOperand(0); 12750 SDValue Amt = Op.getOperand(1); 12751 12752 if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) || 12753 VT == MVT::v4i32 || VT == MVT::v8i16 || 12754 (Subtarget->hasInt256() && 12755 ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) || 12756 VT == MVT::v8i32 || VT == MVT::v16i16)) || 12757 (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) { 12758 SDValue BaseShAmt; 12759 EVT EltVT = VT.getVectorElementType(); 12760 12761 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 12762 unsigned NumElts = VT.getVectorNumElements(); 12763 unsigned i, j; 12764 for (i = 0; i != NumElts; ++i) { 12765 if (Amt.getOperand(i).getOpcode() == ISD::UNDEF) 12766 continue; 12767 break; 12768 } 12769 for (j = i; j != NumElts; ++j) { 12770 SDValue Arg = Amt.getOperand(j); 12771 if (Arg.getOpcode() == ISD::UNDEF) continue; 12772 if (Arg != Amt.getOperand(i)) 12773 break; 12774 } 12775 if (i != NumElts && j == NumElts) 12776 BaseShAmt = Amt.getOperand(i); 12777 } else { 12778 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) 12779 Amt = Amt.getOperand(0); 12780 if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE && 12781 cast<ShuffleVectorSDNode>(Amt)->isSplat()) { 12782 SDValue InVec = Amt.getOperand(0); 12783 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 12784 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 12785 unsigned i = 0; 12786 for (; i != NumElts; ++i) { 12787 SDValue Arg = InVec.getOperand(i); 12788 if (Arg.getOpcode() == ISD::UNDEF) continue; 12789 BaseShAmt = Arg; 12790 break; 12791 } 12792 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 12793 if (ConstantSDNode *C = 12794 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 12795 unsigned SplatIdx = 12796 cast<ShuffleVectorSDNode>(Amt)->getSplatIndex(); 12797 if (C->getZExtValue() == SplatIdx) 12798 BaseShAmt = InVec.getOperand(1); 12799 } 12800 } 12801 if (BaseShAmt.getNode() == 0) 12802 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, 12803 DAG.getIntPtrConstant(0)); 12804 } 12805 } 12806 12807 if (BaseShAmt.getNode()) { 12808 if (EltVT.bitsGT(MVT::i32)) 12809 BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt); 12810 else if (EltVT.bitsLT(MVT::i32)) 12811 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); 12812 12813 switch (Op.getOpcode()) { 12814 default: 12815 llvm_unreachable("Unknown shift opcode!"); 12816 case ISD::SHL: 12817 switch (VT.getSimpleVT().SimpleTy) { 12818 default: 
return SDValue(); 12819 case MVT::v2i64: 12820 case MVT::v4i32: 12821 case MVT::v8i16: 12822 case MVT::v4i64: 12823 case MVT::v8i32: 12824 case MVT::v16i16: 12825 case MVT::v16i32: 12826 case MVT::v8i64: 12827 return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG); 12828 } 12829 case ISD::SRA: 12830 switch (VT.getSimpleVT().SimpleTy) { 12831 default: return SDValue(); 12832 case MVT::v4i32: 12833 case MVT::v8i16: 12834 case MVT::v8i32: 12835 case MVT::v16i16: 12836 case MVT::v16i32: 12837 case MVT::v8i64: 12838 return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG); 12839 } 12840 case ISD::SRL: 12841 switch (VT.getSimpleVT().SimpleTy) { 12842 default: return SDValue(); 12843 case MVT::v2i64: 12844 case MVT::v4i32: 12845 case MVT::v8i16: 12846 case MVT::v4i64: 12847 case MVT::v8i32: 12848 case MVT::v16i16: 12849 case MVT::v16i32: 12850 case MVT::v8i64: 12851 return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG); 12852 } 12853 } 12854 } 12855 } 12856 12857 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 12858 if (!Subtarget->is64Bit() && 12859 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) || 12860 (Subtarget->hasAVX512() && VT == MVT::v8i64)) && 12861 Amt.getOpcode() == ISD::BITCAST && 12862 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 12863 Amt = Amt.getOperand(0); 12864 unsigned Ratio = Amt.getValueType().getVectorNumElements() / 12865 VT.getVectorNumElements(); 12866 std::vector<SDValue> Vals(Ratio); 12867 for (unsigned i = 0; i != Ratio; ++i) 12868 Vals[i] = Amt.getOperand(i); 12869 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 12870 for (unsigned j = 0; j != Ratio; ++j) 12871 if (Vals[j] != Amt.getOperand(i + j)) 12872 return SDValue(); 12873 } 12874 switch (Op.getOpcode()) { 12875 default: 12876 llvm_unreachable("Unknown shift opcode!"); 12877 case ISD::SHL: 12878 return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1)); 12879 case ISD::SRL: 12880 return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1)); 12881 case ISD::SRA: 12882 return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1)); 12883 } 12884 } 12885 12886 return SDValue(); 12887} 12888 12889static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, 12890 SelectionDAG &DAG) { 12891 12892 EVT VT = Op.getValueType(); 12893 SDLoc dl(Op); 12894 SDValue R = Op.getOperand(0); 12895 SDValue Amt = Op.getOperand(1); 12896 SDValue V; 12897 12898 if (!Subtarget->hasSSE2()) 12899 return SDValue(); 12900 12901 V = LowerScalarImmediateShift(Op, DAG, Subtarget); 12902 if (V.getNode()) 12903 return V; 12904 12905 V = LowerScalarVariableShift(Op, DAG, Subtarget); 12906 if (V.getNode()) 12907 return V; 12908 12909 if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64)) 12910 return Op; 12911 // AVX2 has VPSLLV/VPSRAV/VPSRLV. 12912 if (Subtarget->hasInt256()) { 12913 if (Op.getOpcode() == ISD::SRL && 12914 (VT == MVT::v2i64 || VT == MVT::v4i32 || 12915 VT == MVT::v4i64 || VT == MVT::v8i32)) 12916 return Op; 12917 if (Op.getOpcode() == ISD::SHL && 12918 (VT == MVT::v2i64 || VT == MVT::v4i32 || 12919 VT == MVT::v4i64 || VT == MVT::v8i32)) 12920 return Op; 12921 if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32)) 12922 return Op; 12923 } 12924 12925 // Lower SHL with variable shift amount. 
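  // The v4i32 SHL below uses an FP trick: (Amt << 23) + 0x3f800000 builds,
  // per lane, the IEEE-754 single-precision value 1.0 * 2^Amt (Amt lands in
  // the exponent field), so converting back to integer yields 1 << Amt and
  // the variable shift becomes a multiply. Illustrative, for Amt == 5:
  //   (5 << 23) + 0x3f800000 == 0x42000000 == 32.0f, and (int)32.0f == 32.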
12926 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 12927 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); 12928 12929 Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT)); 12930 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 12931 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 12932 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 12933 } 12934 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 12935 assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); 12936 12937 // a = a << 5; 12938 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT)); 12939 Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); 12940 12941 // Turn 'a' into a mask suitable for VSELECT 12942 SDValue VSelM = DAG.getConstant(0x80, VT); 12943 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 12944 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 12945 12946 SDValue CM1 = DAG.getConstant(0x0f, VT); 12947 SDValue CM2 = DAG.getConstant(0x3f, VT); 12948 12949 // r = VSELECT(r, psllw(r & (char16)15, 4), a); 12950 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); 12951 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG); 12952 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 12953 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 12954 12955 // a += a 12956 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 12957 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 12958 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 12959 12960 // r = VSELECT(r, psllw(r & (char16)63, 2), a); 12961 M = DAG.getNode(ISD::AND, dl, VT, R, CM2); 12962 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG); 12963 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 12964 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 12965 12966 // a += a 12967 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 12968 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 12969 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 12970 12971 // return VSELECT(r, r+r, a); 12972 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, 12973 DAG.getNode(ISD::ADD, dl, VT, R, R), R); 12974 return R; 12975 } 12976 12977 // Decompose 256-bit shifts into smaller 128-bit shifts. 
12978 if (VT.is256BitVector()) { 12979 unsigned NumElems = VT.getVectorNumElements(); 12980 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 12981 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 12982 12983 // Extract the two vectors 12984 SDValue V1 = Extract128BitVector(R, 0, DAG, dl); 12985 SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); 12986 12987 // Recreate the shift amount vectors 12988 SDValue Amt1, Amt2; 12989 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 12990 // Constant shift amount 12991 SmallVector<SDValue, 4> Amt1Csts; 12992 SmallVector<SDValue, 4> Amt2Csts; 12993 for (unsigned i = 0; i != NumElems/2; ++i) 12994 Amt1Csts.push_back(Amt->getOperand(i)); 12995 for (unsigned i = NumElems/2; i != NumElems; ++i) 12996 Amt2Csts.push_back(Amt->getOperand(i)); 12997 12998 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 12999 &Amt1Csts[0], NumElems/2); 13000 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 13001 &Amt2Csts[0], NumElems/2); 13002 } else { 13003 // Variable shift amount 13004 Amt1 = Extract128BitVector(Amt, 0, DAG, dl); 13005 Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); 13006 } 13007 13008 // Issue new vector shifts for the smaller types 13009 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); 13010 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); 13011 13012 // Concatenate the result back 13013 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); 13014 } 13015 13016 return SDValue(); 13017} 13018 13019static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 13020 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus 13021 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 13022 // looks for this combo and may remove the "setcc" instruction if the "setcc" 13023 // has only one use. 13024 SDNode *N = Op.getNode(); 13025 SDValue LHS = N->getOperand(0); 13026 SDValue RHS = N->getOperand(1); 13027 unsigned BaseOp = 0; 13028 unsigned Cond = 0; 13029 SDLoc DL(Op); 13030 switch (Op.getOpcode()) { 13031 default: llvm_unreachable("Unknown ovf instruction!"); 13032 case ISD::SADDO: 13033 // An add of one will be selected as an INC. Note that INC doesn't 13034 // set CF, so we can't do this for UADDO. 13035 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 13036 if (C->isOne()) { 13037 BaseOp = X86ISD::INC; 13038 Cond = X86::COND_O; 13039 break; 13040 } 13041 BaseOp = X86ISD::ADD; 13042 Cond = X86::COND_O; 13043 break; 13044 case ISD::UADDO: 13045 BaseOp = X86ISD::ADD; 13046 Cond = X86::COND_B; 13047 break; 13048 case ISD::SSUBO: 13049 // A subtract of one will be selected as a DEC. Note that DEC doesn't 13050 // set CF, so we can't do this for USUBO.
13051 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 13052 if (C->isOne()) { 13053 BaseOp = X86ISD::DEC; 13054 Cond = X86::COND_O; 13055 break; 13056 } 13057 BaseOp = X86ISD::SUB; 13058 Cond = X86::COND_O; 13059 break; 13060 case ISD::USUBO: 13061 BaseOp = X86ISD::SUB; 13062 Cond = X86::COND_B; 13063 break; 13064 case ISD::SMULO: 13065 BaseOp = X86ISD::SMUL; 13066 Cond = X86::COND_O; 13067 break; 13068 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 13069 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 13070 MVT::i32); 13071 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 13072 13073 SDValue SetCC = 13074 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 13075 DAG.getConstant(X86::COND_O, MVT::i32), 13076 SDValue(Sum.getNode(), 2)); 13077 13078 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 13079 } 13080 } 13081 13082 // Also sets EFLAGS. 13083 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 13084 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 13085 13086 SDValue SetCC = 13087 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 13088 DAG.getConstant(Cond, MVT::i32), 13089 SDValue(Sum.getNode(), 1)); 13090 13091 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 13092} 13093 13094SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 13095 SelectionDAG &DAG) const { 13096 SDLoc dl(Op); 13097 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 13098 EVT VT = Op.getValueType(); 13099 13100 if (!Subtarget->hasSSE2() || !VT.isVector()) 13101 return SDValue(); 13102 13103 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 13104 ExtraVT.getScalarType().getSizeInBits(); 13105 13106 switch (VT.getSimpleVT().SimpleTy) { 13107 default: return SDValue(); 13108 case MVT::v8i32: 13109 case MVT::v16i16: 13110 if (!Subtarget->hasFp256()) 13111 return SDValue(); 13112 if (!Subtarget->hasInt256()) { 13113 // needs to be split 13114 unsigned NumElems = VT.getVectorNumElements(); 13115 13116 // Extract the LHS vectors 13117 SDValue LHS = Op.getOperand(0); 13118 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 13119 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 13120 13121 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 13122 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 13123 13124 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 13125 unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); 13126 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, 13127 ExtraNumElems/2); 13128 SDValue Extra = DAG.getValueType(ExtraVT); 13129 13130 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); 13131 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); 13132 13133 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); 13134 } 13135 // fall through 13136 case MVT::v4i32: 13137 case MVT::v8i16: { 13138 SDValue Op0 = Op.getOperand(0); 13139 SDValue Op00 = Op0.getOperand(0); 13140 SDValue Tmp1; 13141 // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. 13142 if (Op0.getOpcode() == ISD::BITCAST && 13143 Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { 13144 // (sext (vzext x)) -> (vsext x) 13145 Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG); 13146 if (Tmp1.getNode()) { 13147 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 13148 // This folding is only valid when the in-reg type is a vector of i8, 13149 // i16, or i32. 
13150 if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || 13151 ExtraEltVT == MVT::i32) { 13152 SDValue Tmp1Op0 = Tmp1.getOperand(0); 13153 assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && 13154 "This optimization is invalid without a VZEXT."); 13155 return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); 13156 } 13157 Op0 = Tmp1; 13158 } 13159 } 13160 13161 // If the above didn't work, then just use Shift-Left + Shift-Right. 13162 Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff, 13163 DAG); 13164 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff, 13165 DAG); 13166 } 13167 } 13168} 13169 13170static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, 13171 SelectionDAG &DAG) { 13172 SDLoc dl(Op); 13173 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 13174 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 13175 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 13176 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 13177 13178 // The only fence that needs an instruction is a sequentially-consistent 13179 // cross-thread fence. 13180 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 13181 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 13182 // no-sse2). There isn't any reason to disable it if the target processor 13183 // supports it. 13184 if (Subtarget->hasSSE2() || Subtarget->is64Bit()) 13185 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 13186 13187 SDValue Chain = Op.getOperand(0); 13188 SDValue Zero = DAG.getConstant(0, MVT::i32); 13189 SDValue Ops[] = { 13190 DAG.getRegister(X86::ESP, MVT::i32), // Base 13191 DAG.getTargetConstant(1, MVT::i8), // Scale 13192 DAG.getRegister(0, MVT::i32), // Index 13193 DAG.getTargetConstant(0, MVT::i32), // Disp 13194 DAG.getRegister(0, MVT::i32), // Segment. 13195 Zero, 13196 Chain 13197 }; 13198 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); 13199 return SDValue(Res, 0); 13200 } 13201 13202 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
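  // (The node still carries a chain operand, so other memory operations are
  // not reordered across it; it just emits no machine instruction.)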
13203 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 13204} 13205 13206static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, 13207 SelectionDAG &DAG) { 13208 EVT T = Op.getValueType(); 13209 SDLoc DL(Op); 13210 unsigned Reg = 0; 13211 unsigned size = 0; 13212 switch(T.getSimpleVT().SimpleTy) { 13213 default: llvm_unreachable("Invalid value type!"); 13214 case MVT::i8: Reg = X86::AL; size = 1; break; 13215 case MVT::i16: Reg = X86::AX; size = 2; break; 13216 case MVT::i32: Reg = X86::EAX; size = 4; break; 13217 case MVT::i64: 13218 assert(Subtarget->is64Bit() && "Node not type legal!"); 13219 Reg = X86::RAX; size = 8; 13220 break; 13221 } 13222 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 13223 Op.getOperand(2), SDValue()); 13224 SDValue Ops[] = { cpIn.getValue(0), 13225 Op.getOperand(1), 13226 Op.getOperand(3), 13227 DAG.getTargetConstant(size, MVT::i8), 13228 cpIn.getValue(1) }; 13229 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 13230 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 13231 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 13232 Ops, array_lengthof(Ops), T, MMO); 13233 SDValue cpOut = 13234 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 13235 return cpOut; 13236} 13237 13238static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, 13239 SelectionDAG &DAG) { 13240 assert(Subtarget->is64Bit() && "Result not type legalized?"); 13241 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 13242 SDValue TheChain = Op.getOperand(0); 13243 SDLoc dl(Op); 13244 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 13245 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 13246 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 13247 rax.getValue(2)); 13248 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 13249 DAG.getConstant(32, MVT::i8)); 13250 SDValue Ops[] = { 13251 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 13252 rdx.getValue(1) 13253 }; 13254 return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); 13255} 13256 13257static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, 13258 SelectionDAG &DAG) { 13259 MVT SrcVT = Op.getOperand(0).getSimpleValueType(); 13260 MVT DstVT = Op.getSimpleValueType(); 13261 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 13262 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 13263 assert((DstVT == MVT::i64 || 13264 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 13265 "Unexpected custom BITCAST"); 13266 // i64 <=> MMX conversions are Legal. 13267 if (SrcVT==MVT::i64 && DstVT.isVector()) 13268 return Op; 13269 if (DstVT==MVT::i64 && SrcVT.isVector()) 13270 return Op; 13271 // MMX <=> MMX conversions are Legal. 13272 if (SrcVT.isVector() && DstVT.isVector()) 13273 return Op; 13274 // All other conversions need to be expanded. 
13275 return SDValue(); 13276} 13277 13278static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 13279 SDNode *Node = Op.getNode(); 13280 SDLoc dl(Node); 13281 EVT T = Node->getValueType(0); 13282 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 13283 DAG.getConstant(0, T), Node->getOperand(2)); 13284 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 13285 cast<AtomicSDNode>(Node)->getMemoryVT(), 13286 Node->getOperand(0), 13287 Node->getOperand(1), negOp, 13288 cast<AtomicSDNode>(Node)->getSrcValue(), 13289 cast<AtomicSDNode>(Node)->getAlignment(), 13290 cast<AtomicSDNode>(Node)->getOrdering(), 13291 cast<AtomicSDNode>(Node)->getSynchScope()); 13292} 13293 13294static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 13295 SDNode *Node = Op.getNode(); 13296 SDLoc dl(Node); 13297 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 13298 13299 // Convert seq_cst store -> xchg 13300 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 13301 // FIXME: On 32-bit, store -> fist or movq would be more efficient 13302 // (The only way to get a 16-byte store is cmpxchg16b) 13303 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 13304 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 13305 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 13306 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 13307 cast<AtomicSDNode>(Node)->getMemoryVT(), 13308 Node->getOperand(0), 13309 Node->getOperand(1), Node->getOperand(2), 13310 cast<AtomicSDNode>(Node)->getMemOperand(), 13311 cast<AtomicSDNode>(Node)->getOrdering(), 13312 cast<AtomicSDNode>(Node)->getSynchScope()); 13313 return Swap.getValue(1); 13314 } 13315 // Other atomic stores have a simple pattern. 13316 return Op; 13317} 13318 13319static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 13320 EVT VT = Op.getNode()->getValueType(0); 13321 13322 // Let legalize expand this if it isn't a legal type yet. 13323 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13324 return SDValue(); 13325 13326 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 13327 13328 unsigned Opc; 13329 bool ExtraOp = false; 13330 switch (Op.getOpcode()) { 13331 default: llvm_unreachable("Invalid code"); 13332 case ISD::ADDC: Opc = X86ISD::ADD; break; 13333 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 13334 case ISD::SUBC: Opc = X86ISD::SUB; break; 13335 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 13336 } 13337 13338 if (!ExtraOp) 13339 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 13340 Op.getOperand(1)); 13341 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 13342 Op.getOperand(1), Op.getOperand(2)); 13343} 13344 13345static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, 13346 SelectionDAG &DAG) { 13347 assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); 13348 13349 // For MacOSX, we want to call an alternative entry point: __sincos_stret, 13350 // which returns the values as { float, float } (in XMM0) or 13351 // { double, double } (which is returned in XMM0, XMM1). 13352 SDLoc dl(Op); 13353 SDValue Arg = Op.getOperand(0); 13354 EVT ArgVT = Arg.getValueType(); 13355 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 13356 13357 TargetLowering::ArgListTy Args; 13358 TargetLowering::ArgListEntry Entry; 13359 13360 Entry.Node = Arg; 13361 Entry.Ty = ArgTy; 13362 Entry.isSExt = false; 13363 Entry.isZExt = false; 13364 Args.push_back(Entry); 13365 13366 bool isF64 = ArgVT == MVT::f64; 13367 // Only optimize x86_64 for now. i386 is a bit messy. 
For f32, 13368 // the small struct {f32, f32} is returned in (eax, edx). For f64, 13369 // the results are returned via SRet in memory. 13370 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; 13371 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13372 SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy()); 13373 13374 Type *RetTy = isF64 13375 ? (Type*)StructType::get(ArgTy, ArgTy, NULL) 13376 : (Type*)VectorType::get(ArgTy, 4); 13377 TargetLowering:: 13378 CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, 13379 false, false, false, false, 0, 13380 CallingConv::C, /*isTailCall=*/false, 13381 /*doesNotRet=*/false, /*isReturnValueUsed=*/true, 13382 Callee, Args, DAG, dl); 13383 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); 13384 13385 if (isF64) 13386 // Returned in xmm0 and xmm1. 13387 return CallResult.first; 13388 13389 // Returned in bits 0:31 and 32:63 of xmm0. 13390 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, 13391 CallResult.first, DAG.getIntPtrConstant(0)); 13392 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, 13393 CallResult.first, DAG.getIntPtrConstant(1)); 13394 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 13395 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); 13396} 13397 13398/// LowerOperation - Provide custom lowering hooks for some operations. 13399/// 13400SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 13401 switch (Op.getOpcode()) { 13402 default: llvm_unreachable("Should not custom lower this!"); 13403 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 13404 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); 13405 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); 13406 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 13407 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); 13408 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 13409 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 13410 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 13411 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 13412 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 13413 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); 13414 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); 13415 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 13416 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 13417 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 13418 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 13419 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 13420 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 13421 case ISD::SHL_PARTS: 13422 case ISD::SRA_PARTS: 13423 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 13424 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 13425 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 13426 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); 13427 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); 13428 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); 13429 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); 13430 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 13431 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 13432 case ISD::FP_EXTEND: return
LowerFP_EXTEND(Op, DAG); 13433 case ISD::FABS: return LowerFABS(Op, DAG); 13434 case ISD::FNEG: return LowerFNEG(Op, DAG); 13435 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 13436 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 13437 case ISD::SETCC: return LowerSETCC(Op, DAG); 13438 case ISD::SELECT: return LowerSELECT(Op, DAG); 13439 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 13440 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 13441 case ISD::VASTART: return LowerVASTART(Op, DAG); 13442 case ISD::VAARG: return LowerVAARG(Op, DAG); 13443 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); 13444 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 13445 case ISD::INTRINSIC_VOID: 13446 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); 13447 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 13448 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 13449 case ISD::FRAME_TO_ARGS_OFFSET: 13450 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 13451 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 13452 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 13453 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); 13454 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); 13455 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 13456 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 13457 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 13458 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 13459 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); 13460 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 13461 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); 13462 case ISD::SRA: 13463 case ISD::SRL: 13464 case ISD::SHL: return LowerShift(Op, Subtarget, DAG); 13465 case ISD::SADDO: 13466 case ISD::UADDO: 13467 case ISD::SSUBO: 13468 case ISD::USUBO: 13469 case ISD::SMULO: 13470 case ISD::UMULO: return LowerXALUO(Op, DAG); 13471 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); 13472 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); 13473 case ISD::ADDC: 13474 case ISD::ADDE: 13475 case ISD::SUBC: 13476 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 13477 case ISD::ADD: return LowerADD(Op, DAG); 13478 case ISD::SUB: return LowerSUB(Op, DAG); 13479 case ISD::SDIV: return LowerSDIV(Op, DAG); 13480 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); 13481 } 13482} 13483 13484static void ReplaceATOMIC_LOAD(SDNode *Node, 13485 SmallVectorImpl<SDValue> &Results, 13486 SelectionDAG &DAG) { 13487 SDLoc dl(Node); 13488 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 13489 13490 // Convert wide load -> cmpxchg8b/cmpxchg16b 13491 // FIXME: On 32-bit, load -> fild or movq would be more efficient 13492 // (The only way to get a 16-byte load is cmpxchg16b) 13493 // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. 
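  // The cmpxchg built below compares memory against zero and, on a match,
  // stores zero back, so the location is never modified either way; its
  // value result is the current contents, which makes it serve as an
  // atomic wide load.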
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
                               Node->getOperand(0),
                               Node->getOperand(1), Zero, Zero,
                               cast<AtomicSDNode>(Node)->getMemOperand(),
                               cast<AtomicSDNode>(Node)->getOrdering(),
                               cast<AtomicSDNode>(Node)->getSynchScope());
  Results.push_back(Swap.getValue(0));
  Results.push_back(Swap.getValue(1));
}

static void
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue> &Results,
                        SelectionDAG &DAG, unsigned NewOp) {
  SDLoc dl(Node);
  assert(Node->getValueType(0) == MVT::i64 &&
         "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}

/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case ISD::SIGN_EXTEND_INREG:
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:
    // We don't want to expand or promote these.
    return;
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
      return;

    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      if (StackSlot.getNode() != 0)
        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
                                      MachinePointerInfo(),
                                      false, false, false, 0));
      else
        Results.push_back(FIST);
    }
    return;
  }
  case ISD::UINT_TO_FP: {
    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
    if (N->getOperand(0).getValueType() != MVT::v2i32 ||
        N->getValueType(0) != MVT::v2f32)
      return;
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
                                 N->getOperand(0));
    SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                     MVT::f64);
    SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
    return;
  }
  case ISD::FP_ROUND: {
    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
      return;
    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
    Results.push_back(V);
    return;
  }
  case ISD::READCYCLECOUNTER: {
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops,
                                  array_lengthof(Ops)));
    Results.push_back(edx.getValue(1));
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    EVT T = N->getValueType(0);
    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
    bool Regs64bit = T == MVT::i128;
    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(0, HalfT));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(1, HalfT));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
                             Regs64bit ? X86::RAX : X86::EAX,
                             cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
                             Regs64bit ? X86::RDX : X86::EDX,
                             cpInH, cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(0, HalfT));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(1, HalfT));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
                               Regs64bit ? X86::RBX : X86::EBX,
                               swapInL, cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
                               Regs64bit ? X86::RCX : X86::ECX,
                               swapInH, swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
                                  X86ISD::LCMPXCHG8_DAG;
    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
                                             Ops, array_lengthof(Ops), T, MMO);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
                                        Regs64bit ? X86::RAX : X86::EAX,
                                        HalfT, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
                                        Regs64bit ? X86::RDX : X86::EDX,
                                        HalfT, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0) };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
    Results.push_back(cpOutH.getValue(1));
    return;
  }
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_SWAP: {
    unsigned Opc;
    switch (N->getOpcode()) {
    default: llvm_unreachable("Unexpected opcode");
    case ISD::ATOMIC_LOAD_ADD:
      Opc = X86ISD::ATOMADD64_DAG;
      break;
    case ISD::ATOMIC_LOAD_AND:
      Opc = X86ISD::ATOMAND64_DAG;
      break;
    case ISD::ATOMIC_LOAD_NAND:
      Opc = X86ISD::ATOMNAND64_DAG;
      break;
    case ISD::ATOMIC_LOAD_OR:
      Opc = X86ISD::ATOMOR64_DAG;
      break;
    case ISD::ATOMIC_LOAD_SUB:
      Opc = X86ISD::ATOMSUB64_DAG;
      break;
    case ISD::ATOMIC_LOAD_XOR:
      Opc = X86ISD::ATOMXOR64_DAG;
      break;
    case ISD::ATOMIC_LOAD_MAX:
      Opc = X86ISD::ATOMMAX64_DAG;
      break;
    case ISD::ATOMIC_LOAD_MIN:
      Opc = X86ISD::ATOMMIN64_DAG;
      break;
    case ISD::ATOMIC_LOAD_UMAX:
      Opc = X86ISD::ATOMUMAX64_DAG;
      break;
    case ISD::ATOMIC_LOAD_UMIN:
      Opc = X86ISD::ATOMUMIN64_DAG;
      break;
    case ISD::ATOMIC_SWAP:
      Opc = X86ISD::ATOMSWAP64_DAG;
      break;
    }
    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
    return;
  }
  case ISD::ATOMIC_LOAD:
    ReplaceATOMIC_LOAD(N, Results, DAG);
  }
}

const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return NULL;
  case X86ISD::BSF: return "X86ISD::BSF";
  case X86ISD::BSR: return "X86ISD::BSR";
  case X86ISD::SHLD: return "X86ISD::SHLD";
  case X86ISD::SHRD: return "X86ISD::SHRD";
  case X86ISD::FAND: return "X86ISD::FAND";
  case X86ISD::FANDN: return "X86ISD::FANDN";
  case X86ISD::FOR: return "X86ISD::FOR";
  case X86ISD::FXOR: return "X86ISD::FXOR";
  case X86ISD::FSRL: return "X86ISD::FSRL";
  case X86ISD::FILD: return "X86ISD::FILD";
  case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD: return "X86ISD::FLD";
  case X86ISD::FST: return "X86ISD::FST";
  case X86ISD::CALL: return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
  case X86ISD::BT: return "X86ISD::BT";
  case X86ISD::CMP: return "X86ISD::CMP";
  case X86ISD::COMI: return "X86ISD::COMI";
  case X86ISD::UCOMI: return "X86ISD::UCOMI";
  case X86ISD::CMPM: return "X86ISD::CMPM";
  case X86ISD::CMPMU: return "X86ISD::CMPMU";
  case X86ISD::SETCC: return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
  case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd";
  case X86ISD::FSETCCss: return "X86ISD::FSETCCss";
  case X86ISD::CMOV: return "X86ISD::CMOV";
  case X86ISD::BRCOND: return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
  case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper: return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
  case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
  case X86ISD::PINSRB: return "X86ISD::PINSRB";
  case X86ISD::PINSRW: return "X86ISD::PINSRW";
  case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
  case X86ISD::ANDNP: return "X86ISD::ANDNP";
  case X86ISD::PSIGN: return "X86ISD::PSIGN";
  case X86ISD::BLENDV: return "X86ISD::BLENDV";
  case X86ISD::BLENDI: return "X86ISD::BLENDI";
  case X86ISD::SUBUS: return "X86ISD::SUBUS";
  case X86ISD::HADD: return "X86ISD::HADD";
  case X86ISD::HSUB: return "X86ISD::HSUB";
  case X86ISD::FHADD: return "X86ISD::FHADD";
  case X86ISD::FHSUB: return "X86ISD::FHSUB";
  case X86ISD::UMAX: return "X86ISD::UMAX";
  case X86ISD::UMIN: return "X86ISD::UMIN";
  case X86ISD::SMAX: return "X86ISD::SMAX";
  case X86ISD::SMIN: return "X86ISD::SMIN";
  case X86ISD::FMAX: return "X86ISD::FMAX";
  case X86ISD::FMIN: return "X86ISD::FMIN";
  case X86ISD::FMAXC: return "X86ISD::FMAXC";
  case X86ISD::FMINC: return "X86ISD::FMINC";
  case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
  case X86ISD::FRCP: return "X86ISD::FRCP";
  case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
  case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
  case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
  case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
  case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
  case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
  case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
  case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
  case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
  case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
  case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
  case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
  case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
  case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
  case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL";
  case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
  case X86ISD::VZEXT: return "X86ISD::VZEXT";
  case X86ISD::VSEXT: return "X86ISD::VSEXT";
  case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
  case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM";
  case X86ISD::VINSERT: return "X86ISD::VINSERT";
  case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
  case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
  case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
  case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
  case X86ISD::VSHL: return "X86ISD::VSHL";
  case X86ISD::VSRL: return "X86ISD::VSRL";
  case X86ISD::VSRA: return "X86ISD::VSRA";
  case X86ISD::VSHLI: return "X86ISD::VSHLI";
  case X86ISD::VSRLI: return "X86ISD::VSRLI";
  case X86ISD::VSRAI: return "X86ISD::VSRAI";
  case X86ISD::CMPP: return "X86ISD::CMPP";
  case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
  case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
  case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
  case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
  case X86ISD::ADD: return "X86ISD::ADD";
  case X86ISD::SUB: return "X86ISD::SUB";
  case X86ISD::ADC: return "X86ISD::ADC";
  case X86ISD::SBB: return "X86ISD::SBB";
  case X86ISD::SMUL: return "X86ISD::SMUL";
  case X86ISD::UMUL: return "X86ISD::UMUL";
  case X86ISD::INC: return "X86ISD::INC";
  case X86ISD::DEC: return "X86ISD::DEC";
  case X86ISD::OR: return "X86ISD::OR";
  case X86ISD::XOR: return "X86ISD::XOR";
  case X86ISD::AND: return "X86ISD::AND";
  case X86ISD::BLSI: return "X86ISD::BLSI";
  case X86ISD::BLSMSK: return "X86ISD::BLSMSK";
  case X86ISD::BLSR: return "X86ISD::BLSR";
  case X86ISD::BZHI: return "X86ISD::BZHI";
  case X86ISD::BEXTR: return "X86ISD::BEXTR";
  case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
  case X86ISD::PTEST: return "X86ISD::PTEST";
  case X86ISD::TESTP: return "X86ISD::TESTP";
  case X86ISD::TESTM: return "X86ISD::TESTM";
  case X86ISD::KORTEST: return "X86ISD::KORTEST";
  case X86ISD::KTEST: return "X86ISD::KTEST";
  case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
  case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
  case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
  case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
  case X86ISD::SHUFP: return "X86ISD::SHUFP";
  case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
  case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
  case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
  case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
  case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
  case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
  case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
  case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
  case X86ISD::MOVSD: return "X86ISD::MOVSD";
  case X86ISD::MOVSS: return "X86ISD::MOVSS";
  case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
  case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
  case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
  case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
  case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
  case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
  case X86ISD::VPERMV: return "X86ISD::VPERMV";
  case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
  case X86ISD::VPERMI: return "X86ISD::VPERMI";
  case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
  case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
  case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
  case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
  case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
  case X86ISD::SAHF: return "X86ISD::SAHF";
  case X86ISD::RDRAND: return "X86ISD::RDRAND";
  case X86ISD::RDSEED: return "X86ISD::RDSEED";
  case X86ISD::FMADD: return "X86ISD::FMADD";
  case X86ISD::FMSUB: return "X86ISD::FMSUB";
  case X86ISD::FNMADD: return "X86ISD::FNMADD";
  case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
  case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
  case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
  case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
  case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
  case X86ISD::XTEST: return "X86ISD::XTEST";
  }
}

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              Type *Ty) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();
  Reloc::Model R = getTargetMachine().getRelocationModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags =
      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || R != Reloc::Static) &&
        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg. Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default: // Other stuff never works.
    return false;
  }

  return true;
}

bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 > NumBits2;
}

bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<32>(Imm);
}

bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Can also use sub to handle negated immediates.
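  // For example, an add of -1 can just as well be emitted as "subl $1, %reg";
  // either way the immediate must fit in the sign-extended 32-bit field,
  // which is exactly the isInt<32> check below. (Illustrative note.)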
  return isInt<32>(Imm);
}

bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 > NumBits2;
}

bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2))
    return true;

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    // X86 has 8, 16, and 32-bit zero-extending loads.
    return true;
  }

  return false;
}

bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
    return false;

  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  if (!VT.isSimple())
    return false;

  MVT SVT = VT.getSimpleVT();

  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSizeInBits() == 64)
    return false;

  // FIXME: pshufb, blends, shifts.
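  // Illustrative examples for the checks below: on v4i32, the mask <0,0,1,1>
  // is accepted by isUNPCKL_v_undef_Mask (unpcklps of a register with itself)
  // and <2,3,0,1> by isPSHUFDMask, so both report legal here.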
  return (SVT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, SVT) ||
          isSHUFPMask(M, SVT) ||
          isPSHUFDMask(M, SVT) ||
          isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
          isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
          isPALIGNRMask(M, SVT, Subtarget) ||
          isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
}

bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  if (!VT.isSimple())
    return false;

  MVT SVT = VT.getSimpleVT();
  unsigned NumElts = SVT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && SVT.is128BitVector()) {
    return (isMOVLMask(Mask, SVT) ||
            isCommutedMOVLMask(Mask, SVT, true) ||
            isSHUFPMask(Mask, SVT) ||
            isSHUFPMask(Mask, SVT, /* Commuted */ true));
  }
  return false;
}

//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
                                     const TargetInstrInfo *TII) {
  DebugLoc DL = MI->getDebugLoc();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = MBB;
  ++I;

  // For the v = xbegin(), we generate
  //
  // thisMBB:
  //  xbegin sinkMBB
  //
  // mainMBB:
  //  eax = -1
  //
  // sinkMBB:
  //  v = eax

  MachineBasicBlock *thisMBB = MBB;
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  //  xbegin sinkMBB
  //  # fallthrough to mainMBB
  //  # an abort jumps to sinkMBB
  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(sinkMBB);

  // mainMBB:
  //  EAX = -1
  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  // EAX is live into the sinkMBB
  sinkMBB->addLiveIn(X86::EAX);
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
    .addReg(X86::EAX);

  MI->eraseFromParent();
  return sinkMBB;
}

// Get CMPXCHG opcode for the specified data type.
static unsigned getCmpXChgOpcode(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i8:  return X86::LCMPXCHG8;
  case MVT::i16: return X86::LCMPXCHG16;
  case MVT::i32: return X86::LCMPXCHG32;
  case MVT::i64: return X86::LCMPXCHG64;
  default:
    break;
  }
  llvm_unreachable("Invalid operand size!");
}

// Get LOAD opcode for the specified data type.
static unsigned getLoadOpcode(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i8:  return X86::MOV8rm;
  case MVT::i16: return X86::MOV16rm;
  case MVT::i32: return X86::MOV32rm;
  case MVT::i64: return X86::MOV64rm;
  default:
    break;
  }
  llvm_unreachable("Invalid operand size!");
}

// Get opcode of the non-atomic one from the specified atomic instruction.
static unsigned getNonAtomicOpcode(unsigned Opc) {
  switch (Opc) {
  case X86::ATOMAND8:  return X86::AND8rr;
  case X86::ATOMAND16: return X86::AND16rr;
  case X86::ATOMAND32: return X86::AND32rr;
  case X86::ATOMAND64: return X86::AND64rr;
  case X86::ATOMOR8:   return X86::OR8rr;
  case X86::ATOMOR16:  return X86::OR16rr;
  case X86::ATOMOR32:  return X86::OR32rr;
  case X86::ATOMOR64:  return X86::OR64rr;
  case X86::ATOMXOR8:  return X86::XOR8rr;
  case X86::ATOMXOR16: return X86::XOR16rr;
  case X86::ATOMXOR32: return X86::XOR32rr;
  case X86::ATOMXOR64: return X86::XOR64rr;
  }
  llvm_unreachable("Unhandled atomic-load-op opcode!");
}

// Get opcode of the non-atomic one from the specified atomic instruction with
// extra opcode.
static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
                                               unsigned &ExtraOpc) {
  switch (Opc) {
  case X86::ATOMNAND8:  ExtraOpc = X86::NOT8r;  return X86::AND8rr;
  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr;
  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr;
  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr;
  case X86::ATOMMAX8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVL32rr;
  case X86::ATOMMAX16:  ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
  case X86::ATOMMAX32:  ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
  case X86::ATOMMAX64:  ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
  case X86::ATOMMIN8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVG32rr;
  case X86::ATOMMIN16:  ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
  case X86::ATOMMIN32:  ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
  case X86::ATOMMIN64:  ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
  case X86::ATOMUMAX8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVB32rr;
  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
  case X86::ATOMUMIN8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVA32rr;
  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
  }
  llvm_unreachable("Unhandled atomic-load-op opcode!");
}

// Get opcode of the non-atomic one from the specified atomic instruction for
// 64-bit data type on 32-bit target.
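// Explanatory note (derived from the table below): the low and high halves
// may need different opcodes. ATOMADD6432, for instance, pairs ADD32rr for
// the low word with ADC32rr for the high word so the carry out of the low
// half is consumed, while the min/max variants return a SETcc opcode that
// materializes the comparison result rather than combining values directly.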
static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
  switch (Opc) {
  case X86::ATOMAND6432:  HiOpc = X86::AND32rr; return X86::AND32rr;
  case X86::ATOMOR6432:   HiOpc = X86::OR32rr;  return X86::OR32rr;
  case X86::ATOMXOR6432:  HiOpc = X86::XOR32rr; return X86::XOR32rr;
  case X86::ATOMADD6432:  HiOpc = X86::ADC32rr; return X86::ADD32rr;
  case X86::ATOMSUB6432:  HiOpc = X86::SBB32rr; return X86::SUB32rr;
  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
  case X86::ATOMMAX6432:  HiOpc = X86::SETLr;   return X86::SETLr;
  case X86::ATOMMIN6432:  HiOpc = X86::SETGr;   return X86::SETGr;
  case X86::ATOMUMAX6432: HiOpc = X86::SETBr;   return X86::SETBr;
  case X86::ATOMUMIN6432: HiOpc = X86::SETAr;   return X86::SETAr;
  }
  llvm_unreachable("Unhandled atomic-load-op opcode!");
}

// Get opcode of the non-atomic one from the specified atomic instruction for
// 64-bit data type on 32-bit target with extra opcode.
static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
                                                   unsigned &HiOpc,
                                                   unsigned &ExtraOpc) {
  switch (Opc) {
  case X86::ATOMNAND6432:
    ExtraOpc = X86::NOT32r;
    HiOpc = X86::AND32rr;
    return X86::AND32rr;
  }
  llvm_unreachable("Unhandled atomic-load-op opcode!");
}

// Get pseudo CMOV opcode from the specified data type.
static unsigned getPseudoCMOVOpc(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i8:  return X86::CMOV_GR8;
  case MVT::i16: return X86::CMOV_GR16;
  case MVT::i32: return X86::CMOV_GR32;
  default:
    break;
  }
  llvm_unreachable("Unknown CMOV opcode!");
}

// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
// They will be translated into a spin-loop or compare-exchange loop from
//
//    ...
//    dst = atomic-fetch-op MI.addr, MI.val
//    ...
//
// to
//
//    ...
//    t1 = LOAD MI.addr
// loop:
//    t4 = phi(t1, t3 / loop)
//    t2 = OP MI.val, t4
//    EAX = t4
//    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
//    t3 = EAX
//    JNE loop
// sink:
//    dst = t3
//    ...
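// Roughly, at the assembly level, the generated loop looks like this
// (illustrative sketch for an i32 fetch-and-add; the real expansion keeps
// the accumulator in a virtual register with a PHI):
//
//        movl  (%mem), %eax
//   .Lretry:
//        movl  %eax, %ecx
//        addl  %val, %ecx
//        lock; cmpxchgl %ecx, (%mem)   # compares against %eax implicitly,
//                                      # reloads %eax on failure
//        jne   .Lretry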
MachineBasicBlock *
X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
                                       MachineBasicBlock *MBB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = MBB;
  ++I;

  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
         "Unexpected number of operands");

  assert(MI->hasOneMemOperand() &&
         "Expected atomic-load-op to have one memoperand");

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  unsigned DstReg, SrcReg;
  unsigned MemOpndSlot;

  unsigned CurOp = 0;

  DstReg = MI->getOperand(CurOp++).getReg();
  MemOpndSlot = CurOp;
  CurOp += X86::AddrNumOperands;
  SrcReg = MI->getOperand(CurOp++).getReg();

  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  MVT::SimpleValueType VT = *RC->vt_begin();
  unsigned t1 = MRI.createVirtualRegister(RC);
  unsigned t2 = MRI.createVirtualRegister(RC);
  unsigned t3 = MRI.createVirtualRegister(RC);
  unsigned t4 = MRI.createVirtualRegister(RC);
  unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);

  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
  unsigned LOADOpc = getLoadOpcode(VT);

  // For the atomic load-arith operator, we generate
  //
  //  thisMBB:
  //    t1 = LOAD [MI.addr]
  //  mainMBB:
  //    t4 = phi(t1 / thisMBB, t3 / mainMBB)
  //    t2 = OP MI.val, t4
  //    EAX = t4
  //    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
  //    t3 = EAX
  //    JNE mainMBB
  //  sinkMBB:
  //    dst = t3

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
    if (NewMO.isReg())
      NewMO.setIsKill(false);
    MIB.addOperand(NewMO);
  }
  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
    unsigned flags = (*MMOI)->getFlags();
    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
    MachineMemOperand *MMO =
      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
                               (*MMOI)->getSize(),
                               (*MMOI)->getBaseAlignment(),
                               (*MMOI)->getTBAAInfo(),
                               (*MMOI)->getRanges());
    MIB.addMemOperand(MMO);
  }

  thisMBB->addSuccessor(mainMBB);

  // mainMBB:
  MachineBasicBlock *origMainMBB = mainMBB;

  // Add a PHI.
  MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
                        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);

  unsigned Opc = MI->getOpcode();
  switch (Opc) {
  default:
    llvm_unreachable("Unhandled atomic-load-op opcode!");
  case X86::ATOMAND8:
  case X86::ATOMAND16:
  case X86::ATOMAND32:
  case X86::ATOMAND64:
  case X86::ATOMOR8:
  case X86::ATOMOR16:
  case X86::ATOMOR32:
  case X86::ATOMOR64:
  case X86::ATOMXOR8:
  case X86::ATOMXOR16:
  case X86::ATOMXOR32:
  case X86::ATOMXOR64: {
    unsigned ARITHOpc = getNonAtomicOpcode(Opc);
    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
      .addReg(t4);
    break;
  }
  case X86::ATOMNAND8:
  case X86::ATOMNAND16:
  case X86::ATOMNAND32:
  case X86::ATOMNAND64: {
    unsigned Tmp = MRI.createVirtualRegister(RC);
    unsigned NOTOpc;
    unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
    BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
      .addReg(t4);
    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
    break;
  }
  case X86::ATOMMAX8:
  case X86::ATOMMAX16:
  case X86::ATOMMAX32:
  case X86::ATOMMAX64:
  case X86::ATOMMIN8:
  case X86::ATOMMIN16:
  case X86::ATOMMIN32:
  case X86::ATOMMIN64:
  case X86::ATOMUMAX8:
  case X86::ATOMUMAX16:
  case X86::ATOMUMAX32:
  case X86::ATOMUMAX64:
  case X86::ATOMUMIN8:
  case X86::ATOMUMIN16:
  case X86::ATOMUMIN32:
  case X86::ATOMUMIN64: {
    unsigned CMPOpc;
    unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);

    BuildMI(mainMBB, DL, TII->get(CMPOpc))
      .addReg(SrcReg)
      .addReg(t4);

    if (Subtarget->hasCMov()) {
      if (VT != MVT::i8) {
        // Native support
        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
          .addReg(SrcReg)
          .addReg(t4);
      } else {
        // Promote i8 to i32 to use CMOV32
        const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
        const TargetRegisterClass *RC32 =
          TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
        unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
        unsigned AccReg32 = MRI.createVirtualRegister(RC32);
        unsigned Tmp = MRI.createVirtualRegister(RC32);

        unsigned Undef = MRI.createVirtualRegister(RC32);
        BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);

        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
          .addReg(Undef)
          .addReg(SrcReg)
          .addImm(X86::sub_8bit);
        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
          .addReg(Undef)
          .addReg(t4)
          .addImm(X86::sub_8bit);

        BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
          .addReg(SrcReg32)
          .addReg(AccReg32);

        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
          .addReg(Tmp, 0, X86::sub_8bit);
      }
    } else {
      // Use pseudo select and lower them.
      assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
             "Invalid atomic-load-op transformation!");
      unsigned SelOpc = getPseudoCMOVOpc(VT);
      X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
      assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
              .addReg(SrcReg).addReg(t4)
              .addImm(CC);
      mainMBB = EmitLoweredSelect(MIB, mainMBB);
      // Replace the original PHI node as mainMBB is changed after CMOV
      // lowering.
      BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
      Phi->eraseFromParent();
    }
    break;
  }
  }

  // Copy the accumulator (t4) into the physical register (EAX/AX/AL).
  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
    .addReg(t4);

  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
    if (NewMO.isReg())
      NewMO.setIsKill(false);
    MIB.addOperand(NewMO);
  }
  MIB.addReg(t2);
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Copy the physical register (the value LCMPXCHG loaded) back into t3.
  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
    .addReg(PhyReg);

  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);

  mainMBB->addSuccessor(origMainMBB);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(TargetOpcode::COPY), DstReg)
    .addReg(t3);

  MI->eraseFromParent();
  return sinkMBB;
}

// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
// instructions. They will be translated into a spin-loop or compare-exchange
// loop from
//
//    ...
//    dst = atomic-fetch-op MI.addr, MI.val
//    ...
//
// to
//
//    ...
//    t1L = LOAD [MI.addr + 0]
//    t1H = LOAD [MI.addr + 4]
// loop:
//    t4L = phi(t1L, t3L / loop)
//    t4H = phi(t1H, t3H / loop)
//    t2L = OP MI.val.lo, t4L
//    t2H = OP MI.val.hi, t4H
//    EAX = t4L
//    EDX = t4H
//    EBX = t2L
//    ECX = t2H
//    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and
//                           EDX:EAX is implicitly defined]
//    t3L = EAX
//    t3H = EDX
//    JNE loop
// sink:
//    dstL = t3L
//    dstH = t3H
//    ...
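// Roughly, at the assembly level (illustrative sketch for a 64-bit
// fetch-and-add on a 32-bit target):
//
//        movl  (%mem), %eax
//        movl  4(%mem), %edx
//   .Lretry:
//        movl  %eax, %ebx
//        movl  %edx, %ecx
//        addl  %val_lo, %ebx
//        adcl  %val_hi, %ecx
//        lock; cmpxchg8b (%mem)        # compares against EDX:EAX, stores
//                                      # ECX:EBX, reloads EDX:EAX on failure
//        jne   .Lretry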
14557MachineBasicBlock * 14558X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, 14559 MachineBasicBlock *MBB) const { 14560 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 14561 DebugLoc DL = MI->getDebugLoc(); 14562 14563 MachineFunction *MF = MBB->getParent(); 14564 MachineRegisterInfo &MRI = MF->getRegInfo(); 14565 14566 const BasicBlock *BB = MBB->getBasicBlock(); 14567 MachineFunction::iterator I = MBB; 14568 ++I; 14569 14570 assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 && 14571 "Unexpected number of operands"); 14572 14573 assert(MI->hasOneMemOperand() && 14574 "Expected atomic-load-op32 to have one memoperand"); 14575 14576 // Memory Reference 14577 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 14578 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 14579 14580 unsigned DstLoReg, DstHiReg; 14581 unsigned SrcLoReg, SrcHiReg; 14582 unsigned MemOpndSlot; 14583 14584 unsigned CurOp = 0; 14585 14586 DstLoReg = MI->getOperand(CurOp++).getReg(); 14587 DstHiReg = MI->getOperand(CurOp++).getReg(); 14588 MemOpndSlot = CurOp; 14589 CurOp += X86::AddrNumOperands; 14590 SrcLoReg = MI->getOperand(CurOp++).getReg(); 14591 SrcHiReg = MI->getOperand(CurOp++).getReg(); 14592 14593 const TargetRegisterClass *RC = &X86::GR32RegClass; 14594 const TargetRegisterClass *RC8 = &X86::GR8RegClass; 14595 14596 unsigned t1L = MRI.createVirtualRegister(RC); 14597 unsigned t1H = MRI.createVirtualRegister(RC); 14598 unsigned t2L = MRI.createVirtualRegister(RC); 14599 unsigned t2H = MRI.createVirtualRegister(RC); 14600 unsigned t3L = MRI.createVirtualRegister(RC); 14601 unsigned t3H = MRI.createVirtualRegister(RC); 14602 unsigned t4L = MRI.createVirtualRegister(RC); 14603 unsigned t4H = MRI.createVirtualRegister(RC); 14604 14605 unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; 14606 unsigned LOADOpc = X86::MOV32rm; 14607 14608 // For the atomic load-arith operator, we generate 14609 // 14610 // thisMBB: 14611 // t1L = LOAD [MI.addr + 0] 14612 // t1H = LOAD [MI.addr + 4] 14613 // mainMBB: 14614 // t4L = phi(t1L / thisMBB, t3L / mainMBB) 14615 // t4H = phi(t1H / thisMBB, t3H / mainMBB) 14616 // t2L = OP MI.val.lo, t4L 14617 // t2H = OP MI.val.hi, t4H 14618 // EBX = t2L 14619 // ECX = t2H 14620 // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] 14621 // t3L = EAX 14622 // t3H = EDX 14623 // JNE loop 14624 // sinkMBB: 14625 // dstL = t3L 14626 // dstH = t3H 14627 14628 MachineBasicBlock *thisMBB = MBB; 14629 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 14630 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 14631 MF->insert(I, mainMBB); 14632 MF->insert(I, sinkMBB); 14633 14634 MachineInstrBuilder MIB; 14635 14636 // Transfer the remainder of BB and its successor edges to sinkMBB. 
14637 sinkMBB->splice(sinkMBB->begin(), MBB, 14638 llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); 14639 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 14640 14641 // thisMBB: 14642 // Lo 14643 MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L); 14644 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 14645 MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); 14646 if (NewMO.isReg()) 14647 NewMO.setIsKill(false); 14648 MIB.addOperand(NewMO); 14649 } 14650 for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { 14651 unsigned flags = (*MMOI)->getFlags(); 14652 flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; 14653 MachineMemOperand *MMO = 14654 MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, 14655 (*MMOI)->getSize(), 14656 (*MMOI)->getBaseAlignment(), 14657 (*MMOI)->getTBAAInfo(), 14658 (*MMOI)->getRanges()); 14659 MIB.addMemOperand(MMO); 14660 }; 14661 MachineInstr *LowMI = MIB; 14662 14663 // Hi 14664 MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H); 14665 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 14666 if (i == X86::AddrDisp) { 14667 MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) 14668 } else { 14669 MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); 14670 if (NewMO.isReg()) 14671 NewMO.setIsKill(false); 14672 MIB.addOperand(NewMO); 14673 } 14674 } 14675 MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end()); 14676 14677 thisMBB->addSuccessor(mainMBB); 14678 14679 // mainMBB: 14680 MachineBasicBlock *origMainMBB = mainMBB; 14681 14682 // Add PHIs. 14683 MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L) 14684 .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); 14685 MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H) 14686 .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); 14687 14688 unsigned Opc = MI->getOpcode(); 14689 switch (Opc) { 14690 default: 14691 llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); 14692 case X86::ATOMAND6432: 14693 case X86::ATOMOR6432: 14694 case X86::ATOMXOR6432: 14695 case X86::ATOMADD6432: 14696 case X86::ATOMSUB6432: { 14697 unsigned HiOpc; 14698 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 14699 BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L) 14700 .addReg(SrcLoReg); 14701 BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H) 14702 .addReg(SrcHiReg); 14703 break; 14704 } 14705 case X86::ATOMNAND6432: { 14706 unsigned HiOpc, NOTOpc; 14707 unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); 14708 unsigned TmpL = MRI.createVirtualRegister(RC); 14709 unsigned TmpH = MRI.createVirtualRegister(RC); 14710 BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg) 14711 .addReg(t4L); 14712 BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg) 14713 .addReg(t4H); 14714 BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL); 14715 BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH); 14716 break; 14717 } 14718 case X86::ATOMMAX6432: 14719 case X86::ATOMMIN6432: 14720 case X86::ATOMUMAX6432: 14721 case X86::ATOMUMIN6432: { 14722 unsigned HiOpc; 14723 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 14724 unsigned cL = MRI.createVirtualRegister(RC8); 14725 unsigned cH = MRI.createVirtualRegister(RC8); 14726 unsigned cL32 = MRI.createVirtualRegister(RC); 14727 unsigned cH32 = MRI.createVirtualRegister(RC); 14728 unsigned cc = MRI.createVirtualRegister(RC); 14729 // cl := cmp src_lo, lo 14730 BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) 
14731 .addReg(SrcLoReg).addReg(t4L); 14732 BuildMI(mainMBB, DL, TII->get(LoOpc), cL); 14733 BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); 14734 // ch := cmp src_hi, hi 14735 BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) 14736 .addReg(SrcHiReg).addReg(t4H); 14737 BuildMI(mainMBB, DL, TII->get(HiOpc), cH); 14738 BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); 14739 // cc := if (src_hi == hi) ? cl : ch; 14740 if (Subtarget->hasCMov()) { 14741 BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) 14742 .addReg(cH32).addReg(cL32); 14743 } else { 14744 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) 14745 .addReg(cH32).addReg(cL32) 14746 .addImm(X86::COND_E); 14747 mainMBB = EmitLoweredSelect(MIB, mainMBB); 14748 } 14749 BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); 14750 if (Subtarget->hasCMov()) { 14751 BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L) 14752 .addReg(SrcLoReg).addReg(t4L); 14753 BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H) 14754 .addReg(SrcHiReg).addReg(t4H); 14755 } else { 14756 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L) 14757 .addReg(SrcLoReg).addReg(t4L) 14758 .addImm(X86::COND_NE); 14759 mainMBB = EmitLoweredSelect(MIB, mainMBB); 14760 // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the 14761 // 2nd CMOV lowering. 14762 mainMBB->addLiveIn(X86::EFLAGS); 14763 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H) 14764 .addReg(SrcHiReg).addReg(t4H) 14765 .addImm(X86::COND_NE); 14766 mainMBB = EmitLoweredSelect(MIB, mainMBB); 14767 // Replace the original PHI node as mainMBB is changed after CMOV 14768 // lowering. 14769 BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L) 14770 .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); 14771 BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H) 14772 .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); 14773 PhiL->eraseFromParent(); 14774 PhiH->eraseFromParent(); 14775 } 14776 break; 14777 } 14778 case X86::ATOMSWAP6432: { 14779 unsigned HiOpc; 14780 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 14781 BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg); 14782 BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg); 14783 break; 14784 } 14785 } 14786 14787 // Copy EDX:EAX back from HiReg:LoReg 14788 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L); 14789 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H); 14790 // Copy ECX:EBX from t1H:t1L 14791 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L); 14792 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H); 14793 14794 MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); 14795 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 14796 MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); 14797 if (NewMO.isReg()) 14798 NewMO.setIsKill(false); 14799 MIB.addOperand(NewMO); 14800 } 14801 MIB.setMemRefs(MMOBegin, MMOEnd); 14802 14803 // Copy EDX:EAX back to t3H:t3L 14804 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX); 14805 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX); 14806 14807 BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); 14808 14809 mainMBB->addSuccessor(origMainMBB); 14810 mainMBB->addSuccessor(sinkMBB); 14811 14812 // sinkMBB: 14813 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 14814 TII->get(TargetOpcode::COPY), DstLoReg) 14815 .addReg(t3L); 14816 BuildMI(*sinkMBB, 
sinkMBB->begin(), DL, 14817 TII->get(TargetOpcode::COPY), DstHiReg) 14818 .addReg(t3H); 14819 14820 MI->eraseFromParent(); 14821 return sinkMBB; 14822} 14823 14824// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 14825// or XMM0_V32I8 in AVX all of this code can be replaced with that 14826// in the .td file. 14827static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, 14828 const TargetInstrInfo *TII) { 14829 unsigned Opc; 14830 switch (MI->getOpcode()) { 14831 default: llvm_unreachable("illegal opcode!"); 14832 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; 14833 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; 14834 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; 14835 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; 14836 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; 14837 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; 14838 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; 14839 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; 14840 } 14841 14842 DebugLoc dl = MI->getDebugLoc(); 14843 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 14844 14845 unsigned NumArgs = MI->getNumOperands(); 14846 for (unsigned i = 1; i < NumArgs; ++i) { 14847 MachineOperand &Op = MI->getOperand(i); 14848 if (!(Op.isReg() && Op.isImplicit())) 14849 MIB.addOperand(Op); 14850 } 14851 if (MI->hasOneMemOperand()) 14852 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 14853 14854 BuildMI(*BB, MI, dl, 14855 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 14856 .addReg(X86::XMM0); 14857 14858 MI->eraseFromParent(); 14859 return BB; 14860} 14861 14862// FIXME: Custom handling because TableGen doesn't support multiple implicit 14863// defs in an instruction pattern 14864static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, 14865 const TargetInstrInfo *TII) { 14866 unsigned Opc; 14867 switch (MI->getOpcode()) { 14868 default: llvm_unreachable("illegal opcode!"); 14869 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; 14870 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; 14871 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; 14872 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; 14873 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; 14874 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; 14875 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; 14876 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; 14877 } 14878 14879 DebugLoc dl = MI->getDebugLoc(); 14880 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 14881 14882 unsigned NumArgs = MI->getNumOperands(); // remove the results 14883 for (unsigned i = 1; i < NumArgs; ++i) { 14884 MachineOperand &Op = MI->getOperand(i); 14885 if (!(Op.isReg() && Op.isImplicit())) 14886 MIB.addOperand(Op); 14887 } 14888 if (MI->hasOneMemOperand()) 14889 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 14890 14891 BuildMI(*BB, MI, dl, 14892 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 14893 .addReg(X86::ECX); 14894 14895 MI->eraseFromParent(); 14896 return BB; 14897} 14898 14899static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, 14900 const TargetInstrInfo *TII, 14901 const X86Subtarget* Subtarget) { 14902 DebugLoc dl = MI->getDebugLoc(); 14903 14904 // Address into RAX/EAX, other two args into ECX, EDX. 14905 unsigned MemOpc = Subtarget->is64Bit() ? 
X86::LEA64r : X86::LEA32r; 14906 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 14907 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 14908 for (int i = 0; i < X86::AddrNumOperands; ++i) 14909 MIB.addOperand(MI->getOperand(i)); 14910 14911 unsigned ValOps = X86::AddrNumOperands; 14912 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 14913 .addReg(MI->getOperand(ValOps).getReg()); 14914 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 14915 .addReg(MI->getOperand(ValOps+1).getReg()); 14916 14917 // The instruction doesn't actually take any operands though. 14918 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 14919 14920 MI->eraseFromParent(); // The pseudo is gone now. 14921 return BB; 14922} 14923 14924MachineBasicBlock * 14925X86TargetLowering::EmitVAARG64WithCustomInserter( 14926 MachineInstr *MI, 14927 MachineBasicBlock *MBB) const { 14928 // Emit va_arg instruction on X86-64. 14929 14930 // Operands to this pseudo-instruction: 14931 // 0 ) Output : destination address (reg) 14932 // 1-5) Input : va_list address (addr, i64mem) 14933 // 6 ) ArgSize : Size (in bytes) of vararg type 14934 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 14935 // 8 ) Align : Alignment of type 14936 // 9 ) EFLAGS (implicit-def) 14937 14938 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 14939 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 14940 14941 unsigned DestReg = MI->getOperand(0).getReg(); 14942 MachineOperand &Base = MI->getOperand(1); 14943 MachineOperand &Scale = MI->getOperand(2); 14944 MachineOperand &Index = MI->getOperand(3); 14945 MachineOperand &Disp = MI->getOperand(4); 14946 MachineOperand &Segment = MI->getOperand(5); 14947 unsigned ArgSize = MI->getOperand(6).getImm(); 14948 unsigned ArgMode = MI->getOperand(7).getImm(); 14949 unsigned Align = MI->getOperand(8).getImm(); 14950 14951 // Memory Reference 14952 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 14953 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 14954 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 14955 14956 // Machine Information 14957 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 14958 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 14959 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 14960 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 14961 DebugLoc DL = MI->getDebugLoc(); 14962 14963 // struct va_list { 14964 // i32 gp_offset 14965 // i32 fp_offset 14966 // i64 overflow_area (address) 14967 // i64 reg_save_area (address) 14968 // } 14969 // sizeof(va_list) = 24 14970 // alignment(va_list) = 8 14971 14972 unsigned TotalNumIntRegs = 6; 14973 unsigned TotalNumXMMRegs = 8; 14974 bool UseGPOffset = (ArgMode == 1); 14975 bool UseFPOffset = (ArgMode == 2); 14976 unsigned MaxOffset = TotalNumIntRegs * 8 + 14977 (UseFPOffset ? 
TotalNumXMMRegs * 16 : 0); 14978 14979 /* Align ArgSize to a multiple of 8 */ 14980 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 14981 bool NeedsAlign = (Align > 8); 14982 14983 MachineBasicBlock *thisMBB = MBB; 14984 MachineBasicBlock *overflowMBB; 14985 MachineBasicBlock *offsetMBB; 14986 MachineBasicBlock *endMBB; 14987 14988 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 14989 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 14990 unsigned OffsetReg = 0; 14991 14992 if (!UseGPOffset && !UseFPOffset) { 14993 // If we only pull from the overflow region, we don't create a branch. 14994 // We don't need to alter control flow. 14995 OffsetDestReg = 0; // unused 14996 OverflowDestReg = DestReg; 14997 14998 offsetMBB = NULL; 14999 overflowMBB = thisMBB; 15000 endMBB = thisMBB; 15001 } else { 15002 // First emit code to check if gp_offset (or fp_offset) is below the bound. 15003 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 15004 // If not, pull from overflow_area. (branch to overflowMBB) 15005 // 15006 // thisMBB 15007 // | . 15008 // | . 15009 // offsetMBB overflowMBB 15010 // | . 15011 // | . 15012 // endMBB 15013 15014 // Registers for the PHI in endMBB 15015 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 15016 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 15017 15018 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 15019 MachineFunction *MF = MBB->getParent(); 15020 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 15021 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 15022 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 15023 15024 MachineFunction::iterator MBBIter = MBB; 15025 ++MBBIter; 15026 15027 // Insert the new basic blocks 15028 MF->insert(MBBIter, offsetMBB); 15029 MF->insert(MBBIter, overflowMBB); 15030 MF->insert(MBBIter, endMBB); 15031 15032 // Transfer the remainder of MBB and its successor edges to endMBB. 15033 endMBB->splice(endMBB->begin(), thisMBB, 15034 llvm::next(MachineBasicBlock::iterator(MI)), 15035 thisMBB->end()); 15036 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 15037 15038 // Make offsetMBB and overflowMBB successors of thisMBB 15039 thisMBB->addSuccessor(offsetMBB); 15040 thisMBB->addSuccessor(overflowMBB); 15041 15042 // endMBB is a successor of both offsetMBB and overflowMBB 15043 offsetMBB->addSuccessor(endMBB); 15044 overflowMBB->addSuccessor(endMBB); 15045 15046 // Load the offset value into a register 15047 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 15048 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 15049 .addOperand(Base) 15050 .addOperand(Scale) 15051 .addOperand(Index) 15052 .addDisp(Disp, UseFPOffset ? 4 : 0) 15053 .addOperand(Segment) 15054 .setMemRefs(MMOBegin, MMOEnd); 15055 15056 // Check if there is enough room left to pull this argument. 15057 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 15058 .addReg(OffsetReg) 15059 .addImm(MaxOffset + 8 - ArgSizeA8); 15060 15061 // Branch to "overflowMBB" if offset >= max 15062 // Fall through to "offsetMBB" otherwise 15063 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 15064 .addMBB(overflowMBB); 15065 } 15066 15067 // In offsetMBB, emit code to use the reg_save_area. 15068 if (offsetMBB) { 15069 assert(OffsetReg != 0); 15070 15071 // Read the reg_save_area address. 
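// (Illustrative C-level sketch of this whole block -- not generated
// code; field names follow the va_list layout documented above:
//   char *addr = va->reg_save_area + (uint64_t)offset;       // MOV64rm, SUBREG_TO_REG, ADD64rr
//   offset += UseFPOffset ? 16 : 8;                          // ADD32ri
//   *(UseFPOffset ? &va->fp_offset : &va->gp_offset) = offset; // MOV32mr
// followed by a jump to endMBB.)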
15072 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 15073 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 15074 .addOperand(Base) 15075 .addOperand(Scale) 15076 .addOperand(Index) 15077 .addDisp(Disp, 16) 15078 .addOperand(Segment) 15079 .setMemRefs(MMOBegin, MMOEnd); 15080 15081 // Zero-extend the offset 15082 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 15083 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 15084 .addImm(0) 15085 .addReg(OffsetReg) 15086 .addImm(X86::sub_32bit); 15087 15088 // Add the offset to the reg_save_area to get the final address. 15089 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 15090 .addReg(OffsetReg64) 15091 .addReg(RegSaveReg); 15092 15093 // Compute the offset for the next argument 15094 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 15095 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 15096 .addReg(OffsetReg) 15097 .addImm(UseFPOffset ? 16 : 8); 15098 15099 // Store it back into the va_list. 15100 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 15101 .addOperand(Base) 15102 .addOperand(Scale) 15103 .addOperand(Index) 15104 .addDisp(Disp, UseFPOffset ? 4 : 0) 15105 .addOperand(Segment) 15106 .addReg(NextOffsetReg) 15107 .setMemRefs(MMOBegin, MMOEnd); 15108 15109 // Jump to endMBB 15110 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 15111 .addMBB(endMBB); 15112 } 15113 15114 // 15115 // Emit code to use overflow area 15116 // 15117 15118 // Load the overflow_area address into a register. 15119 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 15120 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 15121 .addOperand(Base) 15122 .addOperand(Scale) 15123 .addOperand(Index) 15124 .addDisp(Disp, 8) 15125 .addOperand(Segment) 15126 .setMemRefs(MMOBegin, MMOEnd); 15127 15128 // If we need to align it, do so. Otherwise, just copy the address 15129 // to OverflowDestReg. 15130 if (NeedsAlign) { 15131 // Align the overflow address 15132 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 15133 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 15134 15135 // aligned_addr = (addr + (align-1)) & ~(align-1) 15136 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 15137 .addReg(OverflowAddrReg) 15138 .addImm(Align-1); 15139 15140 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 15141 .addReg(TmpReg) 15142 .addImm(~(uint64_t)(Align-1)); 15143 } else { 15144 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 15145 .addReg(OverflowAddrReg); 15146 } 15147 15148 // Compute the next overflow address after this argument. 15149 // (the overflow address should be kept 8-byte aligned) 15150 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 15151 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 15152 .addReg(OverflowDestReg) 15153 .addImm(ArgSizeA8); 15154 15155 // Store the new overflow address. 15156 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 15157 .addOperand(Base) 15158 .addOperand(Scale) 15159 .addOperand(Index) 15160 .addDisp(Disp, 8) 15161 .addOperand(Segment) 15162 .addReg(NextAddrReg) 15163 .setMemRefs(MMOBegin, MMOEnd); 15164 15165 // If we branched, emit the PHI to the front of endMBB. 
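// The PHI emitted here has the shape
//   DestReg = PHI OffsetDestReg, <offsetMBB>, OverflowDestReg, <overflowMBB>
// so endMBB sees a single argument address whichever path was taken.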
15166 if (offsetMBB) { 15167 BuildMI(*endMBB, endMBB->begin(), DL, 15168 TII->get(X86::PHI), DestReg) 15169 .addReg(OffsetDestReg).addMBB(offsetMBB) 15170 .addReg(OverflowDestReg).addMBB(overflowMBB); 15171 } 15172 15173 // Erase the pseudo instruction 15174 MI->eraseFromParent(); 15175 15176 return endMBB; 15177} 15178 15179MachineBasicBlock * 15180X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 15181 MachineInstr *MI, 15182 MachineBasicBlock *MBB) const { 15183 // Emit code to save XMM registers to the stack. The ABI says that the 15184 // number of registers to save is given in %al, so it's theoretically 15185 // possible to do an indirect jump trick to avoid saving all of them, 15186 // however this code takes a simpler approach and just executes all 15187 // of the stores if %al is non-zero. It's less code, and it's probably 15188 // easier on the hardware branch predictor, and stores aren't all that 15189 // expensive anyway. 15190 15191 // Create the new basic blocks. One block contains all the XMM stores, 15192 // and one block is the final destination regardless of whether any 15193 // stores were performed. 15194 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 15195 MachineFunction *F = MBB->getParent(); 15196 MachineFunction::iterator MBBIter = MBB; 15197 ++MBBIter; 15198 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 15199 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 15200 F->insert(MBBIter, XMMSaveMBB); 15201 F->insert(MBBIter, EndMBB); 15202 15203 // Transfer the remainder of MBB and its successor edges to EndMBB. 15204 EndMBB->splice(EndMBB->begin(), MBB, 15205 llvm::next(MachineBasicBlock::iterator(MI)), 15206 MBB->end()); 15207 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 15208 15209 // The original block will now fall through to the XMM save block. 15210 MBB->addSuccessor(XMMSaveMBB); 15211 // The XMMSaveMBB will fall through to the end block. 15212 XMMSaveMBB->addSuccessor(EndMBB); 15213 15214 // Now add the instructions. 15215 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15216 DebugLoc DL = MI->getDebugLoc(); 15217 15218 unsigned CountReg = MI->getOperand(0).getReg(); 15219 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 15220 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 15221 15222 if (!Subtarget->isTargetWin64()) { 15223 // If %al is 0, branch around the XMM save block. 15224 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 15225 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 15226 MBB->addSuccessor(EndMBB); 15227 } 15228 15229 // Make sure the last operand is EFLAGS, which gets clobbered by the branch 15230 // that was just emitted, but clearly shouldn't be "saved". 15231 assert((MI->getNumOperands() <= 3 || 15232 !MI->getOperand(MI->getNumOperands() - 1).isReg() || 15233 MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS) 15234 && "Expected last argument to be EFLAGS"); 15235 unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; 15236 // In the XMM save block, save all the XMM argument registers. 
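// Each iteration of the loop below emits one aligned 16-byte store of
// the form
//   [RegSaveFrameIndex + VarArgsFPOffset + (i - 3) * 16] = XMM argument i
// using MOVAPSmr, or VMOVAPSmr when AVX is available.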
15237 for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { 15238 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 15239 MachineMemOperand *MMO = 15240 F->getMachineMemOperand( 15241 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 15242 MachineMemOperand::MOStore, 15243 /*Size=*/16, /*Align=*/16); 15244 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) 15245 .addFrameIndex(RegSaveFrameIndex) 15246 .addImm(/*Scale=*/1) 15247 .addReg(/*IndexReg=*/0) 15248 .addImm(/*Disp=*/Offset) 15249 .addReg(/*Segment=*/0) 15250 .addReg(MI->getOperand(i).getReg()) 15251 .addMemOperand(MMO); 15252 } 15253 15254 MI->eraseFromParent(); // The pseudo instruction is gone now. 15255 15256 return EndMBB; 15257} 15258 15259// The EFLAGS operand of SelectItr might be missing a kill marker 15260// because there were multiple uses of EFLAGS, and ISel didn't know 15261// which to mark. Figure out whether SelectItr should have had a 15262// kill marker, and set it if it should. Returns the correct kill 15263// marker value. 15264static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, 15265 MachineBasicBlock* BB, 15266 const TargetRegisterInfo* TRI) { 15267 // Scan forward through BB for a use/def of EFLAGS. 15268 MachineBasicBlock::iterator miI(llvm::next(SelectItr)); 15269 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 15270 const MachineInstr& mi = *miI; 15271 if (mi.readsRegister(X86::EFLAGS)) 15272 return false; 15273 if (mi.definesRegister(X86::EFLAGS)) 15274 break; // Should have kill-flag - update below. 15275 } 15276 15277 // If we hit the end of the block, check whether EFLAGS is live into a 15278 // successor. 15279 if (miI == BB->end()) { 15280 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 15281 sEnd = BB->succ_end(); 15282 sItr != sEnd; ++sItr) { 15283 MachineBasicBlock* succ = *sItr; 15284 if (succ->isLiveIn(X86::EFLAGS)) 15285 return false; 15286 } 15287 } 15288 15289 // We found a def, or hit the end of the basic block and EFLAGS wasn't live 15290 // out. SelectMI should have a kill flag on EFLAGS. 15291 SelectItr->addRegisterKilled(X86::EFLAGS, TRI); 15292 return true; 15293} 15294 15295MachineBasicBlock * 15296X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 15297 MachineBasicBlock *BB) const { 15298 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15299 DebugLoc DL = MI->getDebugLoc(); 15300 15301 // To "insert" a SELECT_CC instruction, we actually have to insert the 15302 // diamond control-flow pattern. The incoming instruction knows the 15303 // destination vreg to set, the condition code register to branch on, the 15304 // true/false values to select between, and a branch opcode to use. 15305 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 15306 MachineFunction::iterator It = BB; 15307 ++It; 15308 15309 // thisMBB: 15310 // ... 15311 // TrueVal = ... 15312 // cmpTY ccX, r1, r2 15313 // bCC copy1MBB 15314 // fallthrough --> copy0MBB 15315 MachineBasicBlock *thisMBB = BB; 15316 MachineFunction *F = BB->getParent(); 15317 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 15318 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 15319 F->insert(It, copy0MBB); 15320 F->insert(It, sinkMBB); 15321 15322 // If the EFLAGS register isn't dead in the terminator, then claim that it's 15323 // live into the sink and copy blocks. 
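// Example where no kill flag may be added: two CMOV pseudos expanded
// from a single CMP. The first select's EFLAGS use must not be a kill,
// because the second select (about to be spliced into sinkMBB) still
// reads the same flags; checkAndUpdateEFLAGSKill returns false for
// exactly that case.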
15324 const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); 15325 if (!MI->killsRegister(X86::EFLAGS) && 15326 !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { 15327 copy0MBB->addLiveIn(X86::EFLAGS); 15328 sinkMBB->addLiveIn(X86::EFLAGS); 15329 } 15330 15331 // Transfer the remainder of BB and its successor edges to sinkMBB. 15332 sinkMBB->splice(sinkMBB->begin(), BB, 15333 llvm::next(MachineBasicBlock::iterator(MI)), 15334 BB->end()); 15335 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 15336 15337 // Add the true and fallthrough blocks as its successors. 15338 BB->addSuccessor(copy0MBB); 15339 BB->addSuccessor(sinkMBB); 15340 15341 // Create the conditional branch instruction. 15342 unsigned Opc = 15343 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 15344 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 15345 15346 // copy0MBB: 15347 // %FalseValue = ... 15348 // # fallthrough to sinkMBB 15349 copy0MBB->addSuccessor(sinkMBB); 15350 15351 // sinkMBB: 15352 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 15353 // ... 15354 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 15355 TII->get(X86::PHI), MI->getOperand(0).getReg()) 15356 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 15357 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 15358 15359 MI->eraseFromParent(); // The pseudo instruction is gone now. 15360 return sinkMBB; 15361} 15362 15363MachineBasicBlock * 15364X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, 15365 bool Is64Bit) const { 15366 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15367 DebugLoc DL = MI->getDebugLoc(); 15368 MachineFunction *MF = BB->getParent(); 15369 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 15370 15371 assert(getTargetMachine().Options.EnableSegmentedStacks); 15372 15373 unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; 15374 unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; 15375 15376 // BB: 15377 // ... [Till the alloca] 15378 // If stacklet is not large enough, jump to mallocMBB 15379 // 15380 // bumpMBB: 15381 // Allocate by subtracting from RSP 15382 // Jump to continueMBB 15383 // 15384 // mallocMBB: 15385 // Allocate by call to runtime 15386 // 15387 // continueMBB: 15388 // ... 15389 // [rest of original BB] 15390 // 15391 15392 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); 15393 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); 15394 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); 15395 15396 MachineRegisterInfo &MRI = MF->getRegInfo(); 15397 const TargetRegisterClass *AddrRegClass = 15398 getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); 15399 15400 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), 15401 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), 15402 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), 15403 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), 15404 sizeVReg = MI->getOperand(1).getReg(), 15405 physSPReg = Is64Bit ? X86::RSP : X86::ESP; 15406 15407 MachineFunction::iterator MBBIter = BB; 15408 ++MBBIter; 15409 15410 MF->insert(MBBIter, bumpMBB); 15411 MF->insert(MBBIter, mallocMBB); 15412 MF->insert(MBBIter, continueMBB); 15413 15414 continueMBB->splice(continueMBB->begin(), BB, llvm::next 15415 (MachineBasicBlock::iterator(MI)), BB->end()); 15416 continueMBB->transferSuccessorsAndUpdatePHIs(BB); 15417 15418 // Add code to the main basic block to check if the stack limit has been hit, 15419 // and if so, jump to mallocMBB otherwise to bumpMBB. 
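// Roughly, the 64-bit sequence emitted below is:
//   tmpSP   = RSP                  ; COPY
//   SPLimit = tmpSP - size         ; SUB64rr, the candidate new SP
//   CMP     fs:[0x70], SPLimit     ; CMP64mr against the stacklet limit
//   JG      mallocMBB              ; limit above new SP: not enough room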
15420 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); 15421 BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) 15422 .addReg(tmpSPVReg).addReg(sizeVReg); 15423 BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) 15424 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) 15425 .addReg(SPLimitVReg); 15426 BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); 15427 15428 // bumpMBB simply decreases the stack pointer, since we know the current 15429 // stacklet has enough space. 15430 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) 15431 .addReg(SPLimitVReg); 15432 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) 15433 .addReg(SPLimitVReg); 15434 BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 15435 15436 // Calls into a routine in libgcc to allocate more space from the heap. 15437 const uint32_t *RegMask = 15438 getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); 15439 if (Is64Bit) { 15440 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) 15441 .addReg(sizeVReg); 15442 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 15443 .addExternalSymbol("__morestack_allocate_stack_space") 15444 .addRegMask(RegMask) 15445 .addReg(X86::RDI, RegState::Implicit) 15446 .addReg(X86::RAX, RegState::ImplicitDefine); 15447 } else { 15448 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) 15449 .addImm(12); 15450 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); 15451 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) 15452 .addExternalSymbol("__morestack_allocate_stack_space") 15453 .addRegMask(RegMask) 15454 .addReg(X86::EAX, RegState::ImplicitDefine); 15455 } 15456 15457 if (!Is64Bit) 15458 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) 15459 .addImm(16); 15460 15461 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) 15462 .addReg(Is64Bit ? X86::RAX : X86::EAX); 15463 BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 15464 15465 // Set up the CFG correctly. 15466 BB->addSuccessor(bumpMBB); 15467 BB->addSuccessor(mallocMBB); 15468 mallocMBB->addSuccessor(continueMBB); 15469 bumpMBB->addSuccessor(continueMBB); 15470 15471 // Take care of the PHI nodes. 15472 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), 15473 MI->getOperand(0).getReg()) 15474 .addReg(mallocPtrVReg).addMBB(mallocMBB) 15475 .addReg(bumpSPPtrVReg).addMBB(bumpMBB); 15476 15477 // Delete the original pseudo instruction. 15478 MI->eraseFromParent(); 15479 15480 // And we're done. 15481 return continueMBB; 15482} 15483 15484MachineBasicBlock * 15485X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 15486 MachineBasicBlock *BB) const { 15487 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15488 DebugLoc DL = MI->getDebugLoc(); 15489 15490 assert(!Subtarget->isTargetEnvMacho()); 15491 15492 // The lowering is pretty easy: we're just emitting the call to _alloca. The 15493 // non-trivial part is impdef of ESP. 15494 15495 if (Subtarget->isTargetWin64()) { 15496 if (Subtarget->isTargetCygMing()) { 15497 // ___chkstk(Mingw64): 15498 // Clobbers R10, R11, RAX and EFLAGS. 15499 // Updates RSP. 
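// (The requested allocation size is passed in RAX for both chkstk
// flavors below, which is why the calls model an implicit use of RAX.)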
15500 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 15501 .addExternalSymbol("___chkstk") 15502 .addReg(X86::RAX, RegState::Implicit) 15503 .addReg(X86::RSP, RegState::Implicit) 15504 .addReg(X86::RAX, RegState::Define | RegState::Implicit) 15505 .addReg(X86::RSP, RegState::Define | RegState::Implicit) 15506 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 15507 } else { 15508 // __chkstk(MSVCRT): does not update stack pointer. 15509 // Clobbers R10, R11 and EFLAGS. 15510 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 15511 .addExternalSymbol("__chkstk") 15512 .addReg(X86::RAX, RegState::Implicit) 15513 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 15514 // RAX has the offset to be subtracted from RSP. 15515 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) 15516 .addReg(X86::RSP) 15517 .addReg(X86::RAX); 15518 } 15519 } else { 15520 const char *StackProbeSymbol = 15521 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 15522 15523 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 15524 .addExternalSymbol(StackProbeSymbol) 15525 .addReg(X86::EAX, RegState::Implicit) 15526 .addReg(X86::ESP, RegState::Implicit) 15527 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 15528 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 15529 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 15530 } 15531 15532 MI->eraseFromParent(); // The pseudo instruction is gone now. 15533 return BB; 15534} 15535 15536MachineBasicBlock * 15537X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 15538 MachineBasicBlock *BB) const { 15539 // This is pretty easy. We're taking the value that we received from 15540 // our load from the relocation, sticking it in either RDI (x86-64) 15541 // or EAX and doing an indirect call. The return value will then 15542 // be in the normal return register. 15543 const X86InstrInfo *TII 15544 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 15545 DebugLoc DL = MI->getDebugLoc(); 15546 MachineFunction *F = BB->getParent(); 15547 15548 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 15549 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 15550 15551 // Get a register mask for the lowered call. 15552 // FIXME: The 32-bit calls have non-standard calling conventions. Use a 15553 // proper register mask. 
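// The load + indirect-call pair emitted below takes one of three
// address forms, depending on the target:
//   64-bit:        RDI = [RIP + sym];             CALL64m [RDI]
//   32-bit static: EAX = [sym];                   CALL32m [EAX]
//   32-bit PIC:    EAX = [GlobalBaseReg + sym];   CALL32m [EAX]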
15554 const uint32_t *RegMask = 15555 getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); 15556 if (Subtarget->is64Bit()) { 15557 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 15558 TII->get(X86::MOV64rm), X86::RDI) 15559 .addReg(X86::RIP) 15560 .addImm(0).addReg(0) 15561 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 15562 MI->getOperand(3).getTargetFlags()) 15563 .addReg(0); 15564 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 15565 addDirectMem(MIB, X86::RDI); 15566 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); 15567 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 15568 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 15569 TII->get(X86::MOV32rm), X86::EAX) 15570 .addReg(0) 15571 .addImm(0).addReg(0) 15572 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 15573 MI->getOperand(3).getTargetFlags()) 15574 .addReg(0); 15575 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 15576 addDirectMem(MIB, X86::EAX); 15577 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 15578 } else { 15579 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 15580 TII->get(X86::MOV32rm), X86::EAX) 15581 .addReg(TII->getGlobalBaseReg(F)) 15582 .addImm(0).addReg(0) 15583 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 15584 MI->getOperand(3).getTargetFlags()) 15585 .addReg(0); 15586 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 15587 addDirectMem(MIB, X86::EAX); 15588 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 15589 } 15590 15591 MI->eraseFromParent(); // The pseudo instruction is gone now. 15592 return BB; 15593} 15594 15595MachineBasicBlock * 15596X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, 15597 MachineBasicBlock *MBB) const { 15598 DebugLoc DL = MI->getDebugLoc(); 15599 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15600 15601 MachineFunction *MF = MBB->getParent(); 15602 MachineRegisterInfo &MRI = MF->getRegInfo(); 15603 15604 const BasicBlock *BB = MBB->getBasicBlock(); 15605 MachineFunction::iterator I = MBB; 15606 ++I; 15607 15608 // Memory Reference 15609 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 15610 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 15611 15612 unsigned DstReg; 15613 unsigned MemOpndSlot = 0; 15614 15615 unsigned CurOp = 0; 15616 15617 DstReg = MI->getOperand(CurOp++).getReg(); 15618 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 15619 assert(RC->hasType(MVT::i32) && "Invalid destination!"); 15620 unsigned mainDstReg = MRI.createVirtualRegister(RC); 15621 unsigned restoreDstReg = MRI.createVirtualRegister(RC); 15622 15623 MemOpndSlot = CurOp; 15624 15625 MVT PVT = getPointerTy(); 15626 assert((PVT == MVT::i64 || PVT == MVT::i32) && 15627 "Invalid Pointer Size!"); 15628 15629 // For v = setjmp(buf), we generate 15630 // 15631 // thisMBB: 15632 // buf[LabelOffset] = restoreMBB 15633 // SjLjSetup restoreMBB 15634 // 15635 // mainMBB: 15636 // v_main = 0 15637 // 15638 // sinkMBB: 15639 // v = phi(main, restore) 15640 // 15641 // restoreMBB: 15642 // v_restore = 1 15643 15644 MachineBasicBlock *thisMBB = MBB; 15645 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 15646 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 15647 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); 15648 MF->insert(I, mainMBB); 15649 MF->insert(I, sinkMBB); 15650 MF->push_back(restoreMBB); 15651 15652 MachineInstrBuilder MIB; 15653 15654 // Transfer the remainder of BB and its successor 
edges to sinkMBB. 15655 sinkMBB->splice(sinkMBB->begin(), MBB, 15656 llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); 15657 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 15658 15659 // thisMBB: 15660 unsigned PtrStoreOpc = 0; 15661 unsigned LabelReg = 0; 15662 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 15663 Reloc::Model RM = getTargetMachine().getRelocationModel(); 15664 bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) && 15665 (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); 15666 15667 // Prepare IP either in reg or imm. 15668 if (!UseImmLabel) { 15669 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; 15670 const TargetRegisterClass *PtrRC = getRegClassFor(PVT); 15671 LabelReg = MRI.createVirtualRegister(PtrRC); 15672 if (Subtarget->is64Bit()) { 15673 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) 15674 .addReg(X86::RIP) 15675 .addImm(0) 15676 .addReg(0) 15677 .addMBB(restoreMBB) 15678 .addReg(0); 15679 } else { 15680 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); 15681 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) 15682 .addReg(XII->getGlobalBaseReg(MF)) 15683 .addImm(0) 15684 .addReg(0) 15685 .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) 15686 .addReg(0); 15687 } 15688 } else 15689 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; 15690 // Store IP 15691 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); 15692 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 15693 if (i == X86::AddrDisp) 15694 MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); 15695 else 15696 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 15697 } 15698 if (!UseImmLabel) 15699 MIB.addReg(LabelReg); 15700 else 15701 MIB.addMBB(restoreMBB); 15702 MIB.setMemRefs(MMOBegin, MMOEnd); 15703 // Setup 15704 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) 15705 .addMBB(restoreMBB); 15706 15707 const X86RegisterInfo *RegInfo = 15708 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 15709 MIB.addRegMask(RegInfo->getNoPreservedMask()); 15710 thisMBB->addSuccessor(mainMBB); 15711 thisMBB->addSuccessor(restoreMBB); 15712 15713 // mainMBB: 15714 // EAX = 0 15715 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); 15716 mainMBB->addSuccessor(sinkMBB); 15717 15718 // sinkMBB: 15719 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 15720 TII->get(X86::PHI), DstReg) 15721 .addReg(mainDstReg).addMBB(mainMBB) 15722 .addReg(restoreDstReg).addMBB(restoreMBB); 15723 15724 // restoreMBB: 15725 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); 15726 BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB); 15727 restoreMBB->addSuccessor(sinkMBB); 15728 15729 MI->eraseFromParent(); 15730 return sinkMBB; 15731} 15732 15733MachineBasicBlock * 15734X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, 15735 MachineBasicBlock *MBB) const { 15736 DebugLoc DL = MI->getDebugLoc(); 15737 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15738 15739 MachineFunction *MF = MBB->getParent(); 15740 MachineRegisterInfo &MRI = MF->getRegInfo(); 15741 15742 // Memory Reference 15743 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 15744 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 15745 15746 MVT PVT = getPointerTy(); 15747 assert((PVT == MVT::i64 || PVT == MVT::i32) && 15748 "Invalid Pointer Size!"); 15749 15750 const TargetRegisterClass *RC = 15751 (PVT == MVT::i64) ? 
&X86::GR64RegClass : &X86::GR32RegClass; 15752 unsigned Tmp = MRI.createVirtualRegister(RC); 15753 // Since FP is only updated here but NOT referenced, it's treated as GPR. 15754 const X86RegisterInfo *RegInfo = 15755 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 15756 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; 15757 unsigned SP = RegInfo->getStackRegister(); 15758 15759 MachineInstrBuilder MIB; 15760 15761 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 15762 const int64_t SPOffset = 2 * PVT.getStoreSize(); 15763 15764 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; 15765 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; 15766 15767 // Reload FP 15768 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); 15769 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 15770 MIB.addOperand(MI->getOperand(i)); 15771 MIB.setMemRefs(MMOBegin, MMOEnd); 15772 // Reload IP 15773 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); 15774 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 15775 if (i == X86::AddrDisp) 15776 MIB.addDisp(MI->getOperand(i), LabelOffset); 15777 else 15778 MIB.addOperand(MI->getOperand(i)); 15779 } 15780 MIB.setMemRefs(MMOBegin, MMOEnd); 15781 // Reload SP 15782 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); 15783 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 15784 if (i == X86::AddrDisp) 15785 MIB.addDisp(MI->getOperand(i), SPOffset); 15786 else 15787 MIB.addOperand(MI->getOperand(i)); 15788 } 15789 MIB.setMemRefs(MMOBegin, MMOEnd); 15790 // Jump 15791 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); 15792 15793 MI->eraseFromParent(); 15794 return MBB; 15795} 15796 15797MachineBasicBlock * 15798X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 15799 MachineBasicBlock *BB) const { 15800 switch (MI->getOpcode()) { 15801 default: llvm_unreachable("Unexpected instr type to insert"); 15802 case X86::TAILJMPd64: 15803 case X86::TAILJMPr64: 15804 case X86::TAILJMPm64: 15805 llvm_unreachable("TAILJMP64 would not be touched here."); 15806 case X86::TCRETURNdi64: 15807 case X86::TCRETURNri64: 15808 case X86::TCRETURNmi64: 15809 return BB; 15810 case X86::WIN_ALLOCA: 15811 return EmitLoweredWinAlloca(MI, BB); 15812 case X86::SEG_ALLOCA_32: 15813 return EmitLoweredSegAlloca(MI, BB, false); 15814 case X86::SEG_ALLOCA_64: 15815 return EmitLoweredSegAlloca(MI, BB, true); 15816 case X86::TLSCall_32: 15817 case X86::TLSCall_64: 15818 return EmitLoweredTLSCall(MI, BB); 15819 case X86::CMOV_GR8: 15820 case X86::CMOV_FR32: 15821 case X86::CMOV_FR64: 15822 case X86::CMOV_V4F32: 15823 case X86::CMOV_V2F64: 15824 case X86::CMOV_V2I64: 15825 case X86::CMOV_V8F32: 15826 case X86::CMOV_V4F64: 15827 case X86::CMOV_V4I64: 15828 case X86::CMOV_V16F32: 15829 case X86::CMOV_V8F64: 15830 case X86::CMOV_V8I64: 15831 case X86::CMOV_GR16: 15832 case X86::CMOV_GR32: 15833 case X86::CMOV_RFP32: 15834 case X86::CMOV_RFP64: 15835 case X86::CMOV_RFP80: 15836 return EmitLoweredSelect(MI, BB); 15837 15838 case X86::FP32_TO_INT16_IN_MEM: 15839 case X86::FP32_TO_INT32_IN_MEM: 15840 case X86::FP32_TO_INT64_IN_MEM: 15841 case X86::FP64_TO_INT16_IN_MEM: 15842 case X86::FP64_TO_INT32_IN_MEM: 15843 case X86::FP64_TO_INT64_IN_MEM: 15844 case X86::FP80_TO_INT16_IN_MEM: 15845 case X86::FP80_TO_INT32_IN_MEM: 15846 case X86::FP80_TO_INT64_IN_MEM: { 15847 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15848 DebugLoc DL = MI->getDebugLoc(); 15849 15850 // Change the floating point control 
register to use "round towards zero" 15851 // mode when truncating to an integer value. 15852 MachineFunction *F = BB->getParent(); 15853 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 15854 addFrameReference(BuildMI(*BB, MI, DL, 15855 TII->get(X86::FNSTCW16m)), CWFrameIdx); 15856 15857 // Load the old value of the high byte of the control word... 15858 unsigned OldCW = 15859 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); 15860 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 15861 CWFrameIdx); 15862 15863 // Set the high part to be round to zero... 15864 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 15865 .addImm(0xC7F); 15866 15867 // Reload the modified control word now... 15868 addFrameReference(BuildMI(*BB, MI, DL, 15869 TII->get(X86::FLDCW16m)), CWFrameIdx); 15870 15871 // Restore the memory image of control word to original value 15872 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 15873 .addReg(OldCW); 15874 15875 // Get the X86 opcode to use. 15876 unsigned Opc; 15877 switch (MI->getOpcode()) { 15878 default: llvm_unreachable("illegal opcode!"); 15879 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 15880 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 15881 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 15882 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 15883 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 15884 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 15885 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 15886 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 15887 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 15888 } 15889 15890 X86AddressMode AM; 15891 MachineOperand &Op = MI->getOperand(0); 15892 if (Op.isReg()) { 15893 AM.BaseType = X86AddressMode::RegBase; 15894 AM.Base.Reg = Op.getReg(); 15895 } else { 15896 AM.BaseType = X86AddressMode::FrameIndexBase; 15897 AM.Base.FrameIndex = Op.getIndex(); 15898 } 15899 Op = MI->getOperand(1); 15900 if (Op.isImm()) 15901 AM.Scale = Op.getImm(); 15902 Op = MI->getOperand(2); 15903 if (Op.isImm()) 15904 AM.IndexReg = Op.getImm(); 15905 Op = MI->getOperand(3); 15906 if (Op.isGlobal()) { 15907 AM.GV = Op.getGlobal(); 15908 } else { 15909 AM.Disp = Op.getImm(); 15910 } 15911 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 15912 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 15913 15914 // Reload the original control word now. 15915 addFrameReference(BuildMI(*BB, MI, DL, 15916 TII->get(X86::FLDCW16m)), CWFrameIdx); 15917 15918 MI->eraseFromParent(); // The pseudo instruction is gone now. 15919 return BB; 15920 } 15921 // String/text processing lowering. 15922 case X86::PCMPISTRM128REG: 15923 case X86::VPCMPISTRM128REG: 15924 case X86::PCMPISTRM128MEM: 15925 case X86::VPCMPISTRM128MEM: 15926 case X86::PCMPESTRM128REG: 15927 case X86::VPCMPESTRM128REG: 15928 case X86::PCMPESTRM128MEM: 15929 case X86::VPCMPESTRM128MEM: 15930 assert(Subtarget->hasSSE42() && 15931 "Target must have SSE4.2 or AVX features enabled"); 15932 return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo()); 15933 15934 // String/text processing lowering. 
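// Unlike the PCMP*STRM pseudos above, whose result is copied out of
// XMM0, the PCMP*STRI pseudos below produce an index that is copied
// out of ECX; see EmitPCMPSTRI.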
15935 case X86::PCMPISTRIREG: 15936 case X86::VPCMPISTRIREG: 15937 case X86::PCMPISTRIMEM: 15938 case X86::VPCMPISTRIMEM: 15939 case X86::PCMPESTRIREG: 15940 case X86::VPCMPESTRIREG: 15941 case X86::PCMPESTRIMEM: 15942 case X86::VPCMPESTRIMEM: 15943 assert(Subtarget->hasSSE42() && 15944 "Target must have SSE4.2 or AVX features enabled"); 15945 return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo()); 15946 15947 // Thread synchronization. 15948 case X86::MONITOR: 15949 return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget); 15950 15951 // xbegin 15952 case X86::XBEGIN: 15953 return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo()); 15954 15955 // Atomic Lowering. 15956 case X86::ATOMAND8: 15957 case X86::ATOMAND16: 15958 case X86::ATOMAND32: 15959 case X86::ATOMAND64: 15960 // Fall through 15961 case X86::ATOMOR8: 15962 case X86::ATOMOR16: 15963 case X86::ATOMOR32: 15964 case X86::ATOMOR64: 15965 // Fall through 15966 case X86::ATOMXOR16: 15967 case X86::ATOMXOR8: 15968 case X86::ATOMXOR32: 15969 case X86::ATOMXOR64: 15970 // Fall through 15971 case X86::ATOMNAND8: 15972 case X86::ATOMNAND16: 15973 case X86::ATOMNAND32: 15974 case X86::ATOMNAND64: 15975 // Fall through 15976 case X86::ATOMMAX8: 15977 case X86::ATOMMAX16: 15978 case X86::ATOMMAX32: 15979 case X86::ATOMMAX64: 15980 // Fall through 15981 case X86::ATOMMIN8: 15982 case X86::ATOMMIN16: 15983 case X86::ATOMMIN32: 15984 case X86::ATOMMIN64: 15985 // Fall through 15986 case X86::ATOMUMAX8: 15987 case X86::ATOMUMAX16: 15988 case X86::ATOMUMAX32: 15989 case X86::ATOMUMAX64: 15990 // Fall through 15991 case X86::ATOMUMIN8: 15992 case X86::ATOMUMIN16: 15993 case X86::ATOMUMIN32: 15994 case X86::ATOMUMIN64: 15995 return EmitAtomicLoadArith(MI, BB); 15996 15997 // This group does 64-bit operations on a 32-bit host. 15998 case X86::ATOMAND6432: 15999 case X86::ATOMOR6432: 16000 case X86::ATOMXOR6432: 16001 case X86::ATOMNAND6432: 16002 case X86::ATOMADD6432: 16003 case X86::ATOMSUB6432: 16004 case X86::ATOMMAX6432: 16005 case X86::ATOMMIN6432: 16006 case X86::ATOMUMAX6432: 16007 case X86::ATOMUMIN6432: 16008 case X86::ATOMSWAP6432: 16009 return EmitAtomicLoadArith6432(MI, BB); 16010 16011 case X86::VASTART_SAVE_XMM_REGS: 16012 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 16013 16014 case X86::VAARG_64: 16015 return EmitVAARG64WithCustomInserter(MI, BB); 16016 16017 case X86::EH_SjLj_SetJmp32: 16018 case X86::EH_SjLj_SetJmp64: 16019 return emitEHSjLjSetJmp(MI, BB); 16020 16021 case X86::EH_SjLj_LongJmp32: 16022 case X86::EH_SjLj_LongJmp64: 16023 return emitEHSjLjLongJmp(MI, BB); 16024 } 16025} 16026 16027//===----------------------------------------------------------------------===// 16028// X86 Optimization Hooks 16029//===----------------------------------------------------------------------===// 16030 16031void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 16032 APInt &KnownZero, 16033 APInt &KnownOne, 16034 const SelectionDAG &DAG, 16035 unsigned Depth) const { 16036 unsigned BitWidth = KnownZero.getBitWidth(); 16037 unsigned Opc = Op.getOpcode(); 16038 assert((Opc >= ISD::BUILTIN_OP_END || 16039 Opc == ISD::INTRINSIC_WO_CHAIN || 16040 Opc == ISD::INTRINSIC_W_CHAIN || 16041 Opc == ISD::INTRINSIC_VOID) && 16042 "Should use MaskedValueIsZero if you don't know whether Op" 16043 " is a target node!"); 16044 16045 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 
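// Worked example for the X86ISD::SETCC case below: SETCC materializes
// only 0 or 1, so for an i32 result every bit but bit 0 is known zero
// and KnownZero becomes the high-31-bit mask 0xFFFFFFFE.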
16046 switch (Opc) { 16047 default: break; 16048 case X86ISD::ADD: 16049 case X86ISD::SUB: 16050 case X86ISD::ADC: 16051 case X86ISD::SBB: 16052 case X86ISD::SMUL: 16053 case X86ISD::UMUL: 16054 case X86ISD::INC: 16055 case X86ISD::DEC: 16056 case X86ISD::OR: 16057 case X86ISD::XOR: 16058 case X86ISD::AND: 16059 // These nodes' second result is a boolean. 16060 if (Op.getResNo() == 0) 16061 break; 16062 // Fallthrough 16063 case X86ISD::SETCC: 16064 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 16065 break; 16066 case ISD::INTRINSIC_WO_CHAIN: { 16067 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 16068 unsigned NumLoBits = 0; 16069 switch (IntId) { 16070 default: break; 16071 case Intrinsic::x86_sse_movmsk_ps: 16072 case Intrinsic::x86_avx_movmsk_ps_256: 16073 case Intrinsic::x86_sse2_movmsk_pd: 16074 case Intrinsic::x86_avx_movmsk_pd_256: 16075 case Intrinsic::x86_mmx_pmovmskb: 16076 case Intrinsic::x86_sse2_pmovmskb_128: 16077 case Intrinsic::x86_avx2_pmovmskb: { 16078 // High bits of movmskp{s|d}, pmovmskb are known zero. 16079 switch (IntId) { 16080 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 16081 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; 16082 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; 16083 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; 16084 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; 16085 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; 16086 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; 16087 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; 16088 } 16089 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); 16090 break; 16091 } 16092 } 16093 break; 16094 } 16095 } 16096} 16097 16098unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 16099 unsigned Depth) const { 16100 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 16101 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 16102 return Op.getValueType().getScalarType().getSizeInBits(); 16103 16104 // Fallback case. 16105 return 1; 16106} 16107 16108/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 16109/// node is a GlobalAddress + offset. 
16110bool X86TargetLowering::isGAPlusOffset(SDNode *N, 16111 const GlobalValue* &GA, 16112 int64_t &Offset) const { 16113 if (N->getOpcode() == X86ISD::Wrapper) { 16114 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 16115 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 16116 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 16117 return true; 16118 } 16119 } 16120 return TargetLowering::isGAPlusOffset(N, GA, Offset); 16121} 16122 16123/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 16124/// same as extracting the high 128-bit part of 256-bit vector and then 16125/// inserting the result into the low part of a new 256-bit vector 16126static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 16127 EVT VT = SVOp->getValueType(0); 16128 unsigned NumElems = VT.getVectorNumElements(); 16129 16130 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 16131 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) 16132 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 16133 SVOp->getMaskElt(j) >= 0) 16134 return false; 16135 16136 return true; 16137} 16138 16139/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 16140/// same as extracting the low 128-bit part of 256-bit vector and then 16141/// inserting the result into the high part of a new 256-bit vector 16142static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 16143 EVT VT = SVOp->getValueType(0); 16144 unsigned NumElems = VT.getVectorNumElements(); 16145 16146 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 16147 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) 16148 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 16149 SVOp->getMaskElt(j) >= 0) 16150 return false; 16151 16152 return true; 16153} 16154 16155/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 16156static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 16157 TargetLowering::DAGCombinerInfo &DCI, 16158 const X86Subtarget* Subtarget) { 16159 SDLoc dl(N); 16160 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 16161 SDValue V1 = SVOp->getOperand(0); 16162 SDValue V2 = SVOp->getOperand(1); 16163 EVT VT = SVOp->getValueType(0); 16164 unsigned NumElems = VT.getVectorNumElements(); 16165 16166 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 16167 V2.getOpcode() == ISD::CONCAT_VECTORS) { 16168 // 16169 // 0,0,0,... 16170 // | 16171 // V UNDEF BUILD_VECTOR UNDEF 16172 // \ / \ / 16173 // CONCAT_VECTOR CONCAT_VECTOR 16174 // \ / 16175 // \ / 16176 // RESULT: V + zero extended 16177 // 16178 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 16179 V2.getOperand(1).getOpcode() != ISD::UNDEF || 16180 V1.getOperand(1).getOpcode() != ISD::UNDEF) 16181 return SDValue(); 16182 16183 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 16184 return SDValue(); 16185 16186 // To match the shuffle mask, the first half of the mask should 16187 // be exactly the first vector, and all the rest a splat with the 16188 // first element of the second one. 16189 for (unsigned i = 0; i != NumElems/2; ++i) 16190 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 16191 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 16192 return SDValue(); 16193 16194 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 
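// e.g. shuffle(concat(X, undef), concat(all-zeros, undef), <0,1,4,4>)
// where X is a one-use v2i64 load: the low 128 bits come straight from
// memory and the high half is zero, which is precisely the semantics
// of X86ISD::VZEXT_LOAD.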
16195 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { 16196 if (Ld->hasNUsesOfValue(1, 0)) { 16197 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); 16198 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; 16199 SDValue ResNode = 16200 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 16201 array_lengthof(Ops), 16202 Ld->getMemoryVT(), 16203 Ld->getPointerInfo(), 16204 Ld->getAlignment(), 16205 false/*isVolatile*/, true/*ReadMem*/, 16206 false/*WriteMem*/); 16207 16208 // Make sure the newly-created LOAD is in the same position as Ld in 16209 // terms of dependency. We create a TokenFactor for Ld and ResNode, 16210 // and update uses of Ld's output chain to use the TokenFactor. 16211 if (Ld->hasAnyUseOfValue(1)) { 16212 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 16213 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); 16214 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); 16215 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), 16216 SDValue(ResNode.getNode(), 1)); 16217 } 16218 16219 return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); 16220 } 16221 } 16222 16223 // Emit a zeroed vector and insert the desired subvector on its 16224 // first half. 16225 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 16226 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); 16227 return DCI.CombineTo(N, InsV); 16228 } 16229 16230 //===--------------------------------------------------------------------===// 16231 // Combine some shuffles into subvector extracts and inserts: 16232 // 16233 16234 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 16235 if (isShuffleHigh128VectorInsertLow(SVOp)) { 16236 SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); 16237 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); 16238 return DCI.CombineTo(N, InsV); 16239 } 16240 16241 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 16242 if (isShuffleLow128VectorInsertHigh(SVOp)) { 16243 SDValue V = Extract128BitVector(V1, 0, DAG, dl); 16244 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); 16245 return DCI.CombineTo(N, InsV); 16246 } 16247 16248 return SDValue(); 16249} 16250 16251/// PerformShuffleCombine - Performs several different shuffle combines. 16252static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 16253 TargetLowering::DAGCombinerInfo &DCI, 16254 const X86Subtarget *Subtarget) { 16255 SDLoc dl(N); 16256 EVT VT = N->getValueType(0); 16257 16258 // Don't create instructions with illegal types after legalize types has run. 16259 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16260 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 16261 return SDValue(); 16262 16263 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode 16264 if (Subtarget->hasFp256() && VT.is256BitVector() && 16265 N->getOpcode() == ISD::VECTOR_SHUFFLE) 16266 return PerformShuffleCombine256(N, DAG, DCI, Subtarget); 16267 16268 // Only handle 128 wide vector from here on. 16269 if (!VT.is128BitVector()) 16270 return SDValue(); 16271 16272 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 16273 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 16274 // consecutive, non-overlapping, and in the right order. 
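// Worked example: a <4 x i32> built from loads of A, A+4, A+8 and A+12,
// shuffled by <0, 1, 2, 3>, collapses into a single 16-byte load from A;
// EltsFromConsecutiveLoads performs those checks and builds the load.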
16275  SmallVector<SDValue, 16> Elts; 16276  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 16277    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 16278 16279  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); 16280} 16281 16282/// PerformTruncateCombine - Converts truncate operation to 16283/// a sequence of vector shuffle operations. 16284/// It is possible when we truncate a 256-bit vector to a 128-bit vector 16285static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, 16286                                      TargetLowering::DAGCombinerInfo &DCI, 16287                                      const X86Subtarget *Subtarget)  { 16288  return SDValue(); 16289} 16290 16291/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target 16292/// specific shuffle of a load can be folded into a single element load. 16293/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but 16294/// shuffles have been custom lowered so we need to handle those here. 16295static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, 16296                                         TargetLowering::DAGCombinerInfo &DCI) { 16297  if (DCI.isBeforeLegalizeOps()) 16298    return SDValue(); 16299 16300  SDValue InVec = N->getOperand(0); 16301  SDValue EltNo = N->getOperand(1); 16302 16303  if (!isa<ConstantSDNode>(EltNo)) 16304    return SDValue(); 16305 16306  EVT VT = InVec.getValueType(); 16307 16308  bool HasShuffleIntoBitcast = false; 16309  if (InVec.getOpcode() == ISD::BITCAST) { 16310    // Don't duplicate a load with other uses. 16311    if (!InVec.hasOneUse()) 16312      return SDValue(); 16313    EVT BCVT = InVec.getOperand(0).getValueType(); 16314    if (BCVT.getVectorNumElements() != VT.getVectorNumElements()) 16315      return SDValue(); 16316    InVec = InVec.getOperand(0); 16317    HasShuffleIntoBitcast = true; 16318  } 16319 16320  if (!isTargetShuffle(InVec.getOpcode())) 16321    return SDValue(); 16322 16323  // Don't duplicate a load with other uses. 16324  if (!InVec.hasOneUse()) 16325    return SDValue(); 16326 16327  SmallVector<int, 16> ShuffleMask; 16328  bool UnaryShuffle; 16329  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask, 16330                            UnaryShuffle)) 16331    return SDValue(); 16332 16333  // Select the input vector, guarding against an out-of-range extract index. 16334  unsigned NumElems = VT.getVectorNumElements(); 16335  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 16336  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; 16337  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) 16338                                         : InVec.getOperand(1); 16339 16340  // If both shuffle operands are the same vector, then allow 2 uses. 16341  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; 16342 16343  if (LdNode.getOpcode() == ISD::BITCAST) { 16344    // Don't duplicate a load with other uses. 16345    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) 16346      return SDValue(); 16347 16348    AllowedUses = 1; // Only allow one load use if we have a bitcast. 16349    LdNode = LdNode.getOperand(0); 16350  } 16351 16352  if (!ISD::isNormalLoad(LdNode.getNode())) 16353    return SDValue(); 16354 16355  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); 16356 16357  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) 16358    return SDValue(); 16359 16360  if (HasShuffleIntoBitcast) { 16361    // If there's a bitcast before the shuffle, check if the load type and 16362    // alignment are valid.
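// Example of the check below: a shuffle of a 4-byte-aligned load
// bitcast to v4i32 must be rejected, because getABITypeAlignment of
// the wide type can demand more (typically 16 bytes) than the
// original load guarantees.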
16363    unsigned Align = LN0->getAlignment(); 16364    const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16365    unsigned NewAlign = TLI.getDataLayout()-> 16366      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); 16367 16368    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 16369      return SDValue(); 16370  } 16371 16372  // All checks match, so transform back to vector_shuffle so that the DAG 16373  // combiner can finish the job. 16374  SDLoc dl(N); 16375 16376  // Create the shuffle node, taking into account that it may be a unary shuffle. 16377  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); 16378  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl, 16379                                 InVec.getOperand(0), Shuffle, 16380                                 &ShuffleMask[0]); 16381  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 16382  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, 16383                     EltNo); 16384} 16385 16386/// Extract one bit from a mask vector, like v16i1 or v8i1. 16387/// AVX-512 feature. 16388static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) { 16389  SDValue Vec = N->getOperand(0); 16390  SDLoc dl(Vec); 16391  MVT VecVT = Vec.getSimpleValueType(); 16392  SDValue Idx = N->getOperand(1); 16393  MVT EltVT = N->getSimpleValueType(0); 16394 16395  assert(VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8 && 16396         "Unexpected operands in ExtractBitFromMaskVector"); 16397 16398  // Variable index: widen the mask and extract from the wide vector. 16399  if (!isa<ConstantSDNode>(Idx)) { 16400    MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); 16401    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); 16402    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 16403                              ExtVT.getVectorElementType(), Ext, Idx); 16404    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 16405  } 16406 16407  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 16408 16409  MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits()); 16410  unsigned MaxShift = VecVT.getSizeInBits() - 1; 16411  Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec); 16412  Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec, 16413                    DAG.getConstant(MaxShift - IdxVal, ScalarVT)); 16414  Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec, 16415                    DAG.getConstant(MaxShift, ScalarVT)); 16416 16417  if (VecVT == MVT::v16i1) { 16418    Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec); 16419    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec); 16420  } 16421  return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec); 16422} 16423 16424/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 16425/// generation and convert it from being a bunch of shuffles and extracts 16426/// into a simple store and scalar loads to extract the elements. 16427static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 16428                                         TargetLowering::DAGCombinerInfo &DCI) { 16429  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); 16430  if (NewOp.getNode()) 16431    return NewOp; 16432 16433  SDValue InputVector = N->getOperand(0); 16434 16435  if (InputVector.getValueType().getVectorElementType() == MVT::i1 && 16436      !DCI.isBeforeLegalize()) 16437    return ExtractBitFromMaskVector(N, DAG); 16438 16439  // Detect whether we are trying to convert from mmx to i32 and the bitcast 16440  // from mmx to v2i32 has a single usage.
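// i.e. (i32 (extract_vector_elt (v2i32 (bitcast (x86mmx X))), idx))
// with a single-use bitcast is rewritten below to
// (i32 (X86ISD::MMX_MOVD2W X)).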
16441  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST && 16442      InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx && 16443      InputVector.hasOneUse() && N->getValueType(0) == MVT::i32) 16444    return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), 16445                       N->getValueType(0), 16446                       InputVector.getNode()->getOperand(0)); 16447 16448  // Only operate on vectors of 4 elements, where the alternative shuffling 16449  // gets to be more expensive. 16450  if (InputVector.getValueType() != MVT::v4i32) 16451    return SDValue(); 16452 16453  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 16454  // single use which is a sign-extend or zero-extend, and all elements are 16455  // used. 16456  SmallVector<SDNode *, 4> Uses; 16457  unsigned ExtractedElements = 0; 16458  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 16459       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 16460    if (UI.getUse().getResNo() != InputVector.getResNo()) 16461      return SDValue(); 16462 16463    SDNode *Extract = *UI; 16464    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 16465      return SDValue(); 16466 16467    if (Extract->getValueType(0) != MVT::i32) 16468      return SDValue(); 16469    if (!Extract->hasOneUse()) 16470      return SDValue(); 16471    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 16472        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 16473      return SDValue(); 16474    if (!isa<ConstantSDNode>(Extract->getOperand(1))) 16475      return SDValue(); 16476 16477    // Record which element was extracted. 16478    ExtractedElements |= 16479      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 16480 16481    Uses.push_back(Extract); 16482  } 16483 16484  // If not all the elements were used, this may not be worthwhile. 16485  if (ExtractedElements != 15) 16486    return SDValue(); 16487 16488  // Ok, we've now decided to do the transformation. 16489  SDLoc dl(InputVector); 16490 16491  // Store the value to a temporary stack slot. 16492  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 16493  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 16494                            MachinePointerInfo(), false, false, 0); 16495 16496  // Replace each use (extract) with a load of the appropriate element. 16497  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 16498       UE = Uses.end(); UI != UE; ++UI) { 16499    SDNode *Extract = *UI; 16500 16501    // Compute the element's address. 16502    SDValue Idx = Extract->getOperand(1); 16503    unsigned EltSize = 16504        InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 16505    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 16506    const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16507    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 16508 16509    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), 16510                                     StackPtr, OffsetVal); 16511 16512    // Load the scalar. 16513    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 16514                                     ScalarAddr, MachinePointerInfo(), 16515                                     false, false, false, 0); 16516 16517    // Replace the extract with the load. 16518    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 16519  } 16520 16521  // The replacement was made in place; don't return anything. 16522  return SDValue(); 16523} 16524 16525/// \brief Matches a VSELECT onto min/max or returns 0 if the node doesn't match.
16526static std::pair<unsigned, bool> 16527matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, 16528 SelectionDAG &DAG, const X86Subtarget *Subtarget) { 16529 if (!VT.isVector()) 16530 return std::make_pair(0, false); 16531 16532 bool NeedSplit = false; 16533 switch (VT.getSimpleVT().SimpleTy) { 16534 default: return std::make_pair(0, false); 16535 case MVT::v32i8: 16536 case MVT::v16i16: 16537 case MVT::v8i32: 16538 if (!Subtarget->hasAVX2()) 16539 NeedSplit = true; 16540 if (!Subtarget->hasAVX()) 16541 return std::make_pair(0, false); 16542 break; 16543 case MVT::v16i8: 16544 case MVT::v8i16: 16545 case MVT::v4i32: 16546 if (!Subtarget->hasSSE2()) 16547 return std::make_pair(0, false); 16548 } 16549 16550 // SSE2 has only a small subset of the operations. 16551 bool hasUnsigned = Subtarget->hasSSE41() || 16552 (Subtarget->hasSSE2() && VT == MVT::v16i8); 16553 bool hasSigned = Subtarget->hasSSE41() || 16554 (Subtarget->hasSSE2() && VT == MVT::v8i16); 16555 16556 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 16557 16558 unsigned Opc = 0; 16559 // Check for x CC y ? x : y. 16560 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 16561 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 16562 switch (CC) { 16563 default: break; 16564 case ISD::SETULT: 16565 case ISD::SETULE: 16566 Opc = hasUnsigned ? X86ISD::UMIN : 0; break; 16567 case ISD::SETUGT: 16568 case ISD::SETUGE: 16569 Opc = hasUnsigned ? X86ISD::UMAX : 0; break; 16570 case ISD::SETLT: 16571 case ISD::SETLE: 16572 Opc = hasSigned ? X86ISD::SMIN : 0; break; 16573 case ISD::SETGT: 16574 case ISD::SETGE: 16575 Opc = hasSigned ? X86ISD::SMAX : 0; break; 16576 } 16577 // Check for x CC y ? y : x -- a min/max with reversed arms. 16578 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 16579 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 16580 switch (CC) { 16581 default: break; 16582 case ISD::SETULT: 16583 case ISD::SETULE: 16584 Opc = hasUnsigned ? X86ISD::UMAX : 0; break; 16585 case ISD::SETUGT: 16586 case ISD::SETUGE: 16587 Opc = hasUnsigned ? X86ISD::UMIN : 0; break; 16588 case ISD::SETLT: 16589 case ISD::SETLE: 16590 Opc = hasSigned ? X86ISD::SMAX : 0; break; 16591 case ISD::SETGT: 16592 case ISD::SETGE: 16593 Opc = hasSigned ? X86ISD::SMIN : 0; break; 16594 } 16595 } 16596 16597 return std::make_pair(Opc, NeedSplit); 16598} 16599 16600/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT 16601/// nodes. 16602static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 16603 TargetLowering::DAGCombinerInfo &DCI, 16604 const X86Subtarget *Subtarget) { 16605 SDLoc DL(N); 16606 SDValue Cond = N->getOperand(0); 16607 // Get the LHS/RHS of the select. 16608 SDValue LHS = N->getOperand(1); 16609 SDValue RHS = N->getOperand(2); 16610 EVT VT = LHS.getValueType(); 16611 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16612 16613 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 16614 // instructions match the semantics of the common C idiom x<y?x:y but not 16615 // x<=y?x:y, because of how they handle negative zero (which can be 16616 // ignored in unsafe-math mode). 16617 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && 16618 VT != MVT::f80 && TLI.isTypeLegal(VT) && 16619 (Subtarget->hasSSE2() || 16620 (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { 16621 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 16622 16623 unsigned Opcode = 0; 16624 // Check for x CC y ? x : y. 
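    // For reference (illustrative): MINSS/MINPS implement 'SRC1 < SRC2 ?
    // SRC1 : SRC2', returning the second operand both for NaNs and when the
    // inputs are +0.0 and -0.0; that is why the cases below consult
    // isKnownNeverNaN / isKnownNeverZero before choosing FMIN/FMAX or
    // swapping the operands.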
16625 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 16626 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 16627 switch (CC) { 16628 default: break; 16629 case ISD::SETULT: 16630 // Converting this to a min would handle NaNs incorrectly, and swapping 16631 // the operands would cause it to handle comparisons between positive 16632 // and negative zero incorrectly. 16633 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 16634 if (!DAG.getTarget().Options.UnsafeFPMath && 16635 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 16636 break; 16637 std::swap(LHS, RHS); 16638 } 16639 Opcode = X86ISD::FMIN; 16640 break; 16641 case ISD::SETOLE: 16642 // Converting this to a min would handle comparisons between positive 16643 // and negative zero incorrectly. 16644 if (!DAG.getTarget().Options.UnsafeFPMath && 16645 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 16646 break; 16647 Opcode = X86ISD::FMIN; 16648 break; 16649 case ISD::SETULE: 16650 // Converting this to a min would handle both negative zeros and NaNs 16651 // incorrectly, but we can swap the operands to fix both. 16652 std::swap(LHS, RHS); 16653 case ISD::SETOLT: 16654 case ISD::SETLT: 16655 case ISD::SETLE: 16656 Opcode = X86ISD::FMIN; 16657 break; 16658 16659 case ISD::SETOGE: 16660 // Converting this to a max would handle comparisons between positive 16661 // and negative zero incorrectly. 16662 if (!DAG.getTarget().Options.UnsafeFPMath && 16663 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 16664 break; 16665 Opcode = X86ISD::FMAX; 16666 break; 16667 case ISD::SETUGT: 16668 // Converting this to a max would handle NaNs incorrectly, and swapping 16669 // the operands would cause it to handle comparisons between positive 16670 // and negative zero incorrectly. 16671 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 16672 if (!DAG.getTarget().Options.UnsafeFPMath && 16673 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 16674 break; 16675 std::swap(LHS, RHS); 16676 } 16677 Opcode = X86ISD::FMAX; 16678 break; 16679 case ISD::SETUGE: 16680 // Converting this to a max would handle both negative zeros and NaNs 16681 // incorrectly, but we can swap the operands to fix both. 16682 std::swap(LHS, RHS); 16683 case ISD::SETOGT: 16684 case ISD::SETGT: 16685 case ISD::SETGE: 16686 Opcode = X86ISD::FMAX; 16687 break; 16688 } 16689 // Check for x CC y ? y : x -- a min/max with reversed arms. 16690 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 16691 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 16692 switch (CC) { 16693 default: break; 16694 case ISD::SETOGE: 16695 // Converting this to a min would handle comparisons between positive 16696 // and negative zero incorrectly, and swapping the operands would 16697 // cause it to handle NaNs incorrectly. 16698 if (!DAG.getTarget().Options.UnsafeFPMath && 16699 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 16700 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 16701 break; 16702 std::swap(LHS, RHS); 16703 } 16704 Opcode = X86ISD::FMIN; 16705 break; 16706 case ISD::SETUGT: 16707 // Converting this to a min would handle NaNs incorrectly. 16708 if (!DAG.getTarget().Options.UnsafeFPMath && 16709 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 16710 break; 16711 Opcode = X86ISD::FMIN; 16712 break; 16713 case ISD::SETUGE: 16714 // Converting this to a min would handle both negative zeros and NaNs 16715 // incorrectly, but we can swap the operands to fix both. 
16716 std::swap(LHS, RHS); 16717 case ISD::SETOGT: 16718 case ISD::SETGT: 16719 case ISD::SETGE: 16720 Opcode = X86ISD::FMIN; 16721 break; 16722 16723 case ISD::SETULT: 16724 // Converting this to a max would handle NaNs incorrectly. 16725 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 16726 break; 16727 Opcode = X86ISD::FMAX; 16728 break; 16729 case ISD::SETOLE: 16730 // Converting this to a max would handle comparisons between positive 16731 // and negative zero incorrectly, and swapping the operands would 16732 // cause it to handle NaNs incorrectly. 16733 if (!DAG.getTarget().Options.UnsafeFPMath && 16734 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 16735 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 16736 break; 16737 std::swap(LHS, RHS); 16738 } 16739 Opcode = X86ISD::FMAX; 16740 break; 16741 case ISD::SETULE: 16742 // Converting this to a max would handle both negative zeros and NaNs 16743 // incorrectly, but we can swap the operands to fix both. 16744 std::swap(LHS, RHS); 16745 case ISD::SETOLT: 16746 case ISD::SETLT: 16747 case ISD::SETLE: 16748 Opcode = X86ISD::FMAX; 16749 break; 16750 } 16751 } 16752 16753 if (Opcode) 16754 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 16755 } 16756 16757 EVT CondVT = Cond.getValueType(); 16758 if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && 16759 CondVT.getVectorElementType() == MVT::i1) { 16760 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper 16761 // lowering on AVX-512. In this case we convert it to 16762 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. 16763 // The same situation for all 128 and 256-bit vectors of i8 and i16 16764 EVT OpVT = LHS.getValueType(); 16765 if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && 16766 (OpVT.getVectorElementType() == MVT::i8 || 16767 OpVT.getVectorElementType() == MVT::i16)) { 16768 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); 16769 DCI.AddToWorklist(Cond.getNode()); 16770 return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); 16771 } 16772 } 16773 // If this is a select between two integer constants, try to do some 16774 // optimizations. 16775 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 16776 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 16777 // Don't do this for crazy integer types. 16778 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 16779 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 16780 // so that TrueC (the true value) is larger than FalseC. 16781 bool NeedsCondInvert = false; 16782 16783 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 16784 // Efficiently invertible. 16785 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 16786 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 16787 isa<ConstantSDNode>(Cond.getOperand(1))))) { 16788 NeedsCondInvert = true; 16789 std::swap(TrueC, FalseC); 16790 } 16791 16792 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 16793 if (FalseC->getAPIntValue() == 0 && 16794 TrueC->getAPIntValue().isPowerOf2()) { 16795 if (NeedsCondInvert) // Invert the condition if needed. 16796 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 16797 DAG.getConstant(1, Cond.getValueType())); 16798 16799 // Zero extend the condition if needed. 
16800         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
16801
16802         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
16803         return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
16804                            DAG.getConstant(ShAmt, MVT::i8));
16805       }
16806
16807       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
16808       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
16809         if (NeedsCondInvert) // Invert the condition if needed.
16810           Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
16811                              DAG.getConstant(1, Cond.getValueType()));
16812
16813         // Zero extend the condition if needed.
16814         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
16815                            FalseC->getValueType(0), Cond);
16816         return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
16817                            SDValue(FalseC, 0));
16818       }
16819
16820       // Optimize cases that will turn into an LEA instruction. This requires
16821       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
16822       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
16823         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
16824         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
16825
16826         bool isFastMultiplier = false;
16827         if (Diff < 10) {
16828           switch ((unsigned char)Diff) {
16829           default: break;
16830           case 1:  // result = add base, cond
16831           case 2:  // result = lea base(    , cond*2)
16832           case 3:  // result = lea base(cond, cond*2)
16833           case 4:  // result = lea base(    , cond*4)
16834           case 5:  // result = lea base(cond, cond*4)
16835           case 8:  // result = lea base(    , cond*8)
16836           case 9:  // result = lea base(cond, cond*8)
16837             isFastMultiplier = true;
16838             break;
16839           }
16840         }
16841
16842         if (isFastMultiplier) {
16843           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
16844           if (NeedsCondInvert) // Invert the condition if needed.
16845             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
16846                                DAG.getConstant(1, Cond.getValueType()));
16847
16848           // Zero extend the condition if needed.
16849           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
16850                              Cond);
16851           // Scale the condition by the difference.
16852           if (Diff != 1)
16853             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
16854                                DAG.getConstant(Diff, Cond.getValueType()));
16855
16856           // Add the base if non-zero.
16857           if (FalseC->getAPIntValue() != 0)
16858             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
16859                                SDValue(FalseC, 0));
16860           return Cond;
16861         }
16862       }
16863     }
16864   }
16865
16866   // Canonicalize max and min:
16867   // (x > y) ? x : y -> (x >= y) ? x : y
16868   // (x < y) ? x : y -> (x <= y) ? x : y
16869   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
16870   // the need for an extra compare
16871   // against zero. e.g.:
16872   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
16873   //   subl   %esi, %edi
16874   //   testl  %edi, %edi
16875   //   movl   $0, %eax
16876   //   cmovgl %edi, %eax
16877   // =>
16878   //   xorl   %eax, %eax
16879   //   subl   %esi, %edi
16880   //   cmovsl %eax, %edi
16881   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
16882       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
16883       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
16884     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16885     switch (CC) {
16886     default: break;
16887     case ISD::SETLT:
16888     case ISD::SETGT: {
16889       ISD::CondCode NewCC = (CC == ISD::SETLT) ?
ISD::SETLE : ISD::SETGE; 16890 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), 16891 Cond.getOperand(0), Cond.getOperand(1), NewCC); 16892 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); 16893 } 16894 } 16895 } 16896 16897 // Early exit check 16898 if (!TLI.isTypeLegal(VT)) 16899 return SDValue(); 16900 16901 // Match VSELECTs into subs with unsigned saturation. 16902 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && 16903 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. 16904 ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || 16905 (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { 16906 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 16907 16908 // Check if one of the arms of the VSELECT is a zero vector. If it's on the 16909 // left side invert the predicate to simplify logic below. 16910 SDValue Other; 16911 if (ISD::isBuildVectorAllZeros(LHS.getNode())) { 16912 Other = RHS; 16913 CC = ISD::getSetCCInverse(CC, true); 16914 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { 16915 Other = LHS; 16916 } 16917 16918 if (Other.getNode() && Other->getNumOperands() == 2 && 16919 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { 16920 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); 16921 SDValue CondRHS = Cond->getOperand(1); 16922 16923 // Look for a general sub with unsigned saturation first. 16924 // x >= y ? x-y : 0 --> subus x, y 16925 // x > y ? x-y : 0 --> subus x, y 16926 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && 16927 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) 16928 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); 16929 16930 // If the RHS is a constant we have to reverse the const canonicalization. 16931 // x > C-1 ? x+-C : 0 --> subus x, C 16932 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && 16933 isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { 16934 APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); 16935 if (CondRHS.getConstantOperandVal(0) == -A-1) 16936 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, 16937 DAG.getConstant(-A, VT)); 16938 } 16939 16940 // Another special case: If C was a sign bit, the sub has been 16941 // canonicalized into a xor. 16942 // FIXME: Would it be better to use ComputeMaskedBits to determine whether 16943 // it's safe to decanonicalize the xor? 16944 // x s< 0 ? x^C : 0 --> subus x, C 16945 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && 16946 ISD::isBuildVectorAllZeros(CondRHS.getNode()) && 16947 isSplatVector(OpRHS.getNode())) { 16948 APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); 16949 if (A.isSignBit()) 16950 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); 16951 } 16952 } 16953 } 16954 16955 // Try to match a min/max vector operation. 
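  // e.g. (illustrative):
  //   (vselect (setcc ult X, Y), X, Y) -> (X86ISD::UMIN X, Y)   [PMINUD etc.]
  //   (vselect (setcc sgt X, Y), Y, X) -> (X86ISD::SMIN X, Y)   [reversed arms]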
16956 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) { 16957 std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget); 16958 unsigned Opc = ret.first; 16959 bool NeedSplit = ret.second; 16960 16961 if (Opc && NeedSplit) { 16962 unsigned NumElems = VT.getVectorNumElements(); 16963 // Extract the LHS vectors 16964 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL); 16965 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL); 16966 16967 // Extract the RHS vectors 16968 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL); 16969 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL); 16970 16971 // Create min/max for each subvector 16972 LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1); 16973 RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2); 16974 16975 // Merge the result 16976 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS); 16977 } else if (Opc) 16978 return DAG.getNode(Opc, DL, VT, LHS, RHS); 16979 } 16980 16981 // Simplify vector selection if the selector will be produced by CMPP*/PCMP*. 16982 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && 16983 // Check if SETCC has already been promoted 16984 TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) { 16985 16986 assert(Cond.getValueType().isVector() && 16987 "vector select expects a vector selector!"); 16988 16989 EVT IntVT = Cond.getValueType(); 16990 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); 16991 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); 16992 16993 if (!TValIsAllOnes && !FValIsAllZeros) { 16994 // Try invert the condition if true value is not all 1s and false value 16995 // is not all 0s. 16996 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); 16997 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); 16998 16999 if (TValIsAllZeros || FValIsAllOnes) { 17000 SDValue CC = Cond.getOperand(2); 17001 ISD::CondCode NewCC = 17002 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), 17003 Cond.getOperand(0).getValueType().isInteger()); 17004 Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); 17005 std::swap(LHS, RHS); 17006 TValIsAllOnes = FValIsAllOnes; 17007 FValIsAllZeros = TValIsAllZeros; 17008 } 17009 } 17010 17011 if (TValIsAllOnes || FValIsAllZeros) { 17012 SDValue Ret; 17013 17014 if (TValIsAllOnes && FValIsAllZeros) 17015 Ret = Cond; 17016 else if (TValIsAllOnes) 17017 Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond, 17018 DAG.getNode(ISD::BITCAST, DL, IntVT, RHS)); 17019 else if (FValIsAllZeros) 17020 Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond, 17021 DAG.getNode(ISD::BITCAST, DL, IntVT, LHS)); 17022 17023 return DAG.getNode(ISD::BITCAST, DL, VT, Ret); 17024 } 17025 } 17026 17027 // If we know that this node is legal then we know that it is going to be 17028 // matched by one of the SSE/AVX BLEND instructions. These instructions only 17029 // depend on the highest bit in each word. Try to use SimplifyDemandedBits 17030 // to simplify previous instructions. 17031 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && 17032 !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) { 17033 unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); 17034 17035 // Don't optimize vector selects that map to mask-registers. 
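    // (Illustrative) BLENDVPS/BLENDVPD/PBLENDVB select on the sign bit of
    // each condition lane only, so for e.g. v4i32 the demanded mask computed
    // below is just the top bit (0x80000000) of every element.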
17036     if (BitWidth == 1)
17037       return SDValue();
17038
17039     // Check all uses of that condition operand to check whether it will be
17040     // consumed by non-BLEND instructions, which may depend on all bits being
17041     // set properly.
17042     for (SDNode::use_iterator I = Cond->use_begin(),
17043          E = Cond->use_end(); I != E; ++I)
17044       if (I->getOpcode() != ISD::VSELECT)
17045         // TODO: Add other opcodes eventually lowered into BLEND.
17046         return SDValue();
17047
17048     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
17049     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
17050
17051     APInt KnownZero, KnownOne;
17052     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
17053                                           DCI.isBeforeLegalizeOps());
17054     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
17055         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
17056       DCI.CommitTargetLoweringOpt(TLO);
17057   }
17058
17059   return SDValue();
17060 }
17061
17062 // Check whether a boolean test is testing a boolean value generated by
17063 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
17064 // code.
17065 //
17066 // Simplify the following patterns:
17067 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
17068 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
17069 // to (Op EFLAGS Cond)
17070 //
17071 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
17072 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
17073 // to (Op EFLAGS !Cond)
17074 //
17075 // where Op could be BRCOND or CMOV.
17076 //
17077 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
17078   // Quit if this is neither a CMP nor a SUB whose value result is unused.
17079   if (Cmp.getOpcode() != X86ISD::CMP &&
17080       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
17081     return SDValue();
17082
17083   // Quit if not used as a boolean value.
17084   if (CC != X86::COND_E && CC != X86::COND_NE)
17085     return SDValue();
17086
17087   // Check CMP operands. One of them should be 0 or 1 and the other should be
17088   // a SetCC or extended from it.
17089   SDValue Op1 = Cmp.getOperand(0);
17090   SDValue Op2 = Cmp.getOperand(1);
17091
17092   SDValue SetCC;
17093   const ConstantSDNode* C = 0;
17094   bool needOppositeCond = (CC == X86::COND_E);
17095   bool checkAgainstTrue = false; // Is it a comparison against 1?
17096
17097   if ((C = dyn_cast<ConstantSDNode>(Op1)))
17098     SetCC = Op2;
17099   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
17100     SetCC = Op1;
17101   else // Quit if neither operand is a constant.
17102     return SDValue();
17103
17104   if (C->getZExtValue() == 1) {
17105     needOppositeCond = !needOppositeCond;
17106     checkAgainstTrue = true;
17107   } else if (C->getZExtValue() != 0)
17108     // Quit if the constant is neither 0 nor 1.
17109     return SDValue();
17110
17111   bool truncatedToBoolWithAnd = false;
17112   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
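  // e.g. (illustrative) the loop below walks chains such as
  //   (and (trunc (zext (X86ISD::SETCC ...))), 1)
  // down to the underlying SETCC, remembering whether an 'and $x, 1' was
  // seen (truncatedToBoolWithAnd).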
17113 while (SetCC.getOpcode() == ISD::ZERO_EXTEND || 17114 SetCC.getOpcode() == ISD::TRUNCATE || 17115 SetCC.getOpcode() == ISD::AND) { 17116 if (SetCC.getOpcode() == ISD::AND) { 17117 int OpIdx = -1; 17118 ConstantSDNode *CS; 17119 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) && 17120 CS->getZExtValue() == 1) 17121 OpIdx = 1; 17122 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) && 17123 CS->getZExtValue() == 1) 17124 OpIdx = 0; 17125 if (OpIdx == -1) 17126 break; 17127 SetCC = SetCC.getOperand(OpIdx); 17128 truncatedToBoolWithAnd = true; 17129 } else 17130 SetCC = SetCC.getOperand(0); 17131 } 17132 17133 switch (SetCC.getOpcode()) { 17134 case X86ISD::SETCC_CARRY: 17135 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to 17136 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, 17137 // i.e. it's a comparison against true but the result of SETCC_CARRY is not 17138 // truncated to i1 using 'and'. 17139 if (checkAgainstTrue && !truncatedToBoolWithAnd) 17140 break; 17141 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && 17142 "Invalid use of SETCC_CARRY!"); 17143 // FALL THROUGH 17144 case X86ISD::SETCC: 17145 // Set the condition code or opposite one if necessary. 17146 CC = X86::CondCode(SetCC.getConstantOperandVal(0)); 17147 if (needOppositeCond) 17148 CC = X86::GetOppositeBranchCondition(CC); 17149 return SetCC.getOperand(1); 17150 case X86ISD::CMOV: { 17151 // Check whether false/true value has canonical one, i.e. 0 or 1. 17152 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); 17153 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); 17154 // Quit if true value is not a constant. 17155 if (!TVal) 17156 return SDValue(); 17157 // Quit if false value is not a constant. 17158 if (!FVal) { 17159 SDValue Op = SetCC.getOperand(0); 17160 // Skip 'zext' or 'trunc' node. 17161 if (Op.getOpcode() == ISD::ZERO_EXTEND || 17162 Op.getOpcode() == ISD::TRUNCATE) 17163 Op = Op.getOperand(0); 17164 // A special case for rdrand/rdseed, where 0 is set if false cond is 17165 // found. 17166 if ((Op.getOpcode() != X86ISD::RDRAND && 17167 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) 17168 return SDValue(); 17169 } 17170 // Quit if false value is not the constant 0 or 1. 17171 bool FValIsFalse = true; 17172 if (FVal && FVal->getZExtValue() != 0) { 17173 if (FVal->getZExtValue() != 1) 17174 return SDValue(); 17175 // If FVal is 1, opposite cond is needed. 17176 needOppositeCond = !needOppositeCond; 17177 FValIsFalse = false; 17178 } 17179 // Quit if TVal is not the constant opposite of FVal. 17180 if (FValIsFalse && TVal->getZExtValue() != 1) 17181 return SDValue(); 17182 if (!FValIsFalse && TVal->getZExtValue() != 0) 17183 return SDValue(); 17184 CC = X86::CondCode(SetCC.getConstantOperandVal(2)); 17185 if (needOppositeCond) 17186 CC = X86::GetOppositeBranchCondition(CC); 17187 return SetCC.getOperand(3); 17188 } 17189 } 17190 17191 return SDValue(); 17192} 17193 17194/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 17195static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 17196 TargetLowering::DAGCombinerInfo &DCI, 17197 const X86Subtarget *Subtarget) { 17198 SDLoc DL(N); 17199 17200 // If the flag operand isn't dead, don't touch this CMOV. 
17201 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 17202 return SDValue(); 17203 17204 SDValue FalseOp = N->getOperand(0); 17205 SDValue TrueOp = N->getOperand(1); 17206 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 17207 SDValue Cond = N->getOperand(3); 17208 17209 if (CC == X86::COND_E || CC == X86::COND_NE) { 17210 switch (Cond.getOpcode()) { 17211 default: break; 17212 case X86ISD::BSR: 17213 case X86ISD::BSF: 17214 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 17215 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 17216 return (CC == X86::COND_E) ? FalseOp : TrueOp; 17217 } 17218 } 17219 17220 SDValue Flags; 17221 17222 Flags = checkBoolTestSetCCCombine(Cond, CC); 17223 if (Flags.getNode() && 17224 // Extra check as FCMOV only supports a subset of X86 cond. 17225 (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { 17226 SDValue Ops[] = { FalseOp, TrueOp, 17227 DAG.getConstant(CC, MVT::i8), Flags }; 17228 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), 17229 Ops, array_lengthof(Ops)); 17230 } 17231 17232 // If this is a select between two integer constants, try to do some 17233 // optimizations. Note that the operands are ordered the opposite of SELECT 17234 // operands. 17235 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 17236 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 17237 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 17238 // larger than FalseC (the false value). 17239 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 17240 CC = X86::GetOppositeBranchCondition(CC); 17241 std::swap(TrueC, FalseC); 17242 std::swap(TrueOp, FalseOp); 17243 } 17244 17245 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 17246 // This is efficient for any integer data type (including i8/i16) and 17247 // shift amount. 17248 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 17249 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 17250 DAG.getConstant(CC, MVT::i8), Cond); 17251 17252 // Zero extend the condition if needed. 17253 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 17254 17255 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 17256 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 17257 DAG.getConstant(ShAmt, MVT::i8)); 17258 if (N->getNumValues() == 2) // Dead flag value? 17259 return DCI.CombineTo(N, Cond, SDValue()); 17260 return Cond; 17261 } 17262 17263 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 17264 // for any integer data type, including i8/i16. 17265 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 17266 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 17267 DAG.getConstant(CC, MVT::i8), Cond); 17268 17269 // Zero extend the condition if needed. 17270 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 17271 FalseC->getValueType(0), Cond); 17272 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 17273 SDValue(FalseC, 0)); 17274 17275 if (N->getNumValues() == 2) // Dead flag value? 17276 return DCI.CombineTo(N, Cond, SDValue()); 17277 return Cond; 17278 } 17279 17280 // Optimize cases that will turn into an LEA instruction. This requires 17281 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
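      // e.g. (illustrative): cond ? 13 : 3 has Diff == 10, which is not a
      // fast multiplier, but cond ? 8 : 3 (Diff == 5) becomes roughly
      //   setcc %al; movzbl %al, %eax; leal (%rax,%rax,4), %eax; addl $3, %eax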
17282 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 17283 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 17284 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 17285 17286 bool isFastMultiplier = false; 17287 if (Diff < 10) { 17288 switch ((unsigned char)Diff) { 17289 default: break; 17290 case 1: // result = add base, cond 17291 case 2: // result = lea base( , cond*2) 17292 case 3: // result = lea base(cond, cond*2) 17293 case 4: // result = lea base( , cond*4) 17294 case 5: // result = lea base(cond, cond*4) 17295 case 8: // result = lea base( , cond*8) 17296 case 9: // result = lea base(cond, cond*8) 17297 isFastMultiplier = true; 17298 break; 17299 } 17300 } 17301 17302 if (isFastMultiplier) { 17303 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 17304 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 17305 DAG.getConstant(CC, MVT::i8), Cond); 17306 // Zero extend the condition if needed. 17307 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 17308 Cond); 17309 // Scale the condition by the difference. 17310 if (Diff != 1) 17311 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 17312 DAG.getConstant(Diff, Cond.getValueType())); 17313 17314 // Add the base if non-zero. 17315 if (FalseC->getAPIntValue() != 0) 17316 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 17317 SDValue(FalseC, 0)); 17318 if (N->getNumValues() == 2) // Dead flag value? 17319 return DCI.CombineTo(N, Cond, SDValue()); 17320 return Cond; 17321 } 17322 } 17323 } 17324 } 17325 17326 // Handle these cases: 17327 // (select (x != c), e, c) -> select (x != c), e, x), 17328 // (select (x == c), c, e) -> select (x == c), x, e) 17329 // where the c is an integer constant, and the "select" is the combination 17330 // of CMOV and CMP. 17331 // 17332 // The rationale for this change is that the conditional-move from a constant 17333 // needs two instructions, however, conditional-move from a register needs 17334 // only one instruction. 17335 // 17336 // CAVEAT: By replacing a constant with a symbolic value, it may obscure 17337 // some instruction-combining opportunities. This opt needs to be 17338 // postponed as late as possible. 17339 // 17340 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { 17341 // the DCI.xxxx conditions are provided to postpone the optimization as 17342 // late as possible. 17343 17344 ConstantSDNode *CmpAgainst = 0; 17345 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && 17346 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && 17347 !isa<ConstantSDNode>(Cond.getOperand(0))) { 17348 17349 if (CC == X86::COND_NE && 17350 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { 17351 CC = X86::GetOppositeBranchCondition(CC); 17352 std::swap(TrueOp, FalseOp); 17353 } 17354 17355 if (CC == X86::COND_E && 17356 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { 17357 SDValue Ops[] = { FalseOp, Cond.getOperand(0), 17358 DAG.getConstant(CC, MVT::i8), Cond }; 17359 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops, 17360 array_lengthof(Ops)); 17361 } 17362 } 17363 } 17364 17365 return SDValue(); 17366} 17367 17368/// PerformMulCombine - Optimize a single multiply with constant into two 17369/// in order to implement it with two cheaper instructions, e.g. 17370/// LEA + SHL, LEA + LEA. 
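/// For example (illustrative): x*40 == (x*5)*8 becomes LEA + SHL, and
/// x*45 == (x*9)*5 becomes LEA + LEA; plain powers of two and the directly
/// encodable 3/5/9 multipliers are left to the normal lowering below.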
17371 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
17372                                  TargetLowering::DAGCombinerInfo &DCI) {
17373   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17374     return SDValue();
17375
17376   EVT VT = N->getValueType(0);
17377   if (VT != MVT::i64)
17378     return SDValue();
17379
17380   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
17381   if (!C)
17382     return SDValue();
17383   uint64_t MulAmt = C->getZExtValue();
17384   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
17385     return SDValue();
17386
17387   uint64_t MulAmt1 = 0;
17388   uint64_t MulAmt2 = 0;
17389   if ((MulAmt % 9) == 0) {
17390     MulAmt1 = 9;
17391     MulAmt2 = MulAmt / 9;
17392   } else if ((MulAmt % 5) == 0) {
17393     MulAmt1 = 5;
17394     MulAmt2 = MulAmt / 5;
17395   } else if ((MulAmt % 3) == 0) {
17396     MulAmt1 = 3;
17397     MulAmt2 = MulAmt / 3;
17398   }
17399   if (MulAmt2 &&
17400       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
17401     SDLoc DL(N);
17402
17403     if (isPowerOf2_64(MulAmt2) &&
17404         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
17405       // If second multiplier is pow2, issue it first. We want the multiply by
17406       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
17407       // is an add.
17408       std::swap(MulAmt1, MulAmt2);
17409
17410     SDValue NewMul;
17411     if (isPowerOf2_64(MulAmt1))
17412       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
17413                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
17414     else
17415       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
17416                            DAG.getConstant(MulAmt1, VT));
17417
17418     if (isPowerOf2_64(MulAmt2))
17419       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
17420                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
17421     else
17422       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
17423                            DAG.getConstant(MulAmt2, VT));
17424
17425     // Do not add new nodes to DAG combiner worklist.
17426     DCI.CombineTo(N, NewMul, false);
17427   }
17428   return SDValue();
17429 }
17430
17431 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
17432   SDValue N0 = N->getOperand(0);
17433   SDValue N1 = N->getOperand(1);
17434   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
17435   EVT VT = N0.getValueType();
17436
17437   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
17438   // since the result of setcc_c is all zeros or all ones.
17439   if (VT.isInteger() && !VT.isVector() &&
17440       N1C && N0.getOpcode() == ISD::AND &&
17441       N0.getOperand(1).getOpcode() == ISD::Constant) {
17442     SDValue N00 = N0.getOperand(0);
17443     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
17444         ((N00.getOpcode() == ISD::ANY_EXTEND ||
17445           N00.getOpcode() == ISD::ZERO_EXTEND) &&
17446          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
17447       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
17448       APInt ShAmt = N1C->getAPIntValue();
17449       Mask = Mask.shl(ShAmt);
17450       if (Mask != 0)
17451         return DAG.getNode(ISD::AND, SDLoc(N), VT,
17452                            N00, DAG.getConstant(Mask, VT));
17453     }
17454   }
17455
17456   // Hardware support for vector shifts is sparse which makes us scalarize the
17457   // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
17458   // shl.
17459   // (shl V, 1) -> add V,V
17460   if (isSplatVector(N1.getNode())) {
17461     assert(N0.getValueType().isVector() && "Invalid vector shift type");
17462     ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
17463     // We shift all of the values by one.
In many cases we do not have 17464 // hardware support for this operation. This is better expressed as an ADD 17465 // of two values. 17466 if (N1C && (1 == N1C->getZExtValue())) { 17467 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); 17468 } 17469 } 17470 17471 return SDValue(); 17472} 17473 17474/// \brief Returns a vector of 0s if the node in input is a vector logical 17475/// shift by a constant amount which is known to be bigger than or equal 17476/// to the vector element size in bits. 17477static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, 17478 const X86Subtarget *Subtarget) { 17479 EVT VT = N->getValueType(0); 17480 17481 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && 17482 (!Subtarget->hasInt256() || 17483 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) 17484 return SDValue(); 17485 17486 SDValue Amt = N->getOperand(1); 17487 SDLoc DL(N); 17488 if (isSplatVector(Amt.getNode())) { 17489 SDValue SclrAmt = Amt->getOperand(0); 17490 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 17491 APInt ShiftAmt = C->getAPIntValue(); 17492 unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); 17493 17494 // SSE2/AVX2 logical shifts always return a vector of 0s 17495 // if the shift amount is bigger than or equal to 17496 // the element size. The constant shift amount will be 17497 // encoded as a 8-bit immediate. 17498 if (ShiftAmt.trunc(8).uge(MaxAmount)) 17499 return getZeroVector(VT, Subtarget, DAG, DL); 17500 } 17501 } 17502 17503 return SDValue(); 17504} 17505 17506/// PerformShiftCombine - Combine shifts. 17507static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 17508 TargetLowering::DAGCombinerInfo &DCI, 17509 const X86Subtarget *Subtarget) { 17510 if (N->getOpcode() == ISD::SHL) { 17511 SDValue V = PerformSHLCombine(N, DAG); 17512 if (V.getNode()) return V; 17513 } 17514 17515 if (N->getOpcode() != ISD::SRA) { 17516 // Try to fold this logical shift into a zero vector. 17517 SDValue V = performShiftToAllZeros(N, DAG, Subtarget); 17518 if (V.getNode()) return V; 17519 } 17520 17521 return SDValue(); 17522} 17523 17524// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 17525// where both setccs reference the same FP CMP, and rewrite for CMPEQSS 17526// and friends. Likewise for OR -> CMPNEQSS. 17527static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 17528 TargetLowering::DAGCombinerInfo &DCI, 17529 const X86Subtarget *Subtarget) { 17530 unsigned opcode; 17531 17532 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 17533 // we're requiring SSE2 for both. 17534 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 17535 SDValue N0 = N->getOperand(0); 17536 SDValue N1 = N->getOperand(1); 17537 SDValue CMP0 = N0->getOperand(1); 17538 SDValue CMP1 = N1->getOperand(1); 17539 SDLoc DL(N); 17540 17541 // The SETCCs should both refer to the same CMP. 
17542     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
17543       return SDValue();
17544
17545     SDValue CMP00 = CMP0->getOperand(0);
17546     SDValue CMP01 = CMP0->getOperand(1);
17547     EVT VT = CMP00.getValueType();
17548
17549     if (VT == MVT::f32 || VT == MVT::f64) {
17550       bool ExpectingFlags = false;
17551       // Check for any users that want flags:
17552       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
17553            !ExpectingFlags && UI != UE; ++UI)
17554         switch (UI->getOpcode()) {
17555         default:
17556         case ISD::BR_CC:
17557         case ISD::BRCOND:
17558         case ISD::SELECT:
17559           ExpectingFlags = true;
17560           break;
17561         case ISD::CopyToReg:
17562         case ISD::SIGN_EXTEND:
17563         case ISD::ZERO_EXTEND:
17564         case ISD::ANY_EXTEND:
17565           break;
17566         }
17567
17568       if (!ExpectingFlags) {
17569         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
17570         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
17571
17572         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
17573           X86::CondCode tmp = cc0;
17574           cc0 = cc1;
17575           cc1 = tmp;
17576         }
17577
17578         if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
17579             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
17580           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
17581           X86ISD::NodeType NTOperator = is64BitFP ?
17582             X86ISD::FSETCCsd : X86ISD::FSETCCss;
17583           // FIXME: need symbolic constants for these magic numbers.
17584           // See X86ATTInstPrinter.cpp:printSSECC().
17585           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
17586           SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, CMP00.getValueType(),
17587                                               CMP00, CMP01,
17588                                               DAG.getConstant(x86cc, MVT::i8));
17589
17590           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
17591
17592           if (is64BitFP && !Subtarget->is64Bit()) {
17593             // On a 32-bit target, we cannot bitcast the 64-bit float to a
17594             // 64-bit integer, since that's not a legal type. Since
17595             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
17596             // bits, but can do this little dance to extract the lowest 32 bits
17597             // and work with those going forward.
17598             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
17599                                            OnesOrZeroesF);
17600             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
17601                                            Vector64);
17602             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
17603                                         Vector32, DAG.getIntPtrConstant(0));
17604             IntVT = MVT::i32;
17605           }
17606
17607           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
17608           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
17609                                       DAG.getConstant(1, IntVT));
17610           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
17611           return OneBitOfTruth;
17612         }
17613       }
17614     }
17615   }
17616   return SDValue();
17617 }
17618
17619 /// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
17620 /// so it can be folded inside ANDNP.
17621 static bool CanFoldXORWithAllOnes(const SDNode *N) {
17622   EVT VT = N->getValueType(0);
17623
17624   // Match direct AllOnes for 128 and 256-bit vectors
17625   if (ISD::isBuildVectorAllOnes(N))
17626     return true;
17627
17628   // Look through a bit convert.
17629   if (N->getOpcode() == ISD::BITCAST)
17630     N = N->getOperand(0).getNode();
17631
17632   // Sometimes the operand may come from an insert_subvector building a 256-bit
17633   // allones vector
17634   if (VT.is256BitVector() &&
17635       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
17636     SDValue V1 = N->getOperand(0);
17637     SDValue V2 = N->getOperand(1);
17638
17639     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
17640         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
17641         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
17642         ISD::isBuildVectorAllOnes(V2.getNode()))
17643       return true;
17644   }
17645
17646   return false;
17647 }
17648
17649 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
17650 // register. In most cases we actually compare or select YMM-sized registers
17651 // and mixing the two types creates horrible code. This method optimizes
17652 // some of the transition sequences.
17653 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
17654                                    TargetLowering::DAGCombinerInfo &DCI,
17655                                    const X86Subtarget *Subtarget) {
17656   EVT VT = N->getValueType(0);
17657   if (!VT.is256BitVector())
17658     return SDValue();
17659
17660   assert((N->getOpcode() == ISD::ANY_EXTEND ||
17661           N->getOpcode() == ISD::ZERO_EXTEND ||
17662           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
17663
17664   SDValue Narrow = N->getOperand(0);
17665   EVT NarrowVT = Narrow->getValueType(0);
17666   if (!NarrowVT.is128BitVector())
17667     return SDValue();
17668
17669   if (Narrow->getOpcode() != ISD::XOR &&
17670       Narrow->getOpcode() != ISD::AND &&
17671       Narrow->getOpcode() != ISD::OR)
17672     return SDValue();
17673
17674   SDValue N0 = Narrow->getOperand(0);
17675   SDValue N1 = Narrow->getOperand(1);
17676   SDLoc DL(Narrow);
17677
17678   // The left side has to be a trunc.
17679   if (N0.getOpcode() != ISD::TRUNCATE)
17680     return SDValue();
17681
17682   // The type of the truncated inputs.
17683   EVT WideVT = N0->getOperand(0)->getValueType(0);
17684   if (WideVT != VT)
17685     return SDValue();
17686
17687   // The right side has to be a 'trunc' or a constant vector.
17688   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
17689   bool RHSConst = (isSplatVector(N1.getNode()) &&
17690                    isa<ConstantSDNode>(N1->getOperand(0)));
17691   if (!RHSTrunc && !RHSConst)
17692     return SDValue();
17693
17694   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17695
17696   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
17697     return SDValue();
17698
17699   // Set N0 and N1 to hold the inputs to the new wide operation.
17700   N0 = N0->getOperand(0);
17701   if (RHSConst) {
17702     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
17703                      N1->getOperand(0));
17704     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
17705     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
17706   } else if (RHSTrunc) {
17707     N1 = N1->getOperand(0);
17708   }
17709
17710   // Generate the wide operation.
17711 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); 17712 unsigned Opcode = N->getOpcode(); 17713 switch (Opcode) { 17714 case ISD::ANY_EXTEND: 17715 return Op; 17716 case ISD::ZERO_EXTEND: { 17717 unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); 17718 APInt Mask = APInt::getAllOnesValue(InBits); 17719 Mask = Mask.zext(VT.getScalarType().getSizeInBits()); 17720 return DAG.getNode(ISD::AND, DL, VT, 17721 Op, DAG.getConstant(Mask, VT)); 17722 } 17723 case ISD::SIGN_EXTEND: 17724 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, 17725 Op, DAG.getValueType(NarrowVT)); 17726 default: 17727 llvm_unreachable("Unexpected opcode"); 17728 } 17729} 17730 17731static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 17732 TargetLowering::DAGCombinerInfo &DCI, 17733 const X86Subtarget *Subtarget) { 17734 EVT VT = N->getValueType(0); 17735 if (DCI.isBeforeLegalizeOps()) 17736 return SDValue(); 17737 17738 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 17739 if (R.getNode()) 17740 return R; 17741 17742 // Create BLSI, BLSR, and BZHI instructions 17743 // BLSI is X & (-X) 17744 // BLSR is X & (X-1) 17745 // BZHI is X & ((1 << Y) - 1) 17746 // BEXTR is ((X >> imm) & (2**size-1)) 17747 if (VT == MVT::i32 || VT == MVT::i64) { 17748 SDValue N0 = N->getOperand(0); 17749 SDValue N1 = N->getOperand(1); 17750 SDLoc DL(N); 17751 17752 if (Subtarget->hasBMI()) { 17753 // Check LHS for neg 17754 if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && 17755 isZero(N0.getOperand(0))) 17756 return DAG.getNode(X86ISD::BLSI, DL, VT, N1); 17757 17758 // Check RHS for neg 17759 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && 17760 isZero(N1.getOperand(0))) 17761 return DAG.getNode(X86ISD::BLSI, DL, VT, N0); 17762 17763 // Check LHS for X-1 17764 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 17765 isAllOnes(N0.getOperand(1))) 17766 return DAG.getNode(X86ISD::BLSR, DL, VT, N1); 17767 17768 // Check RHS for X-1 17769 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 17770 isAllOnes(N1.getOperand(1))) 17771 return DAG.getNode(X86ISD::BLSR, DL, VT, N0); 17772 } 17773 17774 if (Subtarget->hasBMI2()) { 17775 // Check for (and (add (shl 1, Y), -1), X) 17776 if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) { 17777 SDValue N00 = N0.getOperand(0); 17778 if (N00.getOpcode() == ISD::SHL) { 17779 SDValue N001 = N00.getOperand(1); 17780 assert(N001.getValueType() == MVT::i8 && "unexpected type"); 17781 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0)); 17782 if (C && C->getZExtValue() == 1) 17783 return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001); 17784 } 17785 } 17786 17787 // Check for (and X, (add (shl 1, Y), -1)) 17788 if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) { 17789 SDValue N10 = N1.getOperand(0); 17790 if (N10.getOpcode() == ISD::SHL) { 17791 SDValue N101 = N10.getOperand(1); 17792 assert(N101.getValueType() == MVT::i8 && "unexpected type"); 17793 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0)); 17794 if (C && C->getZExtValue() == 1) 17795 return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101); 17796 } 17797 } 17798 } 17799 17800 // Check for BEXTR. 
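    // e.g. (illustrative): (and (srl X, 4), 0xFFF) becomes
    //   (X86ISD::BEXTR X, (4 | (12 << 8)))
    // since the BEXTR control byte layout is: bits 7:0 = start position,
    // bits 15:8 = length of the extracted bit field.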
17801 if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && 17802 (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { 17803 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); 17804 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 17805 if (MaskNode && ShiftNode) { 17806 uint64_t Mask = MaskNode->getZExtValue(); 17807 uint64_t Shift = ShiftNode->getZExtValue(); 17808 if (isMask_64(Mask)) { 17809 uint64_t MaskSize = CountPopulation_64(Mask); 17810 if (Shift + MaskSize <= VT.getSizeInBits()) 17811 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), 17812 DAG.getConstant(Shift | (MaskSize << 8), VT)); 17813 } 17814 } 17815 } // BEXTR 17816 17817 return SDValue(); 17818 } 17819 17820 // Want to form ANDNP nodes: 17821 // 1) In the hopes of then easily combining them with OR and AND nodes 17822 // to form PBLEND/PSIGN. 17823 // 2) To match ANDN packed intrinsics 17824 if (VT != MVT::v2i64 && VT != MVT::v4i64) 17825 return SDValue(); 17826 17827 SDValue N0 = N->getOperand(0); 17828 SDValue N1 = N->getOperand(1); 17829 SDLoc DL(N); 17830 17831 // Check LHS for vnot 17832 if (N0.getOpcode() == ISD::XOR && 17833 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 17834 CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) 17835 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); 17836 17837 // Check RHS for vnot 17838 if (N1.getOpcode() == ISD::XOR && 17839 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 17840 CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) 17841 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); 17842 17843 return SDValue(); 17844} 17845 17846static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 17847 TargetLowering::DAGCombinerInfo &DCI, 17848 const X86Subtarget *Subtarget) { 17849 EVT VT = N->getValueType(0); 17850 if (DCI.isBeforeLegalizeOps()) 17851 return SDValue(); 17852 17853 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 17854 if (R.getNode()) 17855 return R; 17856 17857 SDValue N0 = N->getOperand(0); 17858 SDValue N1 = N->getOperand(1); 17859 17860 // look for psign/blend 17861 if (VT == MVT::v2i64 || VT == MVT::v4i64) { 17862 if (!Subtarget->hasSSSE3() || 17863 (VT == MVT::v4i64 && !Subtarget->hasInt256())) 17864 return SDValue(); 17865 17866 // Canonicalize pandn to RHS 17867 if (N0.getOpcode() == X86ISD::ANDNP) 17868 std::swap(N0, N1); 17869 // or (and (m, y), (pandn m, x)) 17870 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { 17871 SDValue Mask = N1.getOperand(0); 17872 SDValue X = N1.getOperand(1); 17873 SDValue Y; 17874 if (N0.getOperand(0) == Mask) 17875 Y = N0.getOperand(1); 17876 if (N0.getOperand(1) == Mask) 17877 Y = N0.getOperand(0); 17878 17879 // Check to see if the mask appeared in both the AND and ANDNP and 17880 if (!Y.getNode()) 17881 return SDValue(); 17882 17883 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 17884 // Look through mask bitcast. 17885 if (Mask.getOpcode() == ISD::BITCAST) 17886 Mask = Mask.getOperand(0); 17887 if (X.getOpcode() == ISD::BITCAST) 17888 X = X.getOperand(0); 17889 if (Y.getOpcode() == ISD::BITCAST) 17890 Y = Y.getOperand(0); 17891 17892 EVT MaskVT = Mask.getValueType(); 17893 17894 // Validate that the Mask operand is a vector sra node. 
17895   // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
17896   // there is no psrai.b
17897   unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
17898   unsigned SraAmt = ~0;
17899   if (Mask.getOpcode() == ISD::SRA) {
17900     SDValue Amt = Mask.getOperand(1);
17901     if (isSplatVector(Amt.getNode())) {
17902       SDValue SclrAmt = Amt->getOperand(0);
17903       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
17904         SraAmt = C->getZExtValue();
17905     }
17906   } else if (Mask.getOpcode() == X86ISD::VSRAI) {
17907     SDValue SraC = Mask.getOperand(1);
17908     SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
17909   }
17910   if ((SraAmt + 1) != EltBits)
17911     return SDValue();
17912
17913   SDLoc DL(N);
17914
17915   // Now we know we at least have a pblendvb with the mask val. See if
17916   // we can form a psignb/w/d.
17917   // psign = x.type == y.type == mask.type && y = sub(0, x);
17918   if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
17919       ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
17920       X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
17921     assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
17922            "Unsupported VT for PSIGN");
17923     Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
17924     return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
17925   }
17926   // PBLENDVB is only available on SSE 4.1.
17927   if (!Subtarget->hasSSE41())
17928     return SDValue();
17929
17930   EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
17931
17932   X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
17933   Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
17934   Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
17935   Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
17936   return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
17937     }
17938   }
17939
17940   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
17941     return SDValue();
17942
17943   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
17944   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
17945     std::swap(N0, N1);
17946   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
17947     return SDValue();
17948   if (!N0.hasOneUse() || !N1.hasOneUse())
17949     return SDValue();
17950
17951   SDValue ShAmt0 = N0.getOperand(1);
17952   if (ShAmt0.getValueType() != MVT::i8)
17953     return SDValue();
17954   SDValue ShAmt1 = N1.getOperand(1);
17955   if (ShAmt1.getValueType() != MVT::i8)
17956     return SDValue();
17957   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
17958     ShAmt0 = ShAmt0.getOperand(0);
17959   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
17960     ShAmt1 = ShAmt1.getOperand(0);
17961
17962   SDLoc DL(N);
17963   unsigned Opc = X86ISD::SHLD;
17964   SDValue Op0 = N0.getOperand(0);
17965   SDValue Op1 = N1.getOperand(0);
17966   if (ShAmt0.getOpcode() == ISD::SUB) {
17967     Opc = X86ISD::SHRD;
17968     std::swap(Op0, Op1);
17969     std::swap(ShAmt0, ShAmt1);
17970   }
17971
17972   unsigned Bits = VT.getSizeInBits();
17973   if (ShAmt1.getOpcode() == ISD::SUB) {
17974     SDValue Sum = ShAmt1.getOperand(0);
17975     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
17976       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
17977       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
17978         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
17979       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
17980         return DAG.getNode(Opc, DL, VT,
17981                            Op0, Op1,
17982                            DAG.getNode(ISD::TRUNCATE, DL,
17983                                        MVT::i8, ShAmt0));
17984     }
17985   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
17986     ConstantSDNode *ShAmt0C =
dyn_cast<ConstantSDNode>(ShAmt0); 17987 if (ShAmt0C && 17988 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 17989 return DAG.getNode(Opc, DL, VT, 17990 N0.getOperand(0), N1.getOperand(0), 17991 DAG.getNode(ISD::TRUNCATE, DL, 17992 MVT::i8, ShAmt0)); 17993 } 17994 17995 return SDValue(); 17996} 17997 17998// Generate NEG and CMOV for integer abs. 17999static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { 18000 EVT VT = N->getValueType(0); 18001 18002 // Since X86 does not have CMOV for 8-bit integer, we don't convert 18003 // 8-bit integer abs to NEG and CMOV. 18004 if (VT.isInteger() && VT.getSizeInBits() == 8) 18005 return SDValue(); 18006 18007 SDValue N0 = N->getOperand(0); 18008 SDValue N1 = N->getOperand(1); 18009 SDLoc DL(N); 18010 18011 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 18012 // and change it to SUB and CMOV. 18013 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 18014 N0.getOpcode() == ISD::ADD && 18015 N0.getOperand(1) == N1 && 18016 N1.getOpcode() == ISD::SRA && 18017 N1.getOperand(0) == N0.getOperand(0)) 18018 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 18019 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { 18020 // Generate SUB & CMOV. 18021 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), 18022 DAG.getConstant(0, VT), N0.getOperand(0)); 18023 18024 SDValue Ops[] = { N0.getOperand(0), Neg, 18025 DAG.getConstant(X86::COND_GE, MVT::i8), 18026 SDValue(Neg.getNode(), 1) }; 18027 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), 18028 Ops, array_lengthof(Ops)); 18029 } 18030 return SDValue(); 18031} 18032 18033// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes 18034static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, 18035 TargetLowering::DAGCombinerInfo &DCI, 18036 const X86Subtarget *Subtarget) { 18037 EVT VT = N->getValueType(0); 18038 if (DCI.isBeforeLegalizeOps()) 18039 return SDValue(); 18040 18041 if (Subtarget->hasCMov()) { 18042 SDValue RV = performIntegerAbsCombine(N, DAG); 18043 if (RV.getNode()) 18044 return RV; 18045 } 18046 18047 // Try forming BMI if it is available. 18048 if (!Subtarget->hasBMI()) 18049 return SDValue(); 18050 18051 if (VT != MVT::i32 && VT != MVT::i64) 18052 return SDValue(); 18053 18054 assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions"); 18055 18056 // Create BLSMSK instructions by finding X ^ (X-1) 18057 SDValue N0 = N->getOperand(0); 18058 SDValue N1 = N->getOperand(1); 18059 SDLoc DL(N); 18060 18061 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 18062 isAllOnes(N0.getOperand(1))) 18063 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); 18064 18065 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 18066 isAllOnes(N1.getOperand(1))) 18067 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); 18068 18069 return SDValue(); 18070} 18071 18072/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. 18073static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, 18074 TargetLowering::DAGCombinerInfo &DCI, 18075 const X86Subtarget *Subtarget) { 18076 LoadSDNode *Ld = cast<LoadSDNode>(N); 18077 EVT RegVT = Ld->getValueType(0); 18078 EVT MemVT = Ld->getMemoryVT(); 18079 SDLoc dl(Ld); 18080 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18081 unsigned RegSz = RegVT.getSizeInBits(); 18082 18083 // On Sandybridge unaligned 256bit loads are inefficient. 
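  // (Illustrative) e.g. an unaligned v8f32 load is split below into two
  // 16-byte v4f32 loads glued by a TokenFactor and recombined with
  // Insert128BitVector, which matches VINSERTF128.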
18084 ISD::LoadExtType Ext = Ld->getExtensionType();
18085 unsigned Alignment = Ld->getAlignment();
18086 bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
18087 if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
18088 !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
18089 unsigned NumElems = RegVT.getVectorNumElements();
18090 if (NumElems < 2)
18091 return SDValue();
18092
18093 SDValue Ptr = Ld->getBasePtr();
18094 SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
18095
18096 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18097 NumElems/2);
18098 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
18099 Ld->getPointerInfo(), Ld->isVolatile(),
18100 Ld->isNonTemporal(), Ld->isInvariant(),
18101 Alignment);
18102 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18103 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
18104 Ld->getPointerInfo(), Ld->isVolatile(),
18105 Ld->isNonTemporal(), Ld->isInvariant(),
18106 std::min(16U, Alignment));
18107 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18108 Load1.getValue(1),
18109 Load2.getValue(1));
18110
18111 SDValue NewVec = DAG.getUNDEF(RegVT);
18112 NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
18113 NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
18114 return DCI.CombineTo(N, NewVec, TF, true);
18115 }
18116
18117 // If this is a vector EXT Load then attempt to optimize it using a
18118 // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
18119 // expansion is still better than scalar code.
18120 // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
18121 // emit a shuffle and an arithmetic shift.
18122 // TODO: It is possible to support ZExt by zeroing the undef values
18123 // during the shuffle phase or after the shuffle.
18124 if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
18125 (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
18126 assert(MemVT != RegVT && "Cannot extend to the same type");
18127 assert(MemVT.isVector() && "Must load a vector from memory");
18128
18129 unsigned NumElems = RegVT.getVectorNumElements();
18130 unsigned MemSz = MemVT.getSizeInBits();
18131 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18132
18133 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
18134 return SDValue();
18135
18136 // All sizes must be a power of two.
18137 if (!isPowerOf2_32(RegSz * MemSz * NumElems))
18138 return SDValue();
18139
18140 // Attempt to load the original value using scalar loads.
18141 // Find the largest scalar type that divides the total loaded size.
18142 MVT SclrLoadTy = MVT::i8;
18143 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
18144 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
18145 MVT Tp = (MVT::SimpleValueType)tp;
18146 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18147 SclrLoadTy = Tp;
18148 }
18149 }
18150
18151 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18152 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18153 (64 <= MemSz))
18154 SclrLoadTy = MVT::f64;
18155
18156 // Calculate the number of scalar loads that we need to perform
18157 // in order to load our vector from memory.
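  // Worked example: for an <8 x i8> value extended into an <8 x i16>
  // register, MemSz is 64, so SclrLoadTy is i64 (or f64 via the bitcast
  // trick above on 32-bit targets) and NumLoads is 64/64 = 1: the whole
  // source vector is fetched with a single scalar load.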
18158 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18159 if (Ext == ISD::SEXTLOAD && NumLoads > 1)
18160 return SDValue();
18161
18162 unsigned LoadRegSize = RegSz;
18163 if (Ext == ISD::SEXTLOAD && RegSz == 256)
18164 LoadRegSize /= 2;
18165
18166 // Represent our vector as a sequence of elements which are the
18167 // largest scalar that we can load.
18168 EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
18169 LoadRegSize/SclrLoadTy.getSizeInBits());
18170
18171 // Represent the data using the same element type that is stored in
18172 // memory. In practice, we 'widen' MemVT.
18173 EVT WideVecVT =
18174 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18175 LoadRegSize/MemVT.getScalarType().getSizeInBits());
18176
18177 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18178 "Invalid vector type");
18179
18180 // We can't shuffle using an illegal type.
18181 if (!TLI.isTypeLegal(WideVecVT))
18182 return SDValue();
18183
18184 SmallVector<SDValue, 8> Chains;
18185 SDValue Ptr = Ld->getBasePtr();
18186 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
18187 TLI.getPointerTy());
18188 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18189
18190 for (unsigned i = 0; i < NumLoads; ++i) {
18191 // Perform a single load.
18192 SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
18193 Ptr, Ld->getPointerInfo(),
18194 Ld->isVolatile(), Ld->isNonTemporal(),
18195 Ld->isInvariant(), Ld->getAlignment());
18196 Chains.push_back(ScalarLoad.getValue(1));
18197 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18198 // another round of DAGCombining.
18199 if (i == 0)
18200 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18201 else
18202 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18203 ScalarLoad, DAG.getIntPtrConstant(i));
18204
18205 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18206 }
18207
18208 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
18209 Chains.size());
18210
18211 // Bitcast the loaded value to a vector of the original element type, in
18212 // the size of the target vector type.
18213 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
18214 unsigned SizeRatio = RegSz/MemSz;
18215
18216 if (Ext == ISD::SEXTLOAD) {
18217 // If we have SSE4.1 we can directly emit a VSEXT node.
18218 if (Subtarget->hasSSE41()) {
18219 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
18220 return DCI.CombineTo(N, Sext, TF, true);
18221 }
18222
18223 // Otherwise we'll shuffle the small elements into the high bits of the
18224 // larger type and perform an arithmetic shift. If the shift is not legal
18225 // it's better to scalarize.
18226 if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
18227 return SDValue();
18228
18229 // Redistribute the loaded elements into the different locations.
18230 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
18231 for (unsigned i = 0; i != NumElems; ++i)
18232 ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
18233
18234 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18235 DAG.getUNDEF(WideVecVT),
18236 &ShuffleVec[0]);
18237
18238 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
18239
18240 // Build the arithmetic shift.
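    // E.g. for a v4i16 -> v4i32 sextload without SSE4.1, each i16 was
    // shuffled into the high half of its i32 lane, so Amt is 32 - 16 = 16
    // and the shift below moves the value back down while replicating the
    // sign bit: [ x << 16 ] --sra 16--> [ sign-extended x ].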
18241 unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - 18242 MemVT.getVectorElementType().getSizeInBits(); 18243 Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, 18244 DAG.getConstant(Amt, RegVT)); 18245 18246 return DCI.CombineTo(N, Shuff, TF, true); 18247 } 18248 18249 // Redistribute the loaded elements into the different locations. 18250 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 18251 for (unsigned i = 0; i != NumElems; ++i) 18252 ShuffleVec[i*SizeRatio] = i; 18253 18254 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 18255 DAG.getUNDEF(WideVecVT), 18256 &ShuffleVec[0]); 18257 18258 // Bitcast to the requested type. 18259 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 18260 // Replace the original load with the new sequence 18261 // and return the new chain. 18262 return DCI.CombineTo(N, Shuff, TF, true); 18263 } 18264 18265 return SDValue(); 18266} 18267 18268/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 18269static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 18270 const X86Subtarget *Subtarget) { 18271 StoreSDNode *St = cast<StoreSDNode>(N); 18272 EVT VT = St->getValue().getValueType(); 18273 EVT StVT = St->getMemoryVT(); 18274 SDLoc dl(St); 18275 SDValue StoredVal = St->getOperand(1); 18276 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18277 18278 // If we are saving a concatenation of two XMM registers, perform two stores. 18279 // On Sandy Bridge, 256-bit memory operations are executed by two 18280 // 128-bit ports. However, on Haswell it is better to issue a single 256-bit 18281 // memory operation. 18282 unsigned Alignment = St->getAlignment(); 18283 bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; 18284 if (VT.is256BitVector() && !Subtarget->hasInt256() && 18285 StVT == VT && !IsAligned) { 18286 unsigned NumElems = VT.getVectorNumElements(); 18287 if (NumElems < 2) 18288 return SDValue(); 18289 18290 SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); 18291 SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); 18292 18293 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); 18294 SDValue Ptr0 = St->getBasePtr(); 18295 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); 18296 18297 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 18298 St->getPointerInfo(), St->isVolatile(), 18299 St->isNonTemporal(), Alignment); 18300 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 18301 St->getPointerInfo(), St->isVolatile(), 18302 St->isNonTemporal(), 18303 std::min(16U, Alignment)); 18304 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 18305 } 18306 18307 // Optimize trunc store (of multiple scalars) to shuffle and store. 18308 // First, pack all of the elements in one place. Next, store to memory 18309 // in fewer chunks. 18310 if (St->isTruncatingStore() && VT.isVector()) { 18311 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18312 unsigned NumElems = VT.getVectorNumElements(); 18313 assert(StVT != VT && "Cannot truncate to the same type"); 18314 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 18315 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 18316 18317 // From, To sizes and ElemCount must be pow of two 18318 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 18319 // We are going to use the original vector elt for storing. 18320 // Accumulated smaller vector elements must be a multiple of the store size. 
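    // Worked example: truncating <8 x i16> to <8 x i8> in memory gives
    // FromSz = 16, ToSz = 8, SizeRatio = 2. The shuffle below packs the
    // eight live bytes into the low half of a v16i8, and (on a 64-bit
    // target) a single i64-sized store then replaces eight i8 stores.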
18321 if (0 != (NumElems * FromSz) % ToSz) return SDValue(); 18322 18323 unsigned SizeRatio = FromSz / ToSz; 18324 18325 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 18326 18327 // Create a type on which we perform the shuffle 18328 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 18329 StVT.getScalarType(), NumElems*SizeRatio); 18330 18331 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 18332 18333 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); 18334 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 18335 for (unsigned i = 0; i != NumElems; ++i) 18336 ShuffleVec[i] = i * SizeRatio; 18337 18338 // Can't shuffle using an illegal type. 18339 if (!TLI.isTypeLegal(WideVecVT)) 18340 return SDValue(); 18341 18342 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 18343 DAG.getUNDEF(WideVecVT), 18344 &ShuffleVec[0]); 18345 // At this point all of the data is stored at the bottom of the 18346 // register. We now need to save it to mem. 18347 18348 // Find the largest store unit 18349 MVT StoreType = MVT::i8; 18350 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 18351 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 18352 MVT Tp = (MVT::SimpleValueType)tp; 18353 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) 18354 StoreType = Tp; 18355 } 18356 18357 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 18358 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && 18359 (64 <= NumElems * ToSz)) 18360 StoreType = MVT::f64; 18361 18362 // Bitcast the original vector into a vector of store-size units 18363 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 18364 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); 18365 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 18366 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); 18367 SmallVector<SDValue, 8> Chains; 18368 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, 18369 TLI.getPointerTy()); 18370 SDValue Ptr = St->getBasePtr(); 18371 18372 // Perform one or more big stores into memory. 18373 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { 18374 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 18375 StoreType, ShuffWide, 18376 DAG.getIntPtrConstant(i)); 18377 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 18378 St->getPointerInfo(), St->isVolatile(), 18379 St->isNonTemporal(), St->getAlignment()); 18380 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 18381 Chains.push_back(Ch); 18382 } 18383 18384 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 18385 Chains.size()); 18386 } 18387 18388 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 18389 // the FP state in cases where an emms may be missing. 18390 // A preferable solution to the general problem is to figure out the right 18391 // places to insert EMMS. This qualifies as a quick hack. 18392 18393 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 18394 if (VT.getSizeInBits() != 64) 18395 return SDValue(); 18396 18397 const Function *F = DAG.getMachineFunction().getFunction(); 18398 bool NoImplicitFloatOps = F->getAttributes(). 
18399 hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); 18400 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps 18401 && Subtarget->hasSSE2(); 18402 if ((VT.isVector() || 18403 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 18404 isa<LoadSDNode>(St->getValue()) && 18405 !cast<LoadSDNode>(St->getValue())->isVolatile() && 18406 St->getChain().hasOneUse() && !St->isVolatile()) { 18407 SDNode* LdVal = St->getValue().getNode(); 18408 LoadSDNode *Ld = 0; 18409 int TokenFactorIndex = -1; 18410 SmallVector<SDValue, 8> Ops; 18411 SDNode* ChainVal = St->getChain().getNode(); 18412 // Must be a store of a load. We currently handle two cases: the load 18413 // is a direct child, and it's under an intervening TokenFactor. It is 18414 // possible to dig deeper under nested TokenFactors. 18415 if (ChainVal == LdVal) 18416 Ld = cast<LoadSDNode>(St->getChain()); 18417 else if (St->getValue().hasOneUse() && 18418 ChainVal->getOpcode() == ISD::TokenFactor) { 18419 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { 18420 if (ChainVal->getOperand(i).getNode() == LdVal) { 18421 TokenFactorIndex = i; 18422 Ld = cast<LoadSDNode>(St->getValue()); 18423 } else 18424 Ops.push_back(ChainVal->getOperand(i)); 18425 } 18426 } 18427 18428 if (!Ld || !ISD::isNormalLoad(Ld)) 18429 return SDValue(); 18430 18431 // If this is not the MMX case, i.e. we are just turning i64 load/store 18432 // into f64 load/store, avoid the transformation if there are multiple 18433 // uses of the loaded value. 18434 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 18435 return SDValue(); 18436 18437 SDLoc LdDL(Ld); 18438 SDLoc StDL(N); 18439 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 18440 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 18441 // pair instead. 18442 if (Subtarget->is64Bit() || F64IsLegal) { 18443 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 18444 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 18445 Ld->getPointerInfo(), Ld->isVolatile(), 18446 Ld->isNonTemporal(), Ld->isInvariant(), 18447 Ld->getAlignment()); 18448 SDValue NewChain = NewLd.getValue(1); 18449 if (TokenFactorIndex != -1) { 18450 Ops.push_back(NewChain); 18451 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 18452 Ops.size()); 18453 } 18454 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 18455 St->getPointerInfo(), 18456 St->isVolatile(), St->isNonTemporal(), 18457 St->getAlignment()); 18458 } 18459 18460 // Otherwise, lower to two pairs of 32-bit loads / stores. 
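    // The result has roughly this shape for an i64 copy:
    //   movl   (%src), %eax
    //   movl  4(%src), %ecx
    //   movl  %eax,  (%dst)
    //   movl  %ecx, 4(%dst)
    // with pointer info and alignment adjusted by 4 for the high half.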
18461 SDValue LoAddr = Ld->getBasePtr(); 18462 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 18463 DAG.getConstant(4, MVT::i32)); 18464 18465 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 18466 Ld->getPointerInfo(), 18467 Ld->isVolatile(), Ld->isNonTemporal(), 18468 Ld->isInvariant(), Ld->getAlignment()); 18469 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 18470 Ld->getPointerInfo().getWithOffset(4), 18471 Ld->isVolatile(), Ld->isNonTemporal(), 18472 Ld->isInvariant(), 18473 MinAlign(Ld->getAlignment(), 4)); 18474 18475 SDValue NewChain = LoLd.getValue(1); 18476 if (TokenFactorIndex != -1) { 18477 Ops.push_back(LoLd); 18478 Ops.push_back(HiLd); 18479 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 18480 Ops.size()); 18481 } 18482 18483 LoAddr = St->getBasePtr(); 18484 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 18485 DAG.getConstant(4, MVT::i32)); 18486 18487 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 18488 St->getPointerInfo(), 18489 St->isVolatile(), St->isNonTemporal(), 18490 St->getAlignment()); 18491 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 18492 St->getPointerInfo().getWithOffset(4), 18493 St->isVolatile(), 18494 St->isNonTemporal(), 18495 MinAlign(St->getAlignment(), 4)); 18496 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 18497 } 18498 return SDValue(); 18499} 18500 18501/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" 18502/// and return the operands for the horizontal operation in LHS and RHS. A 18503/// horizontal operation performs the binary operation on successive elements 18504/// of its first operand, then on successive elements of its second operand, 18505/// returning the resulting values in a vector. For example, if 18506/// A = < float a0, float a1, float a2, float a3 > 18507/// and 18508/// B = < float b0, float b1, float b2, float b3 > 18509/// then the result of doing a horizontal operation on A and B is 18510/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 18511/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form 18512/// A horizontal-op B, for some already available A and B, and if so then LHS is 18513/// set to A, RHS to B, and the routine returns 'true'. 18514/// Note that the binary operation should have the property that if one of the 18515/// operands is UNDEF then the result is UNDEF. 18516static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { 18517 // Look for the following pattern: if 18518 // A = < float a0, float a1, float a2, float a3 > 18519 // B = < float b0, float b1, float b2, float b3 > 18520 // and 18521 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> 18522 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> 18523 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > 18524 // which is A horizontal-op B. 18525 18526 // At least one of the operands should be a vector shuffle. 18527 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && 18528 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) 18529 return false; 18530 18531 MVT VT = LHS.getSimpleValueType(); 18532 18533 assert((VT.is128BitVector() || VT.is256BitVector()) && 18534 "Unsupported vector type for horizontal add/sub"); 18535 18536 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to 18537 // operate independently on 128-bit lanes. 
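  // E.g. for v8f32, A horizontal-op B is computed lane by lane:
  //   < a0 op a1, a2 op a3, b0 op b1, b2 op b3 |
  //     a4 op a5, a6 op a7, b4 op b5, b6 op b7 >
  // so the index checks below are repeated for each 128-bit lane.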
18538 unsigned NumElts = VT.getVectorNumElements(); 18539 unsigned NumLanes = VT.getSizeInBits()/128; 18540 unsigned NumLaneElts = NumElts / NumLanes; 18541 assert((NumLaneElts % 2 == 0) && 18542 "Vector type should have an even number of elements in each lane"); 18543 unsigned HalfLaneElts = NumLaneElts/2; 18544 18545 // View LHS in the form 18546 // LHS = VECTOR_SHUFFLE A, B, LMask 18547 // If LHS is not a shuffle then pretend it is the shuffle 18548 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> 18549 // NOTE: in what follows a default initialized SDValue represents an UNDEF of 18550 // type VT. 18551 SDValue A, B; 18552 SmallVector<int, 16> LMask(NumElts); 18553 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 18554 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) 18555 A = LHS.getOperand(0); 18556 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) 18557 B = LHS.getOperand(1); 18558 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); 18559 std::copy(Mask.begin(), Mask.end(), LMask.begin()); 18560 } else { 18561 if (LHS.getOpcode() != ISD::UNDEF) 18562 A = LHS; 18563 for (unsigned i = 0; i != NumElts; ++i) 18564 LMask[i] = i; 18565 } 18566 18567 // Likewise, view RHS in the form 18568 // RHS = VECTOR_SHUFFLE C, D, RMask 18569 SDValue C, D; 18570 SmallVector<int, 16> RMask(NumElts); 18571 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 18572 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) 18573 C = RHS.getOperand(0); 18574 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) 18575 D = RHS.getOperand(1); 18576 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); 18577 std::copy(Mask.begin(), Mask.end(), RMask.begin()); 18578 } else { 18579 if (RHS.getOpcode() != ISD::UNDEF) 18580 C = RHS; 18581 for (unsigned i = 0; i != NumElts; ++i) 18582 RMask[i] = i; 18583 } 18584 18585 // Check that the shuffles are both shuffling the same vectors. 18586 if (!(A == C && B == D) && !(A == D && B == C)) 18587 return false; 18588 18589 // If everything is UNDEF then bail out: it would be better to fold to UNDEF. 18590 if (!A.getNode() && !B.getNode()) 18591 return false; 18592 18593 // If A and B occur in reverse order in RHS, then "swap" them (which means 18594 // rewriting the mask). 18595 if (A != C) 18596 CommuteVectorShuffleMask(RMask, NumElts); 18597 18598 // At this point LHS and RHS are equivalent to 18599 // LHS = VECTOR_SHUFFLE A, B, LMask 18600 // RHS = VECTOR_SHUFFLE A, B, RMask 18601 // Check that the masks correspond to performing a horizontal operation. 18602 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 18603 for (unsigned i = 0; i != NumLaneElts; ++i) { 18604 int LIdx = LMask[i+l], RIdx = RMask[i+l]; 18605 18606 // Ignore any UNDEF components. 18607 if (LIdx < 0 || RIdx < 0 || 18608 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || 18609 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) 18610 continue; 18611 18612 // Check that successive elements are being operated on. If not, this is 18613 // not a horizontal operation. 18614 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs 18615 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; 18616 if (!(LIdx == Index && RIdx == Index + 1) && 18617 !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) 18618 return false; 18619 } 18620 } 18621 18622 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 18623 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 
18624 return true; 18625} 18626 18627/// PerformFADDCombine - Do target-specific dag combines on floating point adds. 18628static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, 18629 const X86Subtarget *Subtarget) { 18630 EVT VT = N->getValueType(0); 18631 SDValue LHS = N->getOperand(0); 18632 SDValue RHS = N->getOperand(1); 18633 18634 // Try to synthesize horizontal adds from adds of shuffles. 18635 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 18636 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 18637 isHorizontalBinOp(LHS, RHS, true)) 18638 return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS); 18639 return SDValue(); 18640} 18641 18642/// PerformFSUBCombine - Do target-specific dag combines on floating point subs. 18643static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, 18644 const X86Subtarget *Subtarget) { 18645 EVT VT = N->getValueType(0); 18646 SDValue LHS = N->getOperand(0); 18647 SDValue RHS = N->getOperand(1); 18648 18649 // Try to synthesize horizontal subs from subs of shuffles. 18650 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 18651 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 18652 isHorizontalBinOp(LHS, RHS, false)) 18653 return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); 18654 return SDValue(); 18655} 18656 18657/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 18658/// X86ISD::FXOR nodes. 18659static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 18660 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 18661 // F[X]OR(0.0, x) -> x 18662 // F[X]OR(x, 0.0) -> x 18663 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 18664 if (C->getValueAPF().isPosZero()) 18665 return N->getOperand(1); 18666 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 18667 if (C->getValueAPF().isPosZero()) 18668 return N->getOperand(0); 18669 return SDValue(); 18670} 18671 18672/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and 18673/// X86ISD::FMAX nodes. 18674static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { 18675 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); 18676 18677 // Only perform optimizations if UnsafeMath is used. 18678 if (!DAG.getTarget().Options.UnsafeFPMath) 18679 return SDValue(); 18680 18681 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes 18682 // into FMINC and FMAXC, which are Commutative operations. 18683 unsigned NewOp = 0; 18684 switch (N->getOpcode()) { 18685 default: llvm_unreachable("unknown opcode"); 18686 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; 18687 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; 18688 } 18689 18690 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), 18691 N->getOperand(0), N->getOperand(1)); 18692} 18693 18694/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 
18695static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
18696 // FAND(0.0, x) -> 0.0
18697 // FAND(x, 0.0) -> 0.0
18698 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
18699 if (C->getValueAPF().isPosZero())
18700 return N->getOperand(0);
18701 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
18702 if (C->getValueAPF().isPosZero())
18703 return N->getOperand(1);
18704 return SDValue();
18705}
18706
18707/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes.
18708static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
18709 // FANDN(x, 0.0) -> 0.0
18710 // FANDN(0.0, x) -> x
18711 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
18712 if (C->getValueAPF().isPosZero())
18713 return N->getOperand(1);
18714 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
18715 if (C->getValueAPF().isPosZero())
18716 return N->getOperand(1);
18717 return SDValue();
18718}
18719
18720static SDValue PerformBTCombine(SDNode *N,
18721 SelectionDAG &DAG,
18722 TargetLowering::DAGCombinerInfo &DCI) {
18723 // BT ignores high bits in the bit index operand.
18724 SDValue Op1 = N->getOperand(1);
18725 if (Op1.hasOneUse()) {
18726 unsigned BitWidth = Op1.getValueSizeInBits();
18727 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
18728 APInt KnownZero, KnownOne;
18729 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
18730 !DCI.isBeforeLegalizeOps());
18731 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18732 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
18733 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
18734 DCI.CommitTargetLoweringOpt(TLO);
18735 }
18736 return SDValue();
18737}
18738
18739static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
18740 SDValue Op = N->getOperand(0);
18741 if (Op.getOpcode() == ISD::BITCAST)
18742 Op = Op.getOperand(0);
18743 EVT VT = N->getValueType(0), OpVT = Op.getValueType();
18744 if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
18745 VT.getVectorElementType().getSizeInBits() ==
18746 OpVT.getVectorElementType().getSizeInBits()) {
18747 return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
18748 }
18749 return SDValue();
18750}
18751
18752static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
18753 const X86Subtarget *Subtarget) {
18754 EVT VT = N->getValueType(0);
18755 if (!VT.isVector())
18756 return SDValue();
18757
18758 SDValue N0 = N->getOperand(0);
18759 SDValue N1 = N->getOperand(1);
18760 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
18761 SDLoc dl(N);
18762
18763 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
18764 // SSE and AVX2 since there is no sign-extended shift right
18765 // operation on a vector with 64-bit elements.
18766 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
18767 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
18768 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
18769 N0.getOpcode() == ISD::SIGN_EXTEND)) {
18770 SDValue N00 = N0.getOperand(0);
18771
18772 // EXTLOAD has a better solution on AVX2:
18773 // it may be replaced with an X86ISD::VSEXT node.
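  // (E.g. on AVX2 a v4i32 extending load feeding this pattern can
  // typically be selected as a single vpmovsxdq, so it is left alone
  // here.)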
18774 if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) 18775 if (!ISD::isNormalLoad(N00.getNode())) 18776 return SDValue(); 18777 18778 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { 18779 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, 18780 N00, N1); 18781 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); 18782 } 18783 } 18784 return SDValue(); 18785} 18786 18787static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, 18788 TargetLowering::DAGCombinerInfo &DCI, 18789 const X86Subtarget *Subtarget) { 18790 if (!DCI.isBeforeLegalizeOps()) 18791 return SDValue(); 18792 18793 if (!Subtarget->hasFp256()) 18794 return SDValue(); 18795 18796 EVT VT = N->getValueType(0); 18797 if (VT.isVector() && VT.getSizeInBits() == 256) { 18798 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); 18799 if (R.getNode()) 18800 return R; 18801 } 18802 18803 return SDValue(); 18804} 18805 18806static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, 18807 const X86Subtarget* Subtarget) { 18808 SDLoc dl(N); 18809 EVT VT = N->getValueType(0); 18810 18811 // Let legalize expand this if it isn't a legal type yet. 18812 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 18813 return SDValue(); 18814 18815 EVT ScalarVT = VT.getScalarType(); 18816 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || 18817 (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) 18818 return SDValue(); 18819 18820 SDValue A = N->getOperand(0); 18821 SDValue B = N->getOperand(1); 18822 SDValue C = N->getOperand(2); 18823 18824 bool NegA = (A.getOpcode() == ISD::FNEG); 18825 bool NegB = (B.getOpcode() == ISD::FNEG); 18826 bool NegC = (C.getOpcode() == ISD::FNEG); 18827 18828 // Negative multiplication when NegA xor NegB 18829 bool NegMul = (NegA != NegB); 18830 if (NegA) 18831 A = A.getOperand(0); 18832 if (NegB) 18833 B = B.getOperand(0); 18834 if (NegC) 18835 C = C.getOperand(0); 18836 18837 unsigned Opcode; 18838 if (!NegMul) 18839 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; 18840 else 18841 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; 18842 18843 return DAG.getNode(Opcode, dl, VT, A, B, C); 18844} 18845 18846static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, 18847 TargetLowering::DAGCombinerInfo &DCI, 18848 const X86Subtarget *Subtarget) { 18849 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 18850 // (and (i32 x86isd::setcc_carry), 1) 18851 // This eliminates the zext. This transformation is necessary because 18852 // ISD::SETCC is always legalized to i8. 
18853 SDLoc dl(N); 18854 SDValue N0 = N->getOperand(0); 18855 EVT VT = N->getValueType(0); 18856 18857 if (N0.getOpcode() == ISD::AND && 18858 N0.hasOneUse() && 18859 N0.getOperand(0).hasOneUse()) { 18860 SDValue N00 = N0.getOperand(0); 18861 if (N00.getOpcode() == X86ISD::SETCC_CARRY) { 18862 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 18863 if (!C || C->getZExtValue() != 1) 18864 return SDValue(); 18865 return DAG.getNode(ISD::AND, dl, VT, 18866 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 18867 N00.getOperand(0), N00.getOperand(1)), 18868 DAG.getConstant(1, VT)); 18869 } 18870 } 18871 18872 if (VT.is256BitVector()) { 18873 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); 18874 if (R.getNode()) 18875 return R; 18876 } 18877 18878 return SDValue(); 18879} 18880 18881// Optimize x == -y --> x+y == 0 18882// x != -y --> x+y != 0 18883static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { 18884 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 18885 SDValue LHS = N->getOperand(0); 18886 SDValue RHS = N->getOperand(1); 18887 18888 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) 18889 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) 18890 if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { 18891 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), 18892 LHS.getValueType(), RHS, LHS.getOperand(1)); 18893 return DAG.getSetCC(SDLoc(N), N->getValueType(0), 18894 addV, DAG.getConstant(0, addV.getValueType()), CC); 18895 } 18896 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) 18897 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) 18898 if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { 18899 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), 18900 RHS.getValueType(), LHS, RHS.getOperand(1)); 18901 return DAG.getSetCC(SDLoc(N), N->getValueType(0), 18902 addV, DAG.getConstant(0, addV.getValueType()), CC); 18903 } 18904 return SDValue(); 18905} 18906 18907// Helper function of PerformSETCCCombine. It is to materialize "setb reg" 18908// as "sbb reg,reg", since it can be extended without zext and produces 18909// an all-ones bit which is more useful than 0/1 in some cases. 18910static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { 18911 return DAG.getNode(ISD::AND, DL, MVT::i8, 18912 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 18913 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS), 18914 DAG.getConstant(1, MVT::i8)); 18915} 18916 18917// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 18918static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, 18919 TargetLowering::DAGCombinerInfo &DCI, 18920 const X86Subtarget *Subtarget) { 18921 SDLoc DL(N); 18922 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); 18923 SDValue EFLAGS = N->getOperand(1); 18924 18925 if (CC == X86::COND_A) { 18926 // Try to convert COND_A into COND_B in an attempt to facilitate 18927 // materializing "setb reg". 18928 // 18929 // Do not flip "e > c", where "c" is a constant, because Cmp instruction 18930 // cannot take an immediate as its first operand. 
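  // E.g. "x > y" tested as COND_A on (SUB x, y) can instead be tested as
  // COND_B ("below") on the swapped (SUB y, x), which MaterializeSETB can
  // then lower to an sbb+and sequence.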
18931 // 18932 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && 18933 EFLAGS.getValueType().isInteger() && 18934 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { 18935 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), 18936 EFLAGS.getNode()->getVTList(), 18937 EFLAGS.getOperand(1), EFLAGS.getOperand(0)); 18938 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); 18939 return MaterializeSETB(DL, NewEFLAGS, DAG); 18940 } 18941 } 18942 18943 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 18944 // a zext and produces an all-ones bit which is more useful than 0/1 in some 18945 // cases. 18946 if (CC == X86::COND_B) 18947 return MaterializeSETB(DL, EFLAGS, DAG); 18948 18949 SDValue Flags; 18950 18951 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 18952 if (Flags.getNode()) { 18953 SDValue Cond = DAG.getConstant(CC, MVT::i8); 18954 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); 18955 } 18956 18957 return SDValue(); 18958} 18959 18960// Optimize branch condition evaluation. 18961// 18962static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, 18963 TargetLowering::DAGCombinerInfo &DCI, 18964 const X86Subtarget *Subtarget) { 18965 SDLoc DL(N); 18966 SDValue Chain = N->getOperand(0); 18967 SDValue Dest = N->getOperand(1); 18968 SDValue EFLAGS = N->getOperand(3); 18969 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); 18970 18971 SDValue Flags; 18972 18973 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 18974 if (Flags.getNode()) { 18975 SDValue Cond = DAG.getConstant(CC, MVT::i8); 18976 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, 18977 Flags); 18978 } 18979 18980 return SDValue(); 18981} 18982 18983static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 18984 const X86TargetLowering *XTLI) { 18985 SDValue Op0 = N->getOperand(0); 18986 EVT InVT = Op0->getValueType(0); 18987 18988 // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) 18989 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { 18990 SDLoc dl(N); 18991 MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; 18992 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); 18993 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); 18994 } 18995 18996 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 18997 // a 32-bit target where SSE doesn't support i64->FP operations. 18998 if (Op0.getOpcode() == ISD::LOAD) { 18999 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 19000 EVT VT = Ld->getValueType(0); 19001 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 19002 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 19003 !XTLI->getSubtarget()->is64Bit() && 19004 VT == MVT::i64) { 19005 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 19006 Ld->getChain(), Op0, DAG); 19007 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 19008 return FILDChain; 19009 } 19010 } 19011 return SDValue(); 19012} 19013 19014// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 19015static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 19016 X86TargetLowering::DAGCombinerInfo &DCI) { 19017 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 19018 // the result is either zero or one (depending on the input carry bit). 19019 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 
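  // I.e. (adc 0, 0, carry) is 1 exactly when the carry flag is set, which
  // matches (and (setcc_carry), 1) and is lowered roughly as:
  //   sbb %eax, %eax      ; all-ones if carry was set, else zero
  //   andl $1, %eax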
19020 if (X86::isZeroNode(N->getOperand(0)) && 19021 X86::isZeroNode(N->getOperand(1)) && 19022 // We don't have a good way to replace an EFLAGS use, so only do this when 19023 // dead right now. 19024 SDValue(N, 1).use_empty()) { 19025 SDLoc DL(N); 19026 EVT VT = N->getValueType(0); 19027 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 19028 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 19029 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 19030 DAG.getConstant(X86::COND_B,MVT::i8), 19031 N->getOperand(2)), 19032 DAG.getConstant(1, VT)); 19033 return DCI.CombineTo(N, Res1, CarryOut); 19034 } 19035 19036 return SDValue(); 19037} 19038 19039// fold (add Y, (sete X, 0)) -> adc 0, Y 19040// (add Y, (setne X, 0)) -> sbb -1, Y 19041// (sub (sete X, 0), Y) -> sbb 0, Y 19042// (sub (setne X, 0), Y) -> adc -1, Y 19043static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 19044 SDLoc DL(N); 19045 19046 // Look through ZExts. 19047 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 19048 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 19049 return SDValue(); 19050 19051 SDValue SetCC = Ext.getOperand(0); 19052 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 19053 return SDValue(); 19054 19055 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 19056 if (CC != X86::COND_E && CC != X86::COND_NE) 19057 return SDValue(); 19058 19059 SDValue Cmp = SetCC.getOperand(1); 19060 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 19061 !X86::isZeroNode(Cmp.getOperand(1)) || 19062 !Cmp.getOperand(0).getValueType().isInteger()) 19063 return SDValue(); 19064 19065 SDValue CmpOp0 = Cmp.getOperand(0); 19066 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 19067 DAG.getConstant(1, CmpOp0.getValueType())); 19068 19069 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 19070 if (CC == X86::COND_NE) 19071 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 19072 DL, OtherVal.getValueType(), OtherVal, 19073 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 19074 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, 19075 DL, OtherVal.getValueType(), OtherVal, 19076 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 19077} 19078 19079/// PerformADDCombine - Do target-specific dag combines on integer adds. 19080static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, 19081 const X86Subtarget *Subtarget) { 19082 EVT VT = N->getValueType(0); 19083 SDValue Op0 = N->getOperand(0); 19084 SDValue Op1 = N->getOperand(1); 19085 19086 // Try to synthesize horizontal adds from adds of shuffles. 19087 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || 19088 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && 19089 isHorizontalBinOp(Op0, Op1, true)) 19090 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); 19091 19092 return OptimizeConditionalInDecrement(N, DAG); 19093} 19094 19095static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, 19096 const X86Subtarget *Subtarget) { 19097 SDValue Op0 = N->getOperand(0); 19098 SDValue Op1 = N->getOperand(1); 19099 19100 // X86 can't encode an immediate LHS of a sub. See if we can push the 19101 // negation into a preceding instruction. 19102 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { 19103 // If the RHS of the sub is a XOR with one use and a constant, invert the 19104 // immediate. 
Then add one to the LHS of the sub so we can turn
19105 // X-Y -> X+~Y+1, saving one register.
19106 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
19107 isa<ConstantSDNode>(Op1.getOperand(1))) {
19108 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
19109 EVT VT = Op0.getValueType();
19110 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
19111 Op1.getOperand(0),
19112 DAG.getConstant(~XorC, VT));
19113 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
19114 DAG.getConstant(C->getAPIntValue()+1, VT));
19115 }
19116 }
19117
19118 // Try to synthesize horizontal subs from subs of shuffles.
19119 EVT VT = N->getValueType(0);
19120 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
19121 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
19122 isHorizontalBinOp(Op0, Op1, true))
19123 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
19124
19125 return OptimizeConditionalInDecrement(N, DAG);
19126}
19127
19128/// performVZEXTCombine - Performs VZEXT combines.
19129static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
19130 TargetLowering::DAGCombinerInfo &DCI,
19131 const X86Subtarget *Subtarget) {
19132 // (vzext (bitcast (vzext x))) -> (vzext x)
19133 SDValue In = N->getOperand(0);
19134 while (In.getOpcode() == ISD::BITCAST)
19135 In = In.getOperand(0);
19136
19137 if (In.getOpcode() != X86ISD::VZEXT)
19138 return SDValue();
19139
19140 return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
19141 In.getOperand(0));
19142}
19143
19144SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
19145 DAGCombinerInfo &DCI) const {
19146 SelectionDAG &DAG = DCI.DAG;
19147 switch (N->getOpcode()) {
19148 default: break;
19149 case ISD::EXTRACT_VECTOR_ELT:
19150 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
19151 case ISD::VSELECT:
19152 case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
19153 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
19154 case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
19155 case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
19156 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
19157 case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
19158 case ISD::SHL:
19159 case ISD::SRA:
19160 case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
19161 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
19162 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
19163 case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
19164 case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
19165 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
19166 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
19167 case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
19168 case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
19169 case X86ISD::FXOR:
19170 case X86ISD::FOR: return PerformFORCombine(N, DAG);
19171 case X86ISD::FMIN:
19172 case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
19173 case X86ISD::FAND: return PerformFANDCombine(N, DAG);
19174 case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
19175 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
19176 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
19177 case ISD::ANY_EXTEND:
19178 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
19179 case ISD::SIGN_EXTEND: return PerformSExtCombine(N,
DAG, DCI, Subtarget);
19180 case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
19181 case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI, Subtarget);
19182 case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
19183 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
19184 case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
19185 case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
19186 case X86ISD::SHUFP: // Handle all target specific shuffles
19187 case X86ISD::PALIGNR:
19188 case X86ISD::UNPCKH:
19189 case X86ISD::UNPCKL:
19190 case X86ISD::MOVHLPS:
19191 case X86ISD::MOVLHPS:
19192 case X86ISD::PSHUFD:
19193 case X86ISD::PSHUFHW:
19194 case X86ISD::PSHUFLW:
19195 case X86ISD::MOVSS:
19196 case X86ISD::MOVSD:
19197 case X86ISD::VPERMILP:
19198 case X86ISD::VPERM2X128:
19199 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
19200 case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
19201 }
19202
19203 return SDValue();
19204}
19205
19206/// isTypeDesirableForOp - Return true if the target has native support for
19207/// the specified value type and it is 'desirable' to use the type for the
19208/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
19209/// instruction encodings are longer and some i16 instructions are slow.
19210bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
19211 if (!isTypeLegal(VT))
19212 return false;
19213 if (VT != MVT::i16)
19214 return true;
19215
19216 switch (Opc) {
19217 default:
19218 return true;
19219 case ISD::LOAD:
19220 case ISD::SIGN_EXTEND:
19221 case ISD::ZERO_EXTEND:
19222 case ISD::ANY_EXTEND:
19223 case ISD::SHL:
19224 case ISD::SRL:
19225 case ISD::SUB:
19226 case ISD::ADD:
19227 case ISD::MUL:
19228 case ISD::AND:
19229 case ISD::OR:
19230 case ISD::XOR:
19231 return false;
19232 }
19233}
19234
19235/// IsDesirableToPromoteOp - This method queries the target whether it is
19236/// beneficial for dag combiner to promote the specified node. If true, it
19237/// should return the desired promotion type by reference.
19238bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
19239 EVT VT = Op.getValueType();
19240 if (VT != MVT::i16)
19241 return false;
19242
19243 bool Promote = false;
19244 bool Commute = false;
19245 switch (Op.getOpcode()) {
19246 default: break;
19247 case ISD::LOAD: {
19248 LoadSDNode *LD = cast<LoadSDNode>(Op);
19249 // If the non-extending load has a single use and it's not live out, then it
19250 // might be folded.
19251 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
19252 Op.hasOneUse()*/) {
19253 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
19254 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
19255 // The only case where we'd want to promote LOAD (rather than it being
19256 // promoted as an operand) is when its only use is a liveout.
19257 if (UI->getOpcode() != ISD::CopyToReg)
19258 return false;
19259 }
19260 }
19261 Promote = true;
19262 break;
19263 }
19264 case ISD::SIGN_EXTEND:
19265 case ISD::ZERO_EXTEND:
19266 case ISD::ANY_EXTEND:
19267 Promote = true;
19268 break;
19269 case ISD::SHL:
19270 case ISD::SRL: {
19271 SDValue N0 = Op.getOperand(0);
19272 // Look out for (store (shl (load), x)).
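    // E.g. "*p <<= n" on an i16 can be selected as a single
    // memory-operand shift (shlw %cl, (%rax)); promoting the shift to
    // i32 would defeat that load/store folding, so we decline.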
19273 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 19274 return false; 19275 Promote = true; 19276 break; 19277 } 19278 case ISD::ADD: 19279 case ISD::MUL: 19280 case ISD::AND: 19281 case ISD::OR: 19282 case ISD::XOR: 19283 Commute = true; 19284 // fallthrough 19285 case ISD::SUB: { 19286 SDValue N0 = Op.getOperand(0); 19287 SDValue N1 = Op.getOperand(1); 19288 if (!Commute && MayFoldLoad(N1)) 19289 return false; 19290 // Avoid disabling potential load folding opportunities. 19291 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 19292 return false; 19293 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 19294 return false; 19295 Promote = true; 19296 } 19297 } 19298 19299 PVT = MVT::i32; 19300 return Promote; 19301} 19302 19303//===----------------------------------------------------------------------===// 19304// X86 Inline Assembly Support 19305//===----------------------------------------------------------------------===// 19306 19307namespace { 19308 // Helper to match a string separated by whitespace. 19309 bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { 19310 s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. 19311 19312 for (unsigned i = 0, e = args.size(); i != e; ++i) { 19313 StringRef piece(*args[i]); 19314 if (!s.startswith(piece)) // Check if the piece matches. 19315 return false; 19316 19317 s = s.substr(piece.size()); 19318 StringRef::size_type pos = s.find_first_not_of(" \t"); 19319 if (pos == 0) // We matched a prefix. 19320 return false; 19321 19322 s = s.substr(pos); 19323 } 19324 19325 return s.empty(); 19326 } 19327 const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; 19328} 19329 19330static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { 19331 19332 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { 19333 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && 19334 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && 19335 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { 19336 19337 if (AsmPieces.size() == 3) 19338 return true; 19339 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) 19340 return true; 19341 } 19342 } 19343 return false; 19344} 19345 19346bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 19347 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 19348 19349 std::string AsmStr = IA->getAsmString(); 19350 19351 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 19352 if (!Ty || Ty->getBitWidth() % 16 != 0) 19353 return false; 19354 19355 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 19356 SmallVector<StringRef, 4> AsmPieces; 19357 SplitString(AsmStr, AsmPieces, ";\n"); 19358 19359 switch (AsmPieces.size()) { 19360 default: return false; 19361 case 1: 19362 // FIXME: this should verify that we are targeting a 486 or better. If not, 19363 // we will turn this bswap into something that will be lowered to logical 19364 // ops instead of emitting the bswap asm. For now, we don't support 486 or 19365 // lower so don't worry about this. 
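    // E.g. the classic byte-swap idiom
    //   asm("bswap %0" : "=r"(v) : "0"(v))
    // matches the first pattern below and is lowered to llvm.bswap.i32
    // instead of being left as an opaque inline-asm block.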
19366 // bswap $0 19367 if (matchAsm(AsmPieces[0], "bswap", "$0") || 19368 matchAsm(AsmPieces[0], "bswapl", "$0") || 19369 matchAsm(AsmPieces[0], "bswapq", "$0") || 19370 matchAsm(AsmPieces[0], "bswap", "${0:q}") || 19371 matchAsm(AsmPieces[0], "bswapl", "${0:q}") || 19372 matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { 19373 // No need to check constraints, nothing other than the equivalent of 19374 // "=r,0" would be valid here. 19375 return IntrinsicLowering::LowerToByteSwap(CI); 19376 } 19377 19378 // rorw $$8, ${0:w} --> llvm.bswap.i16 19379 if (CI->getType()->isIntegerTy(16) && 19380 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 19381 (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || 19382 matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { 19383 AsmPieces.clear(); 19384 const std::string &ConstraintsStr = IA->getConstraintString(); 19385 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 19386 array_pod_sort(AsmPieces.begin(), AsmPieces.end()); 19387 if (clobbersFlagRegisters(AsmPieces)) 19388 return IntrinsicLowering::LowerToByteSwap(CI); 19389 } 19390 break; 19391 case 3: 19392 if (CI->getType()->isIntegerTy(32) && 19393 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 19394 matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && 19395 matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && 19396 matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { 19397 AsmPieces.clear(); 19398 const std::string &ConstraintsStr = IA->getConstraintString(); 19399 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 19400 array_pod_sort(AsmPieces.begin(), AsmPieces.end()); 19401 if (clobbersFlagRegisters(AsmPieces)) 19402 return IntrinsicLowering::LowerToByteSwap(CI); 19403 } 19404 19405 if (CI->getType()->isIntegerTy(64)) { 19406 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 19407 if (Constraints.size() >= 2 && 19408 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 19409 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 19410 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 19411 if (matchAsm(AsmPieces[0], "bswap", "%eax") && 19412 matchAsm(AsmPieces[1], "bswap", "%edx") && 19413 matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) 19414 return IntrinsicLowering::LowerToByteSwap(CI); 19415 } 19416 } 19417 break; 19418 } 19419 return false; 19420} 19421 19422/// getConstraintType - Given a constraint letter, return the type of 19423/// constraint it is for this target. 19424X86TargetLowering::ConstraintType 19425X86TargetLowering::getConstraintType(const std::string &Constraint) const { 19426 if (Constraint.size() == 1) { 19427 switch (Constraint[0]) { 19428 case 'R': 19429 case 'q': 19430 case 'Q': 19431 case 'f': 19432 case 't': 19433 case 'u': 19434 case 'y': 19435 case 'x': 19436 case 'Y': 19437 case 'l': 19438 return C_RegisterClass; 19439 case 'a': 19440 case 'b': 19441 case 'c': 19442 case 'd': 19443 case 'S': 19444 case 'D': 19445 case 'A': 19446 return C_Register; 19447 case 'I': 19448 case 'J': 19449 case 'K': 19450 case 'L': 19451 case 'M': 19452 case 'N': 19453 case 'G': 19454 case 'C': 19455 case 'e': 19456 case 'Z': 19457 return C_Other; 19458 default: 19459 break; 19460 } 19461 } 19462 return TargetLowering::getConstraintType(Constraint); 19463} 19464 19465/// Examine constraint type and operand type and determine a weight value. 19466/// This object must already have been set up with the operand type 19467/// and the current alternative constraint selected. 
19468TargetLowering::ConstraintWeight 19469 X86TargetLowering::getSingleConstraintMatchWeight( 19470 AsmOperandInfo &info, const char *constraint) const { 19471 ConstraintWeight weight = CW_Invalid; 19472 Value *CallOperandVal = info.CallOperandVal; 19473 // If we don't have a value, we can't do a match, 19474 // but allow it at the lowest weight. 19475 if (CallOperandVal == NULL) 19476 return CW_Default; 19477 Type *type = CallOperandVal->getType(); 19478 // Look at the constraint type. 19479 switch (*constraint) { 19480 default: 19481 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 19482 case 'R': 19483 case 'q': 19484 case 'Q': 19485 case 'a': 19486 case 'b': 19487 case 'c': 19488 case 'd': 19489 case 'S': 19490 case 'D': 19491 case 'A': 19492 if (CallOperandVal->getType()->isIntegerTy()) 19493 weight = CW_SpecificReg; 19494 break; 19495 case 'f': 19496 case 't': 19497 case 'u': 19498 if (type->isFloatingPointTy()) 19499 weight = CW_SpecificReg; 19500 break; 19501 case 'y': 19502 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 19503 weight = CW_SpecificReg; 19504 break; 19505 case 'x': 19506 case 'Y': 19507 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || 19508 ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256())) 19509 weight = CW_Register; 19510 break; 19511 case 'I': 19512 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 19513 if (C->getZExtValue() <= 31) 19514 weight = CW_Constant; 19515 } 19516 break; 19517 case 'J': 19518 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19519 if (C->getZExtValue() <= 63) 19520 weight = CW_Constant; 19521 } 19522 break; 19523 case 'K': 19524 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19525 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 19526 weight = CW_Constant; 19527 } 19528 break; 19529 case 'L': 19530 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19531 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 19532 weight = CW_Constant; 19533 } 19534 break; 19535 case 'M': 19536 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19537 if (C->getZExtValue() <= 3) 19538 weight = CW_Constant; 19539 } 19540 break; 19541 case 'N': 19542 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19543 if (C->getZExtValue() <= 0xff) 19544 weight = CW_Constant; 19545 } 19546 break; 19547 case 'G': 19548 case 'C': 19549 if (dyn_cast<ConstantFP>(CallOperandVal)) { 19550 weight = CW_Constant; 19551 } 19552 break; 19553 case 'e': 19554 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19555 if ((C->getSExtValue() >= -0x80000000LL) && 19556 (C->getSExtValue() <= 0x7fffffffLL)) 19557 weight = CW_Constant; 19558 } 19559 break; 19560 case 'Z': 19561 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19562 if (C->getZExtValue() <= 0xffffffff) 19563 weight = CW_Constant; 19564 } 19565 break; 19566 } 19567 return weight; 19568} 19569 19570/// LowerXConstraint - try to replace an X constraint, which matches anything, 19571/// with another that has more specific requirements based on the type of the 19572/// corresponding operand. 19573const char *X86TargetLowering:: 19574LowerXConstraint(EVT ConstraintVT) const { 19575 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 19576 // 'f' like normal targets. 
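  // E.g. an "X"-constrained f64 operand is given constraint "Y" below on
  // an SSE2 target, steering it into %xmm registers rather than onto the
  // x87 stack.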
/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
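// For example (hypothetical), with SSE2 available a float operand written
// with the catch-all "X" constraint:
//
//   asm("mulss %1, %0" : "+X"(f) : "X"(g));
//
// is rewritten to use "Y", keeping both values in XMM registers rather than
// on the x87 stack.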
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in
    // certain memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup.  These can't
    // be used as immediates.
    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
      return;

    // If we are in non-PIC codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
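// A worked example for the 'i' case above (hypothetical global @g, non-PIC
// code): the operand "i"(&g + 4) arrives as the DAG (add (GlobalAddress @g),
// 4); the loop walks the ADD chain, accumulates Offset = 4, and emits a
// TargetGlobalAddress of @g with that offset. Under GOT-style PIC the same
// operand is rejected, since the address would require a load at runtime.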
std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, &X86::FR64RegClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        return std::make_pair(0U, &X86::VR256RegClass);
      // AVX-512 types.
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map {st(0)} .. {st(7)} onto ST0 .. ST7.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::ST0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }
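  // For example, a (hypothetical) "{st(1)}" operand or clobber resolves above
  // to X86::ST1 in the RFP80 class, and a "{flags}" clobber to X86::EFLAGS.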
  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == &X86::GR16RegClass) {
    if (VT == MVT::i8 || VT == MVT::i1) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR8RegClass;
      }
    } else if (VT == MVT::i32 || VT == MVT::f32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR32RegClass;
      }
    } else if (VT == MVT::i64 || VT == MVT::f64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR64RegClass;
      }
    }
  } else if (Res.second == &X86::FR32RegClass ||
             Res.second == &X86::FR64RegClass ||
             Res.second == &X86::VR128RegClass ||
             Res.second == &X86::VR256RegClass ||
             Res.second == &X86::FR32XRegClass ||
             Res.second == &X86::FR64XRegClass ||
             Res.second == &X86::VR128XRegClass ||
             Res.second == &X86::VR256XRegClass ||
             Res.second == &X86::VR512RegClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it
    // can find, ignoring the required type.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
  }

  return Res;
}
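// For instance (hypothetical), an operand pinned to "{xmm0}" with type
// v4f32: the generic mapper may return XMM0 in the scalar FR32 class, the
// first class it finds containing the register; the fixup above then widens
// Res.second to VR128 so the full vector type is legal for the class.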