//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "MCTargetDesc/X86ShuffleDecode.h"
#include "X86.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc(
        "Sets the preferable loop alignment for experiments (as log2 bytes) "
        "(the last x86-experimental-pref-loop-alignment bits"
        " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
    "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
    cl::desc(
        "Sets the preferable loop alignment for experiments (as log2 bytes) "
        "for innermost loops only. If specified, this option overrides "
        "alignment set by x86-experimental-pref-loop-alignment."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

static cl::opt<bool> ExperimentalUnorderedISEL(
    "x86-experimental-unordered-atomic-isel", cl::init(false),
    cl::desc("Use LoadSDNode and StoreSDNode instead of "
             "AtomicSDNode for unordered atomic loads and "
             "stores respectively."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
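
  // Roughly speaking, the bypass above lets the generic BypassSlowDivision
  // transform guard a slow wide divide with a cheap runtime check and a
  // narrow divide (illustrative sketch of the IR-level rewrite, not the
  // exact output):
  //
  //   if (((a | b) >> 8) == 0)            // both operands fit in 8 bits
  //     q = narrow_udiv8(a & 0xFF, b & 0xFF);  // cheap 8-bit DIV
  //   else
  //     q = a udiv b;                          // slow 32-bit DIV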

  // Setup Windows compiler runtime calls.
  if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
      { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
      { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
      { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
      { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size
  // to 32 bits so the AtomicExpandPass will expand it so we don't need
  // cmpxchg8b.
  // FIXME: Should we be limiting the atomic size on other configs? Default is
  // 1024.
  if (!Subtarget.hasCmpxchg8b())
    setMaxAtomicSizeInBitsSupported(32);

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
    setCondCodeAction(ISD::SETOEQ, VT, Expand);
    setCondCodeAction(ISD::SETUNE, VT, Expand);
  }
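
  // Background: x86 FP compares (UCOMISS/UCOMISD/FUCOMI) report "unordered"
  // in PF, so "ordered equal" is ZF==1 && PF==0 and "unordered not-equal" is
  // ZF==0 || PF==1; neither maps onto a single SETcc. Expanding lets the
  // generic legalizer split each into two flag checks, roughly (sketch):
  //
  //   ucomiss %xmm1, %xmm0
  //   setnp   %al          # not unordered
  //   sete    %cl          # equal
  //   andb    %cl, %al     # SETOEQ result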

  // Integer absolute.
  if (Subtarget.hasCMov()) {
    setOperationAction(ISD::ABS            , MVT::i16  , Custom);
    setOperationAction(ISD::ABS            , MVT::i32  , Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS          , MVT::i64  , Custom);
  }

  // Funnel shifts.
  for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
    // For slow shld targets we only lower for code size.
    LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;

    setOperationAction(ShiftOp             , MVT::i8   , Custom);
    setOperationAction(ShiftOp             , MVT::i16  , Custom);
    setOperationAction(ShiftOp             , MVT::i32  , ShiftDoubleAction);
    if (Subtarget.is64Bit())
      setOperationAction(ShiftOp           , MVT::i64  , ShiftDoubleAction);
  }
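
  // i32/i64 funnel shifts map naturally onto the SHLD/SHRD double-shift
  // instructions: roughly, fshl(a, b, c) == (a << c) | (b >> (32 - c)) is
  // what "shld a, b, cl" computes (sketch). On subtargets where SHLD/SHRD
  // are microcoded and slow we only want to form them when optimizing for
  // size, which is why those widths are Custom rather than Legal there.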

  if (!Subtarget.useSoftFloat()) {
    // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    // operation.
    setOperationAction(ISD::UINT_TO_FP,        MVT::i8, Promote);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
    setOperationAction(ISD::UINT_TO_FP,        MVT::i16, Promote);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP,        MVT::i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP,        MVT::i64, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);

    // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    // this operation.
    setOperationAction(ISD::SINT_TO_FP,        MVT::i8, Promote);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
    // SSE has no i16 to fp conversion, only i32. We promote in the handler
    // to allow f80 to use i16 and f64 to use i16 with sse1 only
    setOperationAction(ISD::SINT_TO_FP,        MVT::i16, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
    // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
    setOperationAction(ISD::SINT_TO_FP,        MVT::i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
    // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::SINT_TO_FP,        MVT::i64, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);

    // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
    // this operation.
    setOperationAction(ISD::FP_TO_SINT,        MVT::i8,  Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8,  Promote);
    setOperationAction(ISD::FP_TO_SINT,        MVT::i16, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT,        MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT,        MVT::i64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);

    // Handle FP_TO_UINT by promoting the destination to a larger signed
    // conversion.
    setOperationAction(ISD::FP_TO_UINT,        MVT::i8,  Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8,  Promote);
    setOperationAction(ISD::FP_TO_UINT,        MVT::i16, Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT,        MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT,        MVT::i64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);

    setOperationAction(ISD::LRINT,             MVT::f32, Custom);
    setOperationAction(ISD::LRINT,             MVT::f64, Custom);
    setOperationAction(ISD::LLRINT,            MVT::f32, Custom);
    setOperationAction(ISD::LLRINT,            MVT::f64, Custom);

    if (!Subtarget.is64Bit()) {
      setOperationAction(ISD::LRINT,  MVT::i64, Custom);
      setOperationAction(ISD::LLRINT, MVT::i64, Custom);
    }
  }

  if (Subtarget.hasSSE2()) {
    // Custom lowering for saturating float to int conversions.
    // We handle promotion to larger result types manually.
    for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
      setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
      setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
    }
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
    }
  }

  // Handle address space casts between mixed sized pointers.
  setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
  setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
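
  // The two-result form pays off because x86's DIV/IDIV leave the quotient in
  // AX/EAX/RAX and the remainder in DX/EDX/RDX, so after CSE a source pair
  // like
  //
  //   unsigned q = x / y, r = x % y;
  //
  // should end up as a single divide (roughly: xor edx,edx; div ecx) with the
  // two results simply read from different registers.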

  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC,     VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);

  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
  setOperationAction(ISD::FREM             , MVT::f128 , Expand);

  if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
    setOperationAction(ISD::FLT_ROUNDS_    , MVT::i32  , Custom);
    setOperationAction(ISD::SET_ROUNDING   , MVT::Other, Custom);
  }

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);

  if (Subtarget.hasBMI()) {
    // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
    // is enabled.
    setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
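
  // Rationale (sketch): TZCNT is fully defined for a zero input, so with BMI
  // the plain CTTZ forms can be selected directly. Without BMI the only
  // instruction is BSF, whose destination is undefined when the source is
  // zero, so CTTZ_ZERO_UNDEF stays Legal (it is allowed to be undefined
  // there) while plain CTTZ is Custom and gets an explicit zero check --
  // roughly "bsf + cmov" substituting the bit width for a zero source.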

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
  } else {
    for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
      if (VT == MVT::i64 && !Subtarget.is64Bit())
        continue;
      setOperationAction(ISD::CTLZ           , VT, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
    }
  }
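
  // Similarly for leading zeros (sketch): LZCNT is well defined for zero, but
  // the baseline BSR instruction returns the index of the highest set bit
  // rather than a zero count and is undefined for a zero source, so without
  // LZCNT every CTLZ flavour is Custom -- lowered to roughly "bsr + xor 31"
  // (or 63) plus a cmov-based zero check for the non-ZERO_UNDEF form.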

  for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
                  ISD::STRICT_FP_TO_FP16}) {
    // Special handling for half-precision floating point conversions.
    // If we don't have F16C support, then lower half float conversions
    // into library calls.
    setOperationAction(
        Op, MVT::f32,
        (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
    // There's never any support for operations beyond MVT::f32.
    setOperationAction(Op, MVT::f64, Expand);
    setOperationAction(Op, MVT::f80, Expand);
    setOperationAction(Op, MVT::f128, Expand);
  }

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::PARITY, MVT::i8, Custom);
  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
    else
      setOperationAction(ISD::CTPOP        , MVT::i64  , Custom);

    setOperationAction(ISD::PARITY, MVT::i16, Custom);
    setOperationAction(ISD::PARITY, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::PARITY, MVT::i64, Custom);
  }
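
  // PARITY can exploit the fact that x86's PF flag reflects the parity of
  // the low 8 bits of a result: the custom lowering folds wider values down
  // with XORs of the halves until 8 bits remain and then reads PF, roughly
  // "xor/test ...; setnp" (sketch) -- no POPCNT required. With POPCNT
  // available, i8 CTPOP is instead simply promoted to the i32 form above.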

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);

  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC,  VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool    , VT, Custom);
    setOperationAction(ISD::JumpTable       , VT, Custom);
    setOperationAction(ISD::GlobalAddress   , VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol  , VT, Custom);
    setOperationAction(ISD::BlockAddress    , VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
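
  // On 32-bit targets an i64 shift is split by type legalization into a
  // *_PARTS node operating on two i32 halves; the custom lowering turns that
  // into SHLD/SHRD on the halves plus a test of bit 5 of the count (i.e.
  // shift >= 32) that selects the right half ordering. Sketch for a left
  // shift of hi:lo in edx:eax by cl:
  //
  //   shld edx, eax, cl      ; hi = (hi << cl) | (lo >> (32-cl))
  //   shl  eax, cl           ; lo <<= cl
  //   test cl, 32
  //   cmovne edx, eax        ; if cl >= 32, hi = shifted lo
  //   ... (and zero lo in that case)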

  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (!Subtarget.is64Bit())
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    // Disable f32->f64 extload as we can only generate this in one instruction
    // under optsize. So it's easier to pattern match (fpext (load)) for that
    // case instead of needing to emit 2 instructions for extload in the
    // non-optsize case.
    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // These might be better off as horizontal vector ops.
      setOperationAction(ISD::FADD, VT, Custom);
      setOperationAction(ISD::FSUB, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN   , VT, Expand);
      setOperationAction(ISD::FCOS   , VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

  } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
             (UseX87 || Is64Bit)) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    if (UseX87)
      addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    if (UseX87)
      setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    if (UseX87)
      setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    if (UseX87) {
      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF,     VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN   , VT, Expand);
      setOperationAction(ISD::FCOS   , VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
  }

  // Expand FP32 immediates into loads from the stack, save special cases.
  if (isTypeLegal(MVT::f32)) {
    if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
      addLegalFPImmediate(APFloat(+0.0f)); // FLD0
      addLegalFPImmediate(APFloat(+1.0f)); // FLD1
      addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
      addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    } else // SSE immediates.
      addLegalFPImmediate(APFloat(+0.0f)); // xorps
  }
  // Expand FP64 immediates into loads from the stack, save special cases.
  if (isTypeLegal(MVT::f64)) {
    if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
      addLegalFPImmediate(APFloat(+0.0)); // FLD0
      addLegalFPImmediate(APFloat(+1.0)); // FLD1
      addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
      addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    } else // SSE immediates.
      addLegalFPImmediate(APFloat(+0.0)); // xorpd
  }
  // Handle constrained floating-point operations on scalars.
  setOperationAction(ISD::STRICT_FADD,      MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FADD,      MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FSUB,      MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FSUB,      MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FMUL,      MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FMUL,      MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FDIV,      MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FDIV,      MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FSQRT,     MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FSQRT,     MVT::f64, Legal);

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // f80 always uses X87.
  if (UseX87) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN   , MVT::f80, Expand);
    setOperationAction(ISD::FCOS   , MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
    setOperationAction(ISD::LROUND, MVT::f80, Expand);
    setOperationAction(ISD::LLROUND, MVT::f80, Expand);
    setOperationAction(ISD::LRINT, MVT::f80, Custom);
    setOperationAction(ISD::LLRINT, MVT::f80, Custom);

    // Handle constrained floating-point operations on scalars.
    setOperationAction(ISD::STRICT_FADD     , MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FSUB     , MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FMUL     , MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FDIV     , MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FSQRT    , MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
    // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
    // as Custom.
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
  }

  // f128 uses xmm registers, but most operations require libcalls.
  if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                   : &X86::VR128RegClass);

    addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps

    setOperationAction(ISD::FADD,        MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
    setOperationAction(ISD::FSUB,        MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
    setOperationAction(ISD::FDIV,        MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
    setOperationAction(ISD::FMUL,        MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
    setOperationAction(ISD::FMA,         MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FMA,  MVT::f128, LibCall);

    setOperationAction(ISD::FABS, MVT::f128, Custom);
    setOperationAction(ISD::FNEG, MVT::f128, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);

    setOperationAction(ISD::FSIN,         MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSIN,  MVT::f128, LibCall);
    setOperationAction(ISD::FCOS,         MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FCOS,  MVT::f128, LibCall);
    setOperationAction(ISD::FSINCOS,      MVT::f128, LibCall);
    // No STRICT_FSINCOS
    setOperationAction(ISD::FSQRT,        MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);

    setOperationAction(ISD::FP_EXTEND,        MVT::f128, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
    // We need to custom handle any FP_ROUND with an f128 input, but
    // LegalizeDAG uses the result type to know when to run a custom handler.
    // So we have to list all legal floating point result types here.
    if (isTypeLegal(MVT::f32)) {
      setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
    }
    if (isTypeLegal(MVT::f64)) {
      setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
    }
    if (isTypeLegal(MVT::f80)) {
      setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
    }

    setOperationAction(ISD::SETCC, MVT::f128, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
    setTruncStoreAction(MVT::f128, MVT::f32, Expand);
    setTruncStoreAction(MVT::f128, MVT::f64, Expand);
    setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f128 , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN,      VT, Expand);
    setOperationAction(ISD::FSINCOS,   VT, Expand);
    setOperationAction(ISD::FCOS,      VT, Expand);
    setOperationAction(ISD::FREM,      VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW,      VT, Expand);
    setOperationAction(ISD::FLOG,      VT, Expand);
    setOperationAction(ISD::FLOG2,     VT, Expand);
    setOperationAction(ISD::FLOG10,    VT, Expand);
    setOperationAction(ISD::FEXP,      VT, Expand);
    setOperationAction(ISD::FEXP2,     VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
    setOperationAction(ISD::FMA,  VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);

    setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
    setOperationAction(ISD::STORE,              MVT::v2f32, Custom);

    setOperationAction(ISD::STRICT_FADD,        MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FSUB,        MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FMUL,        MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FDIV,        MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f32, Legal);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
                     MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::SREM, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::UREM, VT, Custom);
    }

    setOperationAction(ISD::MUL,                MVT::v2i8,  Custom);
    setOperationAction(ISD::MUL,                MVT::v4i8,  Custom);
    setOperationAction(ISD::MUL,                MVT::v8i8,  Custom);

    setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    setOperationAction(ISD::MULHU,              MVT::v4i32, Custom);
    setOperationAction(ISD::MULHS,              MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
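
    // SSE2 only has genuine element-wise multiplies for 16-bit lanes
    // (PMULLW / PMULHW / PMULHUW), which is why the v8i16 MUL/MULH* entries
    // above are Legal. A v4i32 low multiply needs PMULLD (SSE4.1) and 64-bit
    // lane multiplies have no single instruction before AVX512DQ, so those
    // cases are Custom and get built out of PMULUDQ plus shuffles/shifts
    // (sketch of the strategy, not the exact sequence).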

    setOperationAction(ISD::SMULO,              MVT::v16i8, Custom);
    setOperationAction(ISD::UMULO,              MVT::v16i8, Custom);

    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
    }

    setOperationAction(ISD::UADDSAT,            MVT::v16i8, Legal);
    setOperationAction(ISD::SADDSAT,            MVT::v16i8, Legal);
    setOperationAction(ISD::USUBSAT,            MVT::v16i8, Legal);
    setOperationAction(ISD::SSUBSAT,            MVT::v16i8, Legal);
    setOperationAction(ISD::UADDSAT,            MVT::v8i16, Legal);
    setOperationAction(ISD::SADDSAT,            MVT::v8i16, Legal);
    setOperationAction(ISD::USUBSAT,            MVT::v8i16, Legal);
    setOperationAction(ISD::SSUBSAT,            MVT::v8i16, Legal);
    setOperationAction(ISD::USUBSAT,            MVT::v4i32, Custom);
    setOperationAction(ISD::USUBSAT,            MVT::v2i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC,              VT, Custom);
      setOperationAction(ISD::STRICT_FSETCC,      VT, Custom);
      setOperationAction(ISD::STRICT_FSETCCS,     VT, Custom);
      setOperationAction(ISD::CTPOP,              VT, Custom);
      setOperationAction(ISD::ABS,                VT, Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4i32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v8i16, Custom);
    setOperationAction(ISD::SELECT,             MVT::v16i8, Custom);

    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v2i32, Custom);

    // Custom legalize these to avoid over promotion or custom promotion.
    for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
      setOperationAction(ISD::FP_TO_SINT,        VT, Custom);
      setOperationAction(ISD::FP_TO_UINT,        VT, Custom);
      setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
    }

    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v4i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::SINT_TO_FP,         MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2f32, Custom);
    setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v2f32, Custom);

    // We want to legalize this to an f64 load rather than an i64 load on
    // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
    // store.
    setOperationAction(ISD::LOAD,               MVT::v2i32, Custom);
    setOperationAction(ISD::LOAD,               MVT::v4i16, Custom);
    setOperationAction(ISD::LOAD,               MVT::v8i8,  Custom);
    setOperationAction(ISD::STORE,              MVT::v2i32, Custom);
    setOperationAction(ISD::STORE,              MVT::v4i16, Custom);
    setOperationAction(ISD::STORE,              MVT::v8i8,  Custom);

    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);

    setOperationAction(ISD::TRUNCATE,    MVT::v2i8,  Custom);
    setOperationAction(ISD::TRUNCATE,    MVT::v2i16, Custom);
    setOperationAction(ISD::TRUNCATE,    MVT::v2i32, Custom);
    setOperationAction(ISD::TRUNCATE,    MVT::v4i8,  Custom);
    setOperationAction(ISD::TRUNCATE,    MVT::v4i16, Custom);
    setOperationAction(ISD::TRUNCATE,    MVT::v8i8,  Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL,              VT, Custom);
      setOperationAction(ISD::SHL,              VT, Custom);
      setOperationAction(ISD::SRA,              VT, Custom);
    }
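
    // Baseline SSE2 can only shift all lanes by one shared count (PSLLW/D/Q
    // and friends take an immediate or a scalar count in an XMM register) and
    // has no byte-element shifts at all, so the general per-element cases are
    // Custom here. The custom lowering then picks whatever trick fits, e.g. a
    // uniform-count fast path, 16-bit multiplies for variable left shifts, or
    // the native variable shifts (VPSLLVD/VPSRLVD/VPSRAVD) once AVX2 is
    // available (sketch of the strategy, not an exhaustive list).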

    setOperationAction(ISD::ROTL,               MVT::v4i32, Custom);
    setOperationAction(ISD::ROTL,               MVT::v8i16, Custom);

    // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
    // shifts) is better.
    if (!Subtarget.useAVX512Regs() &&
        !(Subtarget.hasBWI() && Subtarget.hasVLX()))
      setOperationAction(ISD::ROTL,             MVT::v16i8, Custom);

    setOperationAction(ISD::STRICT_FSQRT,       MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FADD,        MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FSUB,        MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FMUL,        MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FDIV,        MVT::v2f64, Legal);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
    setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
    setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ,               MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ,               MVT::v2i64, Custom);

    // These might be better off as horizontal vector ops.
    setOperationAction(ISD::ADD,                MVT::i16, Custom);
    setOperationAction(ISD::ADD,                MVT::i32, Custom);
    setOperationAction(ISD::SUB,                MVT::i16, Custom);
    setOperationAction(ISD::SUB,                MVT::i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR,            RoundedTy,  Legal);
      setOperationAction(ISD::STRICT_FFLOOR,     RoundedTy,  Legal);
      setOperationAction(ISD::FCEIL,             RoundedTy,  Legal);
      setOperationAction(ISD::STRICT_FCEIL,      RoundedTy,  Legal);
      setOperationAction(ISD::FTRUNC,            RoundedTy,  Legal);
      setOperationAction(ISD::STRICT_FTRUNC,     RoundedTy,  Legal);
      setOperationAction(ISD::FRINT,             RoundedTy,  Legal);
      setOperationAction(ISD::STRICT_FRINT,      RoundedTy,  Legal);
      setOperationAction(ISD::FNEARBYINT,        RoundedTy,  Legal);
      setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy,  Legal);
      setOperationAction(ISD::FROUNDEVEN,        RoundedTy,  Legal);
      setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy,  Legal);

      setOperationAction(ISD::FROUND,            RoundedTy,  Custom);
    }
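
    // SSE4.1's ROUNDSS/ROUNDSD/ROUNDPS/ROUNDPD take an immediate rounding
    // control, which directly covers floor, ceil, trunc, rint/nearbyint and
    // round-to-even -- hence the Legal entries above. FROUND (halfway cases
    // rounded away from zero) has no such immediate, so it stays Custom and
    // is emitted as a short sequence instead, roughly
    // trunc(x + copysign(just-under-0.5, x)) (sketch).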

    setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);

    setOperationAction(ISD::UADDSAT,            MVT::v4i32, Custom);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }
1171
1172    // i8 vectors are custom because the source register and source memory
1173    // operand types are not the same width.
1174    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1175
1176    if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1177      // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1178      // do the pre and post work in the vector domain.
1179      setOperationAction(ISD::UINT_TO_FP,        MVT::v4i64, Custom);
1180      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1181      // We need to mark SINT_TO_FP as Custom even though we want to expand it
1182      // so that DAG combine doesn't try to turn it into uint_to_fp.
1183      setOperationAction(ISD::SINT_TO_FP,        MVT::v4i64, Custom);
1184      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1185    }
1186  }
1187
1188  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1189    setOperationAction(ISD::UADDSAT,            MVT::v2i64, Custom);
1190  }
1191
1192  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
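    // XOP has per-element vector rotates (VPROT[BWDQ]); 256-bit types are
    // split into 128-bit halves.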
1193    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1194                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1195      setOperationAction(ISD::ROTL, VT, Custom);
1196
1197    // XOP can efficiently perform BITREVERSE with VPPERM.
1198    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1199      setOperationAction(ISD::BITREVERSE, VT, Custom);
1200
1201    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1202                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1203      setOperationAction(ISD::BITREVERSE, VT, Custom);
1204  }
1205
1206  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1207    bool HasInt256 = Subtarget.hasInt256();
1208
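    // 256-bit vectors live in the YMM registers. With VLX we use the extended
    // VR256X class so EVEX encodings can reach ymm16-ymm31.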
1209    addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1210                                                     : &X86::VR256RegClass);
1211    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1212                                                     : &X86::VR256RegClass);
1213    addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1214                                                     : &X86::VR256RegClass);
1215    addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1216                                                     : &X86::VR256RegClass);
1217    addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1218                                                     : &X86::VR256RegClass);
1219    addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1220                                                     : &X86::VR256RegClass);
1221
1222    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1223      setOperationAction(ISD::FFLOOR,            VT, Legal);
1224      setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1225      setOperationAction(ISD::FCEIL,             VT, Legal);
1226      setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1227      setOperationAction(ISD::FTRUNC,            VT, Legal);
1228      setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1229      setOperationAction(ISD::FRINT,             VT, Legal);
1230      setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1231      setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1232      setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1233      setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1234      setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1235
1236      setOperationAction(ISD::FROUND,            VT, Custom);
1237
1238      setOperationAction(ISD::FNEG,              VT, Custom);
1239      setOperationAction(ISD::FABS,              VT, Custom);
1240      setOperationAction(ISD::FCOPYSIGN,         VT, Custom);
1241    }
1242
1243    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1244    // even though v8i16 is a legal type.
1245    setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
1246    setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i16, MVT::v8i32);
1247    setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1248    setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1249    setOperationAction(ISD::FP_TO_SINT,                MVT::v8i32, Legal);
1250    setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v8i32, Legal);
1251
1252    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1253    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i32, Legal);
1254
1255    setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v4f32, Legal);
1256    setOperationAction(ISD::STRICT_FADD,        MVT::v8f32, Legal);
1257    setOperationAction(ISD::STRICT_FADD,        MVT::v4f64, Legal);
1258    setOperationAction(ISD::STRICT_FSUB,        MVT::v8f32, Legal);
1259    setOperationAction(ISD::STRICT_FSUB,        MVT::v4f64, Legal);
1260    setOperationAction(ISD::STRICT_FMUL,        MVT::v8f32, Legal);
1261    setOperationAction(ISD::STRICT_FMUL,        MVT::v4f64, Legal);
1262    setOperationAction(ISD::STRICT_FDIV,        MVT::v8f32, Legal);
1263    setOperationAction(ISD::STRICT_FDIV,        MVT::v4f64, Legal);
1264    setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Legal);
1265    setOperationAction(ISD::STRICT_FSQRT,       MVT::v8f32, Legal);
1266    setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f64, Legal);
1267
1268    if (!Subtarget.hasAVX512())
1269      setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1270
1271    // In the customized shift lowering, the legal v8i32/v4i64 cases
1272    // in AVX2 will be recognized.
1273    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1274      setOperationAction(ISD::SRL, VT, Custom);
1275      setOperationAction(ISD::SHL, VT, Custom);
1276      setOperationAction(ISD::SRA, VT, Custom);
1277    }
1278
1279    // These types need custom splitting if their input is a 128-bit vector.
1280    setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i64,  Custom);
1281    setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i32, Custom);
1282    setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i64,  Custom);
1283    setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i32, Custom);
1284
1285    setOperationAction(ISD::ROTL,              MVT::v8i32,  Custom);
1286    setOperationAction(ISD::ROTL,              MVT::v16i16, Custom);
1287
1288    // With BWI, expanding (and promoting the shifts) is the better option.
1289    if (!Subtarget.useBWIRegs())
1290      setOperationAction(ISD::ROTL,            MVT::v32i8,  Custom);
1291
1292    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1293    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1294    setOperationAction(ISD::SELECT,            MVT::v8i32, Custom);
1295    setOperationAction(ISD::SELECT,            MVT::v16i16, Custom);
1296    setOperationAction(ISD::SELECT,            MVT::v32i8, Custom);
1297    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1298
1299    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1300      setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
1301      setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
1302      setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
1303    }
1304
1305    setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1306    setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1307    setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1308    setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1309
1310    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1311      setOperationAction(ISD::SETCC,           VT, Custom);
1312      setOperationAction(ISD::STRICT_FSETCC,   VT, Custom);
1313      setOperationAction(ISD::STRICT_FSETCCS,  VT, Custom);
1314      setOperationAction(ISD::CTPOP,           VT, Custom);
1315      setOperationAction(ISD::CTLZ,            VT, Custom);
1316
1317      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1318      // setcc all the way to isel and prefer SETGT in some isel patterns.
1319      setCondCodeAction(ISD::SETLT, VT, Custom);
1320      setCondCodeAction(ISD::SETLE, VT, Custom);
1321    }
1322
1323    if (Subtarget.hasAnyFMA()) {
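      // FMA3 (or FMA4 on some AMD targets) provides fused multiply-add for
      // scalar and 128/256-bit vector FP types.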
1324      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1325                       MVT::v2f64, MVT::v4f64 }) {
1326        setOperationAction(ISD::FMA, VT, Legal);
1327        setOperationAction(ISD::STRICT_FMA, VT, Legal);
1328      }
1329    }
1330
1331    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1332      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1333      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1334    }
1335
1336    setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1337    setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1338    setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1339    setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1340
1341    setOperationAction(ISD::MULHU,     MVT::v8i32,  Custom);
1342    setOperationAction(ISD::MULHS,     MVT::v8i32,  Custom);
1343    setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1344    setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1345    setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1346    setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1347
1348    setOperationAction(ISD::SMULO,     MVT::v32i8, Custom);
1349    setOperationAction(ISD::UMULO,     MVT::v32i8, Custom);
1350
1351    setOperationAction(ISD::ABS,       MVT::v4i64,  Custom);
1352    setOperationAction(ISD::SMAX,      MVT::v4i64,  Custom);
1353    setOperationAction(ISD::UMAX,      MVT::v4i64,  Custom);
1354    setOperationAction(ISD::SMIN,      MVT::v4i64,  Custom);
1355    setOperationAction(ISD::UMIN,      MVT::v4i64,  Custom);
1356
1357    setOperationAction(ISD::UADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1358    setOperationAction(ISD::SADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1359    setOperationAction(ISD::USUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1360    setOperationAction(ISD::SSUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1361    setOperationAction(ISD::UADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1362    setOperationAction(ISD::SADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1363    setOperationAction(ISD::USUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1364    setOperationAction(ISD::SSUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1365    setOperationAction(ISD::UADDSAT,   MVT::v8i32, Custom);
1366    setOperationAction(ISD::USUBSAT,   MVT::v8i32, Custom);
1367    setOperationAction(ISD::UADDSAT,   MVT::v4i64, Custom);
1368    setOperationAction(ISD::USUBSAT,   MVT::v4i64, Custom);
1369
1370    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1371      setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
1372      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1373      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1374      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1375      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1376    }
1377
1378    for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1379      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1380      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1381    }
1382
1383    if (HasInt256) {
1384      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1385      // when we have a 256-bit-wide blend with immediate.
1386      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1387      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1388
1389      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1390      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1391        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1392        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
1393        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
1394        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
1395        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
1396        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
1397      }
1398    }
1399
1400    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1401                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1402      setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
1403      setOperationAction(ISD::MSTORE, VT, Legal);
1404    }
1405
1406    // Extract subvector is special because the value type
1407    // (result) is 128-bit but the source is 256-bit wide.
1408    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1409                     MVT::v4f32, MVT::v2f64 }) {
1410      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1411    }
1412
1413    // Custom lower several nodes for 256-bit types.
1414    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1415                    MVT::v8f32, MVT::v4f64 }) {
1416      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1417      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1418      setOperationAction(ISD::VSELECT,            VT, Custom);
1419      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1420      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1421      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1422      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1423      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1424      setOperationAction(ISD::STORE,              VT, Custom);
1425    }
1426
1427    if (HasInt256) {
1428      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1429
1430      // Custom legalize 2x32 to get a little better code.
1431      setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1432      setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1433
1434      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1435                       MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1436        setOperationAction(ISD::MGATHER,  VT, Custom);
1437    }
1438  }
1439
1440  // This block controls legalization of the mask vector sizes that are
1441  // available with AVX512. 512-bit vectors are in a separate block controlled
1442  // by useAVX512Regs.
1443  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
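    // vXi1 mask vectors are modeled on the AVX-512 k-registers.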
1444    addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
1445    addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1446    addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1447    addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1448    addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1449
1450    setOperationAction(ISD::SELECT,             MVT::v1i1, Custom);
1451    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1452    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
1453
1454    setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i1,  MVT::v8i32);
1455    setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i1,  MVT::v8i32);
1456    setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v4i1,  MVT::v4i32);
1457    setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v4i1,  MVT::v4i32);
1458    setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1,  MVT::v8i32);
1459    setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1,  MVT::v8i32);
1460    setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1,  MVT::v4i32);
1461    setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1,  MVT::v4i32);
1462    setOperationAction(ISD::FP_TO_SINT,                MVT::v2i1,  Custom);
1463    setOperationAction(ISD::FP_TO_UINT,                MVT::v2i1,  Custom);
1464    setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v2i1,  Custom);
1465    setOperationAction(ISD::STRICT_FP_TO_UINT,         MVT::v2i1,  Custom);
1466
1467    // There is no byte sized k-register load or store without AVX512DQ.
1468    if (!Subtarget.hasDQI()) {
1469      setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1470      setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1471      setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1472      setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1473
1474      setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1475      setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1476      setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1477      setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1478    }
1479
1480    // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1481    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1482      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1483      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1484      setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
1485    }
1486
1487    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1488      setOperationAction(ISD::VSELECT,          VT, Expand);
1489
1490    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1491      setOperationAction(ISD::SETCC,            VT, Custom);
1492      setOperationAction(ISD::STRICT_FSETCC,    VT, Custom);
1493      setOperationAction(ISD::STRICT_FSETCCS,   VT, Custom);
1494      setOperationAction(ISD::SELECT,           VT, Custom);
1495      setOperationAction(ISD::TRUNCATE,         VT, Custom);
1496
1497      setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
1498      setOperationAction(ISD::CONCAT_VECTORS,   VT, Custom);
1499      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1500      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1501      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1502      setOperationAction(ISD::VECTOR_SHUFFLE,   VT, Custom);
1503    }
1504
1505    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1506      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1507  }
1508
1509  // This block controls legalization for 512-bit operations with 32/64 bit
1510  // elements. 512-bits can be disabled based on prefer-vector-width and
1511  // required-vector-width function attributes.
1512  if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1513    bool HasBWI = Subtarget.hasBWI();
1514
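    // 512-bit vector types live in the ZMM registers.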
1515    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1516    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1517    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1518    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1519    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1520    addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1521
1522    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1523      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1524      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1525      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1526      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1527      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1528      if (HasBWI)
1529        setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1530    }
1531
1532    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1533      setOperationAction(ISD::FNEG,  VT, Custom);
1534      setOperationAction(ISD::FABS,  VT, Custom);
1535      setOperationAction(ISD::FMA,   VT, Legal);
1536      setOperationAction(ISD::STRICT_FMA, VT, Legal);
1537      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1538    }
1539
1540    for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1541      setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
1542      setOperationPromotedToType(ISD::FP_TO_UINT       , VT, MVT::v16i32);
1543      setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1544      setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1545    }
1546    setOperationAction(ISD::FP_TO_SINT,        MVT::v16i32, Legal);
1547    setOperationAction(ISD::FP_TO_UINT,        MVT::v16i32, Legal);
1548    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1549    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1550    setOperationAction(ISD::SINT_TO_FP,        MVT::v16i32, Legal);
1551    setOperationAction(ISD::UINT_TO_FP,        MVT::v16i32, Legal);
1552    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1553    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1554
1555    setOperationAction(ISD::STRICT_FADD,      MVT::v16f32, Legal);
1556    setOperationAction(ISD::STRICT_FADD,      MVT::v8f64,  Legal);
1557    setOperationAction(ISD::STRICT_FSUB,      MVT::v16f32, Legal);
1558    setOperationAction(ISD::STRICT_FSUB,      MVT::v8f64,  Legal);
1559    setOperationAction(ISD::STRICT_FMUL,      MVT::v16f32, Legal);
1560    setOperationAction(ISD::STRICT_FMUL,      MVT::v8f64,  Legal);
1561    setOperationAction(ISD::STRICT_FDIV,      MVT::v16f32, Legal);
1562    setOperationAction(ISD::STRICT_FDIV,      MVT::v8f64,  Legal);
1563    setOperationAction(ISD::STRICT_FSQRT,     MVT::v16f32, Legal);
1564    setOperationAction(ISD::STRICT_FSQRT,     MVT::v8f64,  Legal);
1565    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64,  Legal);
1566    setOperationAction(ISD::STRICT_FP_ROUND,  MVT::v8f32,  Legal);
1567
1568    setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1569    setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1570    setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1571    setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1572    setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1573    if (HasBWI)
1574      setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1575
1576    // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1577    // to 512-bit rather than use the AVX2 instructions so that we can use
1578    // k-masks.
1579    if (!Subtarget.hasVLX()) {
1580      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1581           MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1582        setOperationAction(ISD::MLOAD,  VT, Custom);
1583        setOperationAction(ISD::MSTORE, VT, Custom);
1584      }
1585    }
1586
1587    setOperationAction(ISD::TRUNCATE,    MVT::v8i32,  Legal);
1588    setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Legal);
1589    setOperationAction(ISD::TRUNCATE,    MVT::v32i8,  HasBWI ? Legal : Custom);
1590    setOperationAction(ISD::TRUNCATE,    MVT::v16i64, Custom);
1591    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1592    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1593    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
1594    setOperationAction(ISD::ANY_EXTEND,  MVT::v32i16, Custom);
1595    setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
1596    setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64,  Custom);
1597    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1598    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1599    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
1600
1601    if (HasBWI) {
1602      // Extends from v64i1 masks to 512-bit vectors.
1603      setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1604      setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1605      setOperationAction(ISD::ANY_EXTEND,         MVT::v64i8, Custom);
1606    }
1607
1608    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1609      setOperationAction(ISD::FFLOOR,            VT, Legal);
1610      setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1611      setOperationAction(ISD::FCEIL,             VT, Legal);
1612      setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1613      setOperationAction(ISD::FTRUNC,            VT, Legal);
1614      setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1615      setOperationAction(ISD::FRINT,             VT, Legal);
1616      setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1617      setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1618      setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1619      setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1620      setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1621
1622      setOperationAction(ISD::FROUND,            VT, Custom);
1623    }
1624
1625    for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1626      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1627      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1628    }
1629
1630    setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1631    setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1632    setOperationAction(ISD::ADD, MVT::v64i8,  HasBWI ? Legal : Custom);
1633    setOperationAction(ISD::SUB, MVT::v64i8,  HasBWI ? Legal : Custom);
1634
1635    setOperationAction(ISD::MUL, MVT::v8i64,  Custom);
1636    setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1637    setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1638    setOperationAction(ISD::MUL, MVT::v64i8,  Custom);
1639
1640    setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1641    setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1642    setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1643    setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1644    setOperationAction(ISD::MULHS, MVT::v64i8,  Custom);
1645    setOperationAction(ISD::MULHU, MVT::v64i8,  Custom);
1646
1647    setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1648    setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1649
1650    setOperationAction(ISD::BITREVERSE, MVT::v64i8,  Custom);
1651
1652    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1653      setOperationAction(ISD::SRL,              VT, Custom);
1654      setOperationAction(ISD::SHL,              VT, Custom);
1655      setOperationAction(ISD::SRA,              VT, Custom);
1656      setOperationAction(ISD::SETCC,            VT, Custom);
1657
1658      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1659      // setcc all the way to isel and prefer SETGT in some isel patterns.
1660      setCondCodeAction(ISD::SETLT, VT, Custom);
1661      setCondCodeAction(ISD::SETLE, VT, Custom);
1662    }
1663    for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1664      setOperationAction(ISD::SMAX,             VT, Legal);
1665      setOperationAction(ISD::UMAX,             VT, Legal);
1666      setOperationAction(ISD::SMIN,             VT, Legal);
1667      setOperationAction(ISD::UMIN,             VT, Legal);
1668      setOperationAction(ISD::ABS,              VT, Legal);
1669      setOperationAction(ISD::CTPOP,            VT, Custom);
1670      setOperationAction(ISD::ROTL,             VT, Custom);
1671      setOperationAction(ISD::ROTR,             VT, Custom);
1672      setOperationAction(ISD::STRICT_FSETCC,    VT, Custom);
1673      setOperationAction(ISD::STRICT_FSETCCS,   VT, Custom);
1674    }
1675
1676    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1677      setOperationAction(ISD::ABS,     VT, HasBWI ? Legal : Custom);
1678      setOperationAction(ISD::CTPOP,   VT, Subtarget.hasBITALG() ? Legal : Custom);
1679      setOperationAction(ISD::CTLZ,    VT, Custom);
1680      setOperationAction(ISD::SMAX,    VT, HasBWI ? Legal : Custom);
1681      setOperationAction(ISD::UMAX,    VT, HasBWI ? Legal : Custom);
1682      setOperationAction(ISD::SMIN,    VT, HasBWI ? Legal : Custom);
1683      setOperationAction(ISD::UMIN,    VT, HasBWI ? Legal : Custom);
1684      setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1685      setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1686      setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1687      setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1688    }
1689
1690    if (Subtarget.hasDQI()) {
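      // AVX512DQ adds direct 64-bit integer <-> FP conversions
      // (VCVT[U]QQ2PS/PD, VCVTT{PS,PD}2[U]QQ) and a full 64-bit element
      // multiply (VPMULLQ).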
1691      setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1692      setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1693      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1694      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1695      setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1696      setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1697      setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1698      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1699
1700      setOperationAction(ISD::MUL,        MVT::v8i64, Legal);
1701    }
1702
1703    if (Subtarget.hasCDI()) {
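      // AVX512CD provides VPLZCNTD/VPLZCNTQ for vector count-leading-zeros.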
1704      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1705      for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1706        setOperationAction(ISD::CTLZ,            VT, Legal);
1707      }
1708    } // Subtarget.hasCDI()
1709
1710    if (Subtarget.hasVPOPCNTDQ()) {
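      // VPOPCNTDQ provides native population count for 32/64-bit elements
      // (VPOPCNTD/VPOPCNTQ).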
1711      for (auto VT : { MVT::v16i32, MVT::v8i64 })
1712        setOperationAction(ISD::CTPOP, VT, Legal);
1713    }
1714
1715    // Extract subvector is special because the value type
1716    // (result) is 256-bit but the source is 512-bit wide.
1717    // 128-bit was made Legal under AVX1.
1718    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1719                     MVT::v8f32, MVT::v4f64 })
1720      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1721
1722    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1723                     MVT::v16f32, MVT::v8f64 }) {
1724      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1725      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1726      setOperationAction(ISD::SELECT,             VT, Custom);
1727      setOperationAction(ISD::VSELECT,            VT, Custom);
1728      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1729      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1730      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1731      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1732      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1733    }
1734
1735    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1736      setOperationAction(ISD::MLOAD,               VT, Legal);
1737      setOperationAction(ISD::MSTORE,              VT, Legal);
1738      setOperationAction(ISD::MGATHER,             VT, Custom);
1739      setOperationAction(ISD::MSCATTER,            VT, Custom);
1740    }
1741    if (HasBWI) {
1742      for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1743        setOperationAction(ISD::MLOAD,        VT, Legal);
1744        setOperationAction(ISD::MSTORE,       VT, Legal);
1745      }
1746    } else {
1747      setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1748      setOperationAction(ISD::STORE, MVT::v64i8,  Custom);
1749    }
1750
1751    if (Subtarget.hasVBMI2()) {
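      // AVX512VBMI2 adds the VPSHLD/VPSHRD concatenate-and-shift family, which
      // we use for funnel shifts and the remaining 16-bit rotates.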
1752      for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1753                       MVT::v16i16, MVT::v8i32, MVT::v4i64,
1754                       MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1755        setOperationAction(ISD::FSHL, VT, Custom);
1756        setOperationAction(ISD::FSHR, VT, Custom);
1757      }
1758
1759      setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1760      setOperationAction(ISD::ROTR, MVT::v8i16,  Custom);
1761      setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1762      setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1763    }
1764  }// useAVX512Regs
1765
1766  // This block controls legalization for operations that don't have
1767  // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1768  // narrower widths.
1769  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1770    // These operations are handled on non-VLX by artificially widening in
1771    // isel patterns.
1772
1773    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1774                       Subtarget.hasVLX() ? Legal : Custom);
1775    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1776                       Subtarget.hasVLX() ? Legal : Custom);
1777    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
1778    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1779                       Subtarget.hasVLX() ? Legal : Custom);
1780    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1781                       Subtarget.hasVLX() ? Legal : Custom);
1782    setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v2i32, Custom);
1783    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1784                       Subtarget.hasVLX() ? Legal : Custom);
1785    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1786                       Subtarget.hasVLX() ? Legal : Custom);
1787    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1788                       Subtarget.hasVLX() ? Legal : Custom);
1789    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1790                       Subtarget.hasVLX() ? Legal : Custom);
1791
1792    if (Subtarget.hasDQI()) {
1793      // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1794      // v2f32 UINT_TO_FP is already custom under SSE2.
1795      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1796             isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1797             "Unexpected operation action!");
1798      // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1799      setOperationAction(ISD::FP_TO_SINT,        MVT::v2f32, Custom);
1800      setOperationAction(ISD::FP_TO_UINT,        MVT::v2f32, Custom);
1801      setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1802      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1803    }
1804
1805    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1806      setOperationAction(ISD::SMAX, VT, Legal);
1807      setOperationAction(ISD::UMAX, VT, Legal);
1808      setOperationAction(ISD::SMIN, VT, Legal);
1809      setOperationAction(ISD::UMIN, VT, Legal);
1810      setOperationAction(ISD::ABS,  VT, Legal);
1811    }
1812
1813    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1814      setOperationAction(ISD::ROTL,     VT, Custom);
1815      setOperationAction(ISD::ROTR,     VT, Custom);
1816    }
1817
1818    // Custom legalize 2x32 to get a little better code.
1819    setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1820    setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1821
1822    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1823                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1824      setOperationAction(ISD::MSCATTER, VT, Custom);
1825
1826    if (Subtarget.hasDQI()) {
1827      for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1828        setOperationAction(ISD::SINT_TO_FP, VT,
1829                           Subtarget.hasVLX() ? Legal : Custom);
1830        setOperationAction(ISD::UINT_TO_FP, VT,
1831                           Subtarget.hasVLX() ? Legal : Custom);
1832        setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1833                           Subtarget.hasVLX() ? Legal : Custom);
1834        setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1835                           Subtarget.hasVLX() ? Legal : Custom);
1836        setOperationAction(ISD::FP_TO_SINT, VT,
1837                           Subtarget.hasVLX() ? Legal : Custom);
1838        setOperationAction(ISD::FP_TO_UINT, VT,
1839                           Subtarget.hasVLX() ? Legal : Custom);
1840        setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1841                           Subtarget.hasVLX() ? Legal : Custom);
1842        setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1843                           Subtarget.hasVLX() ? Legal : Custom);
1844        setOperationAction(ISD::MUL,               VT, Legal);
1845      }
1846    }
1847
1848    if (Subtarget.hasCDI()) {
1849      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1850        setOperationAction(ISD::CTLZ,            VT, Legal);
1851      }
1852    } // Subtarget.hasCDI()
1853
1854    if (Subtarget.hasVPOPCNTDQ()) {
1855      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1856        setOperationAction(ISD::CTPOP, VT, Legal);
1857    }
1858  }
1859
1860  // This block controls legalization of v32i1/v64i1, which are available with
1861  // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1862  // useBWIRegs.
1863  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1864    addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1865    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1866
1867    for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1868      setOperationAction(ISD::VSELECT,            VT, Expand);
1869      setOperationAction(ISD::TRUNCATE,           VT, Custom);
1870      setOperationAction(ISD::SETCC,              VT, Custom);
1871      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1872      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1873      setOperationAction(ISD::SELECT,             VT, Custom);
1874      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1875      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1876      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1877      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1878    }
1879
1880    for (auto VT : { MVT::v16i1, MVT::v32i1 })
1881      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1882
1883    // Extends from v32i1 masks to 256-bit vectors.
1884    setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1885    setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1886    setOperationAction(ISD::ANY_EXTEND,         MVT::v32i8, Custom);
1887
1888    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1889      setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
1890      setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1891    }
1892
1893    // These operations are handled on non-VLX by artificially widening in
1894    // isel patterns.
1895    // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1896
1897    if (Subtarget.hasBITALG()) {
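      // BITALG provides VPOPCNTB/VPOPCNTW for byte/word population counts.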
1898      for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1899        setOperationAction(ISD::CTPOP, VT, Legal);
1900    }
1901  }
1902
1903  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1904    setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
1905    setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1906    setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1907    setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
1908    setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1909
1910    setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
1911    setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1912    setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1913    setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
1914    setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1915
1916    if (Subtarget.hasBWI()) {
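      // BWI adds the word->byte truncating stores (VPMOVWB).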
1917      setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
1918      setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
1919    }
1920
1921    setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1922    setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1923    setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1924  }
1925
1926  if (Subtarget.hasAMXTILE()) {
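    // AMX tile data uses the x86amx type, backed by the TMM tile registers.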
1927    addRegisterClass(MVT::x86amx, &X86::TILERegClass);
1928  }
1929
1930  // We want to custom lower some of our intrinsics.
1931  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1932  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1933  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1934  if (!Subtarget.is64Bit()) {
1935    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1936  }
1937
1938  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1939  // handle type legalization for these operations here.
1940  //
1941  // FIXME: We really should do custom legalization for addition and
1942  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1943  // than generic legalization for 64-bit multiplication-with-overflow, though.
1944  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1945    if (VT == MVT::i64 && !Subtarget.is64Bit())
1946      continue;
1947    // Add/Sub/Mul with overflow operations are custom lowered.
1948    setOperationAction(ISD::SADDO, VT, Custom);
1949    setOperationAction(ISD::UADDO, VT, Custom);
1950    setOperationAction(ISD::SSUBO, VT, Custom);
1951    setOperationAction(ISD::USUBO, VT, Custom);
1952    setOperationAction(ISD::SMULO, VT, Custom);
1953    setOperationAction(ISD::UMULO, VT, Custom);
1954
1955    // Support carry in as value rather than glue.
1956    setOperationAction(ISD::ADDCARRY, VT, Custom);
1957    setOperationAction(ISD::SUBCARRY, VT, Custom);
1958    setOperationAction(ISD::SETCCCARRY, VT, Custom);
1959    setOperationAction(ISD::SADDO_CARRY, VT, Custom);
1960    setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
1961  }
1962
1963  if (!Subtarget.is64Bit()) {
1964    // These libcalls are not available in 32-bit.
1965    setLibcallName(RTLIB::SHL_I128, nullptr);
1966    setLibcallName(RTLIB::SRL_I128, nullptr);
1967    setLibcallName(RTLIB::SRA_I128, nullptr);
1968    setLibcallName(RTLIB::MUL_I128, nullptr);
1969  }
1970
1971  // Combine sin / cos into _sincos_stret if it is available.
1972  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1973      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1974    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1975    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1976  }
1977
1978  if (Subtarget.isTargetWin64()) {
1979    setOperationAction(ISD::SDIV, MVT::i128, Custom);
1980    setOperationAction(ISD::UDIV, MVT::i128, Custom);
1981    setOperationAction(ISD::SREM, MVT::i128, Custom);
1982    setOperationAction(ISD::UREM, MVT::i128, Custom);
1983  }
1984
1985  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1986  // is. We should promote the value to 64-bits to solve this.
1987  // This is what the CRT headers do - `fmodf` is an inline header
1988  // function casting to f64 and calling `fmod`.
1989  if (Subtarget.is32Bit() &&
1990      (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
1991    for (ISD::NodeType Op :
1992         {ISD::FCEIL,  ISD::STRICT_FCEIL,
1993          ISD::FCOS,   ISD::STRICT_FCOS,
1994          ISD::FEXP,   ISD::STRICT_FEXP,
1995          ISD::FFLOOR, ISD::STRICT_FFLOOR,
1996          ISD::FREM,   ISD::STRICT_FREM,
1997          ISD::FLOG,   ISD::STRICT_FLOG,
1998          ISD::FLOG10, ISD::STRICT_FLOG10,
1999          ISD::FPOW,   ISD::STRICT_FPOW,
2000          ISD::FSIN,   ISD::STRICT_FSIN})
2001      if (isOperationExpand(Op, MVT::f32))
2002        setOperationAction(Op, MVT::f32, Promote);
2003
2004  // We have target-specific dag combine patterns for the following nodes:
2005  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2006  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2007  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
2008  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2009  setTargetDAGCombine(ISD::CONCAT_VECTORS);
2010  setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2011  setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2012  setTargetDAGCombine(ISD::BITCAST);
2013  setTargetDAGCombine(ISD::VSELECT);
2014  setTargetDAGCombine(ISD::SELECT);
2015  setTargetDAGCombine(ISD::SHL);
2016  setTargetDAGCombine(ISD::SRA);
2017  setTargetDAGCombine(ISD::SRL);
2018  setTargetDAGCombine(ISD::OR);
2019  setTargetDAGCombine(ISD::AND);
2020  setTargetDAGCombine(ISD::ADD);
2021  setTargetDAGCombine(ISD::FADD);
2022  setTargetDAGCombine(ISD::FSUB);
2023  setTargetDAGCombine(ISD::FNEG);
2024  setTargetDAGCombine(ISD::FMA);
2025  setTargetDAGCombine(ISD::STRICT_FMA);
2026  setTargetDAGCombine(ISD::FMINNUM);
2027  setTargetDAGCombine(ISD::FMAXNUM);
2028  setTargetDAGCombine(ISD::SUB);
2029  setTargetDAGCombine(ISD::LOAD);
2030  setTargetDAGCombine(ISD::MLOAD);
2031  setTargetDAGCombine(ISD::STORE);
2032  setTargetDAGCombine(ISD::MSTORE);
2033  setTargetDAGCombine(ISD::TRUNCATE);
2034  setTargetDAGCombine(ISD::ZERO_EXTEND);
2035  setTargetDAGCombine(ISD::ANY_EXTEND);
2036  setTargetDAGCombine(ISD::SIGN_EXTEND);
2037  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2038  setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2039  setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2040  setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2041  setTargetDAGCombine(ISD::SINT_TO_FP);
2042  setTargetDAGCombine(ISD::UINT_TO_FP);
2043  setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2044  setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2045  setTargetDAGCombine(ISD::SETCC);
2046  setTargetDAGCombine(ISD::MUL);
2047  setTargetDAGCombine(ISD::XOR);
2048  setTargetDAGCombine(ISD::MSCATTER);
2049  setTargetDAGCombine(ISD::MGATHER);
2050  setTargetDAGCombine(ISD::FP16_TO_FP);
2051  setTargetDAGCombine(ISD::FP_EXTEND);
2052  setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2053  setTargetDAGCombine(ISD::FP_ROUND);
2054
2055  computeRegisterProperties(Subtarget.getRegisterInfo());
2056
2057  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2058  MaxStoresPerMemsetOptSize = 8;
2059  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2060  MaxStoresPerMemcpyOptSize = 4;
2061  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2062  MaxStoresPerMemmoveOptSize = 4;
2063
2064  // TODO: These control memcmp expansion in CGP and could be raised higher, but
2065  // that needs to be benchmarked and balanced with the potential use of vector
2066  // load/store types (PR33329, PR33914).
2067  MaxLoadsPerMemcmp = 2;
2068  MaxLoadsPerMemcmpOptSize = 2;
2069
2070  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
2071  setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
2072
2073  // An out-of-order CPU can speculatively execute past a predictable branch,
2074  // but a conditional move could be stalled by an expensive earlier operation.
2075  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2076  EnableExtLdPromotion = true;
2077  setPrefFunctionAlignment(Align(16));
2078
2079  verifyIntrinsicTables();
2080
2081  // Default to having -disable-strictnode-mutation on
2082  IsStrictFPEnabled = true;
2083}
2084
2085// This has so far only been implemented for 64-bit MachO.
2086bool X86TargetLowering::useLoadStackGuardNode() const {
2087  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2088}
2089
2090bool X86TargetLowering::useStackGuardXorFP() const {
2091  // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2092  return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2093}
2094
2095SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2096                                               const SDLoc &DL) const {
2097  EVT PtrTy = getPointerTy(DAG.getDataLayout());
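  // XOR32_FP/XOR64_FP are pseudo-instructions that fold the frame pointer
  // into the value (see useStackGuardXorFP above).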
2098  unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2099  MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2100  return SDValue(Node, 0);
2101}
2102
2103TargetLoweringBase::LegalizeTypeAction
2104X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2105  if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2106      !Subtarget.hasBWI())
2107    return TypeSplitVector;
2108
2109  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2110      VT.getVectorElementType() != MVT::i1)
2111    return TypeWidenVector;
2112
2113  return TargetLoweringBase::getPreferredVectorAction(VT);
2114}
2115
2116static std::pair<MVT, unsigned>
2117handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2118                                 const X86Subtarget &Subtarget) {
2119  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2120  // convention is one that uses k registers.
2121  if (NumElts == 2)
2122    return {MVT::v2i64, 1};
2123  if (NumElts == 4)
2124    return {MVT::v4i32, 1};
2125  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2126      CC != CallingConv::Intel_OCL_BI)
2127    return {MVT::v8i16, 1};
2128  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2129      CC != CallingConv::Intel_OCL_BI)
2130    return {MVT::v16i8, 1};
2131  // v32i1 passes in ymm unless we have BWI and the calling convention is
2132  // regcall.
2133  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2134    return {MVT::v32i8, 1};
2135  // Split v64i1 vectors if we don't have v64i8 available.
2136  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2137    if (Subtarget.useAVX512Regs())
2138      return {MVT::v64i8, 1};
2139    return {MVT::v32i8, 2};
2140  }
2141
2142  // Break wide or odd vXi1 vectors into scalars to match AVX2 behavior.
2143  if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2144      NumElts > 64)
2145    return {MVT::i8, NumElts};
2146
2147  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2148}
2149
2150MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2151                                                     CallingConv::ID CC,
2152                                                     EVT VT) const {
2153  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2154      Subtarget.hasAVX512()) {
2155    unsigned NumElts = VT.getVectorNumElements();
2156
2157    MVT RegisterVT;
2158    unsigned NumRegisters;
2159    std::tie(RegisterVT, NumRegisters) =
2160        handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2161    if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2162      return RegisterVT;
2163  }
2164
2165  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2166}
2167
2168unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2169                                                          CallingConv::ID CC,
2170                                                          EVT VT) const {
2171  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2172      Subtarget.hasAVX512()) {
2173    unsigned NumElts = VT.getVectorNumElements();
2174
2175    MVT RegisterVT;
2176    unsigned NumRegisters;
2177    std::tie(RegisterVT, NumRegisters) =
2178        handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2179    if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2180      return NumRegisters;
2181  }
2182
2183  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2184}
2185
2186unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2187    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2188    unsigned &NumIntermediates, MVT &RegisterVT) const {
2189  // Break wide or odd vXi1 vectors into scalars to match AVX2 behavior.
2190  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2191      Subtarget.hasAVX512() &&
2192      (!isPowerOf2_32(VT.getVectorNumElements()) ||
2193       (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2194       VT.getVectorNumElements() > 64)) {
2195    RegisterVT = MVT::i8;
2196    IntermediateVT = MVT::i1;
2197    NumIntermediates = VT.getVectorNumElements();
2198    return NumIntermediates;
2199  }
2200
2201  // Split v64i1 vectors if we don't have v64i8 available.
2202  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2203      CC != CallingConv::X86_RegCall) {
2204    RegisterVT = MVT::v32i8;
2205    IntermediateVT = MVT::v32i1;
2206    NumIntermediates = 2;
2207    return 2;
2208  }
2209
2210  return TargetLowering::getVectorTypeBreakdownForCallingConv(
2211      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
2212}
2213
2214EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2215                                          LLVMContext& Context,
2216                                          EVT VT) const {
2217  if (!VT.isVector())
2218    return MVT::i8;
2219
2220  if (Subtarget.hasAVX512()) {
2221    // Figure out what this type will be legalized to.
2222    EVT LegalVT = VT;
2223    while (getTypeAction(Context, LegalVT) != TypeLegal)
2224      LegalVT = getTypeToTransformTo(Context, LegalVT);
2225
2226    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2227    if (LegalVT.getSimpleVT().is512BitVector())
2228      return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2229
2230    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2231      // If we legalized to less than a 512-bit vector, then we will use a vXi1
2232      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2233      // vXi16/vXi8.
2234      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2235      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2236        return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2237    }
2238  }
2239
2240  return VT.changeVectorElementTypeToInteger();
2241}
2242
2243/// Helper for getByValTypeAlignment to determine
2244/// the desired ByVal argument alignment.
2245static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2246  if (MaxAlign == 16)
2247    return;
2248  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2249    if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2250      MaxAlign = Align(16);
2251  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2252    Align EltAlign;
2253    getMaxByValAlign(ATy->getElementType(), EltAlign);
2254    if (EltAlign > MaxAlign)
2255      MaxAlign = EltAlign;
2256  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2257    for (auto *EltTy : STy->elements()) {
2258      Align EltAlign;
2259      getMaxByValAlign(EltTy, EltAlign);
2260      if (EltAlign > MaxAlign)
2261        MaxAlign = EltAlign;
2262      if (MaxAlign == 16)
2263        break;
2264    }
2265  }
2266}
2267
2268/// Return the desired alignment for ByVal aggregate
2269/// function arguments in the caller parameter area. For X86, aggregates
2270/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2271/// are at 4-byte boundaries.
2272unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2273                                                  const DataLayout &DL) const {
2274  if (Subtarget.is64Bit()) {
2275    // Max of 8 and alignment of type.
2276    Align TyAlign = DL.getABITypeAlign(Ty);
2277    if (TyAlign > 8)
2278      return TyAlign.value();
2279    return 8;
2280  }
2281
2282  Align Alignment(4);
2283  if (Subtarget.hasSSE1())
2284    getMaxByValAlign(Ty, Alignment);
2285  return Alignment.value();
2286}
2287
2288/// It returns EVT::Other if the type should be determined using generic
2289/// target-independent logic.
2290/// For vector ops we check that the overall size isn't larger than our
2291/// preferred vector width.
2292EVT X86TargetLowering::getOptimalMemOpType(
2293    const MemOp &Op, const AttributeList &FuncAttributes) const {
2294  if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
2295    if (Op.size() >= 16 &&
2296        (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2297      // FIXME: Check if unaligned 64-byte accesses are slow.
2298      if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2299          (Subtarget.getPreferVectorWidth() >= 512)) {
2300        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2301      }
2302      // FIXME: Check if unaligned 32-byte accesses are slow.
2303      if (Op.size() >= 32 && Subtarget.hasAVX() &&
2304          (Subtarget.getPreferVectorWidth() >= 256)) {
2305        // Although this isn't a well-supported type for AVX1, we'll let
2306        // legalization and shuffle lowering produce the optimal codegen. If we
2307        // choose an optimal type with a vector element larger than a byte,
2308        // getMemsetStores() may create an intermediate splat (using an integer
2309        // multiply) before we splat as a vector.
2310        return MVT::v32i8;
2311      }
2312      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2313        return MVT::v16i8;
2314      // TODO: Can SSE1 handle a byte vector?
2315      // If we have SSE1 registers we should be able to use them.
2316      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2317          (Subtarget.getPreferVectorWidth() >= 128))
2318        return MVT::v4f32;
2319    } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2320               Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2321      // Do not use f64 to lower memcpy if source is string constant. It's
2322      // better to use i32 to avoid the loads.
2323      // Also, do not use f64 to lower memset unless this is a memset of zeros.
2324      // The gymnastics of splatting a byte value into an XMM register and then
2325      // only using 8-byte stores (because this is a CPU with slow unaligned
2326      // 16-byte accesses) makes that a loser.
2327      return MVT::f64;
2328    }
2329  }
2330  // This is a compromise. If we reach here, unaligned accesses may be slow on
2331  // this target. However, creating smaller, aligned accesses could be even
2332  // slower and would certainly be a lot more code.
2333  if (Subtarget.is64Bit() && Op.size() >= 8)
2334    return MVT::i64;
2335  return MVT::i32;
2336}
2337
2338bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2339  if (VT == MVT::f32)
2340    return X86ScalarSSEf32;
2341  if (VT == MVT::f64)
2342    return X86ScalarSSEf64;
2343  return true;
2344}
2345
2346bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2347    EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2348    bool *Fast) const {
2349  if (Fast) {
2350    switch (VT.getSizeInBits()) {
2351    default:
2352      // 8-byte and under are always assumed to be fast.
2353      *Fast = true;
2354      break;
2355    case 128:
2356      *Fast = !Subtarget.isUnalignedMem16Slow();
2357      break;
2358    case 256:
2359      *Fast = !Subtarget.isUnalignedMem32Slow();
2360      break;
2361    // TODO: What about AVX-512 (512-bit) accesses?
2362    }
2363  }
2364  // NonTemporal vector memory ops must be aligned.
2365  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if it's less aligned than the
    // minimum vector size (which we can split the vector down to), we might as
    // well use a regular unaligned vector load.
    // We don't have any NT loads pre-SSE41.
2370    if (!!(Flags & MachineMemOperand::MOLoad))
2371      return (Alignment < 16 || !Subtarget.hasSSE41());
2372    return false;
2373  }
2374  // Misaligned accesses of any size are always allowed.
2375  return true;
2376}
2377
2378/// Return the entry encoding for a jump table in the
2379/// current function.  The returned value is a member of the
2380/// MachineJumpTableInfo::JTEntryKind enum.
2381unsigned X86TargetLowering::getJumpTableEncoding() const {
2382  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2383  // symbol.
2384  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2385    return MachineJumpTableInfo::EK_Custom32;
2386
2387  // Otherwise, use the normal jump table encoding heuristics.
2388  return TargetLowering::getJumpTableEncoding();
2389}
2390
2391bool X86TargetLowering::useSoftFloat() const {
2392  return Subtarget.useSoftFloat();
2393}
2394
2395void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2396                                              ArgListTy &Args) const {
2397
2398  // Only relabel X86-32 for C / Stdcall CCs.
2399  if (Subtarget.is64Bit())
2400    return;
2401  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2402    return;
2403  unsigned ParamRegs = 0;
2404  if (auto *M = MF->getFunction().getParent())
2405    ParamRegs = M->getNumberRegisterParameters();
2406
  // Mark the first N integer arguments as being passed in registers.
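  // For example, with 3 register parameters available, an i32 argument uses
  // one of them and an i64 argument uses two; once the budget runs out, the
  // remaining arguments are left on the stack.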
2408  for (auto &Arg : Args) {
2409    Type *T = Arg.Ty;
2410    if (T->isIntOrPtrTy())
2411      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2412        unsigned numRegs = 1;
2413        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2414          numRegs = 2;
2415        if (ParamRegs < numRegs)
2416          return;
2417        ParamRegs -= numRegs;
2418        Arg.IsInReg = true;
2419      }
2420  }
2421}
2422
2423const MCExpr *
2424X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2425                                             const MachineBasicBlock *MBB,
                                             unsigned uid,
                                             MCContext &Ctx) const {
2427  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2428  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2429  // entries.
2430  return MCSymbolRefExpr::create(MBB->getSymbol(),
2431                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2432}
2433
2434/// Returns relocation base for the given PIC jumptable.
2435SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2436                                                    SelectionDAG &DAG) const {
2437  if (!Subtarget.is64Bit())
    // This node doesn't have an SDLoc associated with it, but it is not
    // really the same as a Register.
2440    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2441                       getPointerTy(DAG.getDataLayout()));
2442  return Table;
2443}
2444
2445/// This returns the relocation base for the given PIC jumptable,
2446/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2447const MCExpr *X86TargetLowering::
2448getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2449                             MCContext &Ctx) const {
2450  // X86-64 uses RIP relative addressing based on the jump table label.
2451  if (Subtarget.isPICStyleRIPRel())
2452    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2453
2454  // Otherwise, the reference is relative to the PIC base.
2455  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2456}
2457
2458std::pair<const TargetRegisterClass *, uint8_t>
2459X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2460                                           MVT VT) const {
2461  const TargetRegisterClass *RRC = nullptr;
2462  uint8_t Cost = 1;
2463  switch (VT.SimpleTy) {
2464  default:
2465    return TargetLowering::findRepresentativeClass(TRI, VT);
2466  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2467    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2468    break;
2469  case MVT::x86mmx:
2470    RRC = &X86::VR64RegClass;
2471    break;
2472  case MVT::f32: case MVT::f64:
2473  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2474  case MVT::v4f32: case MVT::v2f64:
2475  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2476  case MVT::v8f32: case MVT::v4f64:
2477  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2478  case MVT::v16f32: case MVT::v8f64:
2479    RRC = &X86::VR128XRegClass;
2480    break;
2481  }
2482  return std::make_pair(RRC, Cost);
2483}
2484
2485unsigned X86TargetLowering::getAddressSpace() const {
2486  if (Subtarget.is64Bit())
2487    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2488  return 256;
2489}
2490
2491static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2492  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2493         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2494}
2495
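/// Build a constant pointer into a segment-based address space (on x86,
/// address space 256 is %gs and 257 is %fs) at the given byte offset.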
2496static Constant* SegmentOffset(IRBuilder<> &IRB,
2497                               int Offset, unsigned AddressSpace) {
2498  return ConstantExpr::getIntToPtr(
2499      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2500      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2501}
2502
2503Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2504  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2505  // tcbhead_t; use it instead of the usual global variable (see
2506  // sysdeps/{i386,x86_64}/nptl/tls.h)
2507  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2508    if (Subtarget.isTargetFuchsia()) {
2509      // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2510      return SegmentOffset(IRB, 0x10, getAddressSpace());
2511    } else {
2512      unsigned AddressSpace = getAddressSpace();
2513      Module *M = IRB.GetInsertBlock()->getParent()->getParent();
      // Some users may customize the guard segment register and offset.
2515      int Offset = M->getStackProtectorGuardOffset();
      // If no -stack-protector-guard-offset value was specified, the default
      // is %fs:0x28, unless we're using a kernel code model, in which case
      // it's %gs:0x28; %gs:0x14 on i386.
2519      if (Offset == INT_MAX)
2520        Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2521
2522      StringRef GuardReg = M->getStackProtectorGuardReg();
2523      if (GuardReg == "fs")
2524        AddressSpace = X86AS::FS;
2525      else if (GuardReg == "gs")
2526        AddressSpace = X86AS::GS;
2527      return SegmentOffset(IRB, Offset, AddressSpace);
2528    }
2529  }
2530  return TargetLowering::getIRStackGuard(IRB);
2531}
2532
2533void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // The MSVC CRT provides functionality for stack protection.
2535  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2536      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2537    // MSVC CRT has a global variable holding security cookie.
2538    M.getOrInsertGlobal("__security_cookie",
2539                        Type::getInt8PtrTy(M.getContext()));
2540
2541    // MSVC CRT has a function to validate security cookie.
2542    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2543        "__security_check_cookie", Type::getVoidTy(M.getContext()),
2544        Type::getInt8PtrTy(M.getContext()));
2545    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2546      F->setCallingConv(CallingConv::X86_FastCall);
2547      F->addAttribute(1, Attribute::AttrKind::InReg);
2548    }
2549    return;
2550  }
2551
2552  StringRef GuardMode = M.getStackProtectorGuard();
2553
2554  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2555  if ((GuardMode == "tls" || GuardMode.empty()) &&
2556      hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2557    return;
2558  TargetLowering::insertSSPDeclarations(M);
2559}
2560
2561Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2562  // MSVC CRT has a global variable holding security cookie.
2563  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2564      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2565    return M.getGlobalVariable("__security_cookie");
2566  }
2567  return TargetLowering::getSDagStackGuard(M);
2568}
2569
2570Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2571  // MSVC CRT has a function to validate security cookie.
2572  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2573      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2574    return M.getFunction("__security_check_cookie");
2575  }
2576  return TargetLowering::getSSPStackGuardCheck(M);
2577}
2578
2579Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2580  if (Subtarget.getTargetTriple().isOSContiki())
2581    return getDefaultSafeStackPointerLocation(IRB, false);
2582
2583  // Android provides a fixed TLS slot for the SafeStack pointer. See the
2584  // definition of TLS_SLOT_SAFESTACK in
2585  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2586  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a kernel code model, in which case it's
    // %gs:0x48; %gs:0x24 on i386.
2589    int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2590    return SegmentOffset(IRB, Offset, getAddressSpace());
2591  }
2592
2593  // Fuchsia is similar.
2594  if (Subtarget.isTargetFuchsia()) {
2595    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2596    return SegmentOffset(IRB, 0x18, getAddressSpace());
2597  }
2598
2599  return TargetLowering::getSafeStackPointerLocation(IRB);
2600}
2601
2602//===----------------------------------------------------------------------===//
2603//               Return Value Calling Convention Implementation
2604//===----------------------------------------------------------------------===//
2605
2606bool X86TargetLowering::CanLowerReturn(
2607    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2608    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2609  SmallVector<CCValAssign, 16> RVLocs;
2610  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2611  return CCInfo.CheckReturn(Outs, RetCC_X86);
2612}
2613
2614const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2615  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2616  return ScratchRegs;
2617}
2618
/// Lowers mask values (v*i1) to the local register values.
/// \returns DAG node after lowering to register type.
2621static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2622                               const SDLoc &Dl, SelectionDAG &DAG) {
2623  EVT ValVT = ValArg.getValueType();
2624
2625  if (ValVT == MVT::v1i1)
2626    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2627                       DAG.getIntPtrConstant(0, Dl));
2628
2629  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2630      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2631    // Two stage lowering might be required
2632    // bitcast:   v8i1 -> i8 / v16i1 -> i16
2633    // anyextend: i8   -> i32 / i16   -> i32
2634    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2635    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2636    if (ValLoc == MVT::i32)
2637      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2638    return ValToCopy;
2639  }
2640
2641  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2642      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2643    // One stage lowering is required
2644    // bitcast:   v32i1 -> i32 / v64i1 -> i64
2645    return DAG.getBitcast(ValLoc, ValArg);
2646  }
2647
2648  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2649}
2650
/// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
2652static void Passv64i1ArgInRegs(
2653    const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2654    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2655    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2656  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2657  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2658  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2659  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2660         "The value should reside in two registers");
2661
2662  // Before splitting the value we cast it to i64
2663  Arg = DAG.getBitcast(MVT::i64, Arg);
2664
2665  // Splitting the value into two i32 types
2666  SDValue Lo, Hi;
2667  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2668                   DAG.getConstant(0, Dl, MVT::i32));
2669  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2670                   DAG.getConstant(1, Dl, MVT::i32));
2671
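  // Lo now holds bits [31:0] of the original mask value and Hi holds bits
  // [63:32].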
  // Pass the two i32 halves in their corresponding registers.
2673  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2674  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2675}
2676
2677SDValue
2678X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2679                               bool isVarArg,
2680                               const SmallVectorImpl<ISD::OutputArg> &Outs,
2681                               const SmallVectorImpl<SDValue> &OutVals,
2682                               const SDLoc &dl, SelectionDAG &DAG) const {
2683  MachineFunction &MF = DAG.getMachineFunction();
2684  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2685
2686  // In some cases we need to disable registers from the default CSR list.
2687  // For example, when they are used for argument passing.
2688  bool ShouldDisableCalleeSavedRegister =
2689      CallConv == CallingConv::X86_RegCall ||
2690      MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2691
2692  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2693    report_fatal_error("X86 interrupts may not return any value");
2694
2695  SmallVector<CCValAssign, 16> RVLocs;
2696  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2697  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2698
2699  SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2700  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2701       ++I, ++OutsIndex) {
2702    CCValAssign &VA = RVLocs[I];
2703    assert(VA.isRegLoc() && "Can only return in registers!");
2704
2705    // Add the register to the CalleeSaveDisableRegs list.
2706    if (ShouldDisableCalleeSavedRegister)
2707      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2708
2709    SDValue ValToCopy = OutVals[OutsIndex];
2710    EVT ValVT = ValToCopy.getValueType();
2711
2712    // Promote values to the appropriate types.
2713    if (VA.getLocInfo() == CCValAssign::SExt)
2714      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2715    else if (VA.getLocInfo() == CCValAssign::ZExt)
2716      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2717    else if (VA.getLocInfo() == CCValAssign::AExt) {
2718      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2719        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2720      else
2721        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2722    }
2723    else if (VA.getLocInfo() == CCValAssign::BCvt)
2724      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2725
2726    assert(VA.getLocInfo() != CCValAssign::FPExt &&
2727           "Unexpected FP-extend for return value.");
2728
2729    // Report an error if we have attempted to return a value via an XMM
2730    // register and SSE was disabled.
2731    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2732      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2733      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2734    } else if (!Subtarget.hasSSE2() &&
2735               X86::FR64XRegClass.contains(VA.getLocReg()) &&
2736               ValVT == MVT::f64) {
2737      // When returning a double via an XMM register, report an error if SSE2 is
2738      // not enabled.
2739      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2740      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2741    }
2742
2743    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2744    // the RET instruction and handled by the FP Stackifier.
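    // For example, an f32 or f64 return value computed in an SSE register is
    // extended to f80 below so that it can live on the x87 stack for the RET.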
2745    if (VA.getLocReg() == X86::FP0 ||
2746        VA.getLocReg() == X86::FP1) {
2747      // If this is a copy from an xmm register to ST(0), use an FPExtend to
2748      // change the value to the FP stack register class.
2749      if (isScalarFPTypeInSSEReg(VA.getValVT()))
2750        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2751      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2752      // Don't emit a copytoreg.
2753      continue;
2754    }
2755
2756    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2757    // which is returned in RAX / RDX.
2758    if (Subtarget.is64Bit()) {
2759      if (ValVT == MVT::x86mmx) {
2760        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2761          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2762          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2763                                  ValToCopy);
2764          // If we don't have SSE2 available, convert to v4f32 so the generated
2765          // register is legal.
2766          if (!Subtarget.hasSSE2())
2767            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2768        }
2769      }
2770    }
2771
2772    if (VA.needsCustom()) {
2773      assert(VA.getValVT() == MVT::v64i1 &&
2774             "Currently the only custom case is when we split v64i1 to 2 regs");
2775
2776      Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
2777                         Subtarget);
2778
2779      // Add the second register to the CalleeSaveDisableRegs list.
2780      if (ShouldDisableCalleeSavedRegister)
2781        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2782    } else {
2783      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2784    }
2785  }
2786
2787  SDValue Flag;
2788  SmallVector<SDValue, 6> RetOps;
2789  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2790  // Operand #1 = Bytes To Pop
2791  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2792                   MVT::i32));
2793
2794  // Copy the result values into the output registers.
2795  for (auto &RetVal : RetVals) {
2796    if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
2797      RetOps.push_back(RetVal.second);
2798      continue; // Don't emit a copytoreg.
2799    }
2800
2801    Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
2802    Flag = Chain.getValue(1);
2803    RetOps.push_back(
2804        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
2805  }
2806
2807  // Swift calling convention does not require we copy the sret argument
2808  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2809
2810  // All x86 ABIs require that for returning structs by value we copy
2811  // the sret argument into %rax/%eax (depending on ABI) for the return.
2812  // We saved the argument into a virtual register in the entry block,
2813  // so now we copy the value out and into %rax/%eax.
2814  //
2815  // Checking Function.hasStructRetAttr() here is insufficient because the IR
2816  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2817  // false, then an sret argument may be implicitly inserted in the SelDAG. In
2818  // either case FuncInfo->setSRetReturnReg() will have been called.
2819  if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
2820    // When we have both sret and another return value, we should use the
2821    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals Chain.
2823
2824    // For the case of sret and another return value, we have
2825    //   Chain_0 at the function entry
2826    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
2827    // If we use Chain_1 in getCopyFromReg, we will have
2828    //   Val = getCopyFromReg(Chain_1)
2829    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
2830
2831    // getCopyToReg(Chain_0) will be glued together with
2832    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2833    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2834    //   Data dependency from Unit B to Unit A due to usage of Val in
2835    //     getCopyToReg(Chain_1, Val)
2836    //   Chain dependency from Unit A to Unit B
2837
2838    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2839    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2840                                     getPointerTy(MF.getDataLayout()));
2841
2842    Register RetValReg
2843        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2844          X86::RAX : X86::EAX;
2845    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2846    Flag = Chain.getValue(1);
2847
2848    // RAX/EAX now acts like a return value.
2849    RetOps.push_back(
2850        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2851
2852    // Add the returned register to the CalleeSaveDisableRegs list.
2853    if (ShouldDisableCalleeSavedRegister)
2854      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2855  }
2856
2857  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2858  const MCPhysReg *I =
2859      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2860  if (I) {
2861    for (; *I; ++I) {
2862      if (X86::GR64RegClass.contains(*I))
2863        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2864      else
2865        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2866    }
2867  }
2868
2869  RetOps[0] = Chain;  // Update chain.
2870
2871  // Add the flag if we have it.
2872  if (Flag.getNode())
2873    RetOps.push_back(Flag);
2874
2875  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2876  if (CallConv == CallingConv::X86_INTR)
2877    opcode = X86ISD::IRET;
2878  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2879}
2880
2881bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2882  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2883    return false;
2884
2885  SDValue TCChain = Chain;
2886  SDNode *Copy = *N->use_begin();
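  // The pattern we are looking for is this node feeding a single CopyToReg
  // (with no glued operand) or FP_EXTEND whose only users are RET_FLAG nodes.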
2887  if (Copy->getOpcode() == ISD::CopyToReg) {
2888    // If the copy has a glue operand, we conservatively assume it isn't safe to
2889    // perform a tail call.
2890    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2891      return false;
2892    TCChain = Copy->getOperand(0);
2893  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2894    return false;
2895
2896  bool HasRet = false;
2897  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2898       UI != UE; ++UI) {
2899    if (UI->getOpcode() != X86ISD::RET_FLAG)
2900      return false;
    // If we are returning more than one value, we can definitely not make a
    // tail call; see PR19530.
2903    if (UI->getNumOperands() > 4)
2904      return false;
2905    if (UI->getNumOperands() == 4 &&
2906        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2907      return false;
2908    HasRet = true;
2909  }
2910
2911  if (!HasRet)
2912    return false;
2913
2914  Chain = TCChain;
2915  return true;
2916}
2917
2918EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2919                                           ISD::NodeType ExtendKind) const {
2920  MVT ReturnMVT = MVT::i32;
2921
2922  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2923  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2924    // The ABI does not require i1, i8 or i16 to be extended.
2925    //
2926    // On Darwin, there is code in the wild relying on Clang's old behaviour of
2927    // always extending i8/i16 return values, so keep doing that for now.
2928    // (PR26665).
2929    ReturnMVT = MVT::i8;
2930  }
2931
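  // For example, on Darwin an i8 or i16 return value is widened to i32, while
  // elsewhere it may stay narrow; an i1 return is always widened to at least
  // i8.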
2932  EVT MinVT = getRegisterType(Context, ReturnMVT);
2933  return VT.bitsLT(MinVT) ? MinVT : VT;
2934}
2935
/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
///                        for glue purposes. If the DAG is already using a
///                        physical register instead of a virtual one, we glue
///                        our new SDValue to the InFlag SDValue.
/// \returns a new 64 bit SDValue.
2945static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2946                                SDValue &Root, SelectionDAG &DAG,
2947                                const SDLoc &Dl, const X86Subtarget &Subtarget,
2948                                SDValue *InFlag = nullptr) {
2949  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2950  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2951  assert(VA.getValVT() == MVT::v64i1 &&
2952         "Expecting first location of 64 bit width type");
2953  assert(NextVA.getValVT() == VA.getValVT() &&
2954         "The locations should have the same type");
2955  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2956         "The values should reside in two registers");
2957
2958  SDValue Lo, Hi;
2959  SDValue ArgValueLo, ArgValueHi;
2960
2961  MachineFunction &MF = DAG.getMachineFunction();
2962  const TargetRegisterClass *RC = &X86::GR32RegClass;
2963
2964  // Read a 32 bit value from the registers.
2965  if (nullptr == InFlag) {
2966    // When no physical register is present,
2967    // create an intermediate virtual register.
2968    Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
2969    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2970    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2971    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2972  } else {
2973    // When a physical register is available read the value from it and glue
2974    // the reads together.
2975    ArgValueLo =
2976      DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2977    *InFlag = ArgValueLo.getValue(2);
2978    ArgValueHi =
2979      DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2980    *InFlag = ArgValueHi.getValue(2);
2981  }
2982
  // Convert each i32 half into a v32i1 value.
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two halves into a single v64i1 value.
2990  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2991}
2992
/// Lower a register value of size 8/16/32/64 bits to a mask value of the
/// expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to a mask type.
2996static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2997                               const EVT &ValLoc, const SDLoc &Dl,
2998                               SelectionDAG &DAG) {
2999  SDValue ValReturned = ValArg;
3000
3001  if (ValVT == MVT::v1i1)
3002    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3003
3004  if (ValVT == MVT::v64i1) {
    // On 32 bit targets this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On 64 bit targets no truncation is needed; a bitcast is enough.
3008  } else {
3009    MVT maskLen;
3010    switch (ValVT.getSimpleVT().SimpleTy) {
3011    case MVT::v8i1:
3012      maskLen = MVT::i8;
3013      break;
3014    case MVT::v16i1:
3015      maskLen = MVT::i16;
3016      break;
3017    case MVT::v32i1:
3018      maskLen = MVT::i32;
3019      break;
3020    default:
3021      llvm_unreachable("Expecting a vector of i1 types");
3022    }
3023
3024    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3025  }
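  // For example, an i32 location carrying a v8i1 value was truncated to i8
  // above and is bitcast to v8i1 here; an i64 carrying v64i1 is bitcast
  // directly.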
3026  return DAG.getBitcast(ValVT, ValReturned);
3027}
3028
/// Lower the result values of a call into the appropriate copies out of
/// the physical registers they were returned in.
3031///
3032SDValue X86TargetLowering::LowerCallResult(
3033    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3034    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3035    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3036    uint32_t *RegMask) const {
3037
3038  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3039  // Assign locations to each value returned by this call.
3040  SmallVector<CCValAssign, 16> RVLocs;
3041  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3042                 *DAG.getContext());
3043  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3044
3045  // Copy all of the result registers out of their specified physreg.
3046  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3047       ++I, ++InsIndex) {
3048    CCValAssign &VA = RVLocs[I];
3049    EVT CopyVT = VA.getLocVT();
3050
3051    // In some calling conventions we need to remove the used registers
3052    // from the register mask.
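    // A set bit in the mask means the register is preserved across the call,
    // so clear the bits for the returned register and its sub-registers to
    // mark them as clobbered (defined) by this call.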
3053    if (RegMask) {
3054      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3055           SubRegs.isValid(); ++SubRegs)
3056        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3057    }
3058
3059    // Report an error if there was an attempt to return FP values via XMM
3060    // registers.
3061    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3062      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3063      if (VA.getLocReg() == X86::XMM1)
3064        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3065      else
3066        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3067    } else if (!Subtarget.hasSSE2() &&
3068               X86::FR64XRegClass.contains(VA.getLocReg()) &&
3069               CopyVT == MVT::f64) {
3070      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3071      if (VA.getLocReg() == X86::XMM1)
3072        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3073      else
3074        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3075    }
3076
3077    // If we prefer to use the value in xmm registers, copy it out as f80 and
3078    // use a truncate to move it from fp stack reg to xmm reg.
3079    bool RoundAfterCopy = false;
3080    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3081        isScalarFPTypeInSSEReg(VA.getValVT())) {
3082      if (!Subtarget.hasX87())
3083        report_fatal_error("X87 register return with X87 disabled");
3084      CopyVT = MVT::f80;
3085      RoundAfterCopy = (CopyVT != VA.getLocVT());
3086    }
3087
3088    SDValue Val;
3089    if (VA.needsCustom()) {
3090      assert(VA.getValVT() == MVT::v64i1 &&
3091             "Currently the only custom case is when we split v64i1 to 2 regs");
3092      Val =
3093          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3094    } else {
3095      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3096                  .getValue(1);
3097      Val = Chain.getValue(0);
3098      InFlag = Chain.getValue(2);
3099    }
3100
3101    if (RoundAfterCopy)
3102      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3103                        // This truncation won't change the value.
3104                        DAG.getIntPtrConstant(1, dl));
3105
3106    if (VA.isExtInLoc()) {
3107      if (VA.getValVT().isVector() &&
3108          VA.getValVT().getScalarType() == MVT::i1 &&
3109          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3110           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3111        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3112        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3113      } else
3114        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3115    }
3116
3117    if (VA.getLocInfo() == CCValAssign::BCvt)
3118      Val = DAG.getBitcast(VA.getValVT(), Val);
3119
3120    InVals.push_back(Val);
3121  }
3122
3123  return Chain;
3124}
3125
3126//===----------------------------------------------------------------------===//
3127//                C & StdCall & Fast Calling Convention implementation
3128//===----------------------------------------------------------------------===//
//  The StdCall calling convention is standard for many Windows API routines.
//  It differs from the C calling convention only slightly: the callee, not
//  the caller, cleans up the stack, and symbols are decorated. It doesn't
//  support any vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation in LowerX86_32FastCCCallTo.
3135
/// Determines whether a call uses struct return semantics.
3138enum StructReturnType {
3139  NotStructReturn,
3140  RegStructReturn,
3141  StackStructReturn
3142};
3143static StructReturnType
3144callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
3145  if (Outs.empty())
3146    return NotStructReturn;
3147
3148  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3149  if (!Flags.isSRet())
3150    return NotStructReturn;
3151  if (Flags.isInReg() || IsMCU)
3152    return RegStructReturn;
3153  return StackStructReturn;
3154}
3155
3156/// Determines whether a function uses struct return semantics.
3157static StructReturnType
3158argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
3159  if (Ins.empty())
3160    return NotStructReturn;
3161
3162  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3163  if (!Flags.isSRet())
3164    return NotStructReturn;
3165  if (Flags.isInReg() || IsMCU)
3166    return RegStructReturn;
3167  return StackStructReturn;
3168}
3169
3170/// Make a copy of an aggregate at address specified by "Src" to address
3171/// "Dst" with size and alignment information specified by the specific
3172/// parameter attribute. The copy will be passed as a byval function parameter.
3173static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3174                                         SDValue Chain, ISD::ArgFlagsTy Flags,
3175                                         SelectionDAG &DAG, const SDLoc &dl) {
3176  SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3177
3178  return DAG.getMemcpy(
3179      Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3180      /*isVolatile*/ false, /*AlwaysInline=*/true,
3181      /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3182}
3183
3184/// Return true if the calling convention is one that we can guarantee TCO for.
3185static bool canGuaranteeTCO(CallingConv::ID CC) {
3186  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3187          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3188          CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3189          CC == CallingConv::SwiftTail);
3190}
3191
3192/// Return true if we might ever do TCO for calls with this calling convention.
3193static bool mayTailCallThisCC(CallingConv::ID CC) {
3194  switch (CC) {
3195  // C calling conventions:
3196  case CallingConv::C:
3197  case CallingConv::Win64:
3198  case CallingConv::X86_64_SysV:
3199  // Callee pop conventions:
3200  case CallingConv::X86_ThisCall:
3201  case CallingConv::X86_StdCall:
3202  case CallingConv::X86_VectorCall:
3203  case CallingConv::X86_FastCall:
3204  // Swift:
3205  case CallingConv::Swift:
3206    return true;
3207  default:
3208    return canGuaranteeTCO(CC);
3209  }
3210}
3211
3212/// Return true if the function is being made into a tailcall target by
3213/// changing its ABI.
3214static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3215  return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3216         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3217}
3218
3219bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3220  if (!CI->isTailCall())
3221    return false;
3222
3223  CallingConv::ID CalleeCC = CI->getCallingConv();
3224  if (!mayTailCallThisCC(CalleeCC))
3225    return false;
3226
3227  return true;
3228}
3229
3230SDValue
3231X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3232                                    const SmallVectorImpl<ISD::InputArg> &Ins,
3233                                    const SDLoc &dl, SelectionDAG &DAG,
3234                                    const CCValAssign &VA,
3235                                    MachineFrameInfo &MFI, unsigned i) const {
3236  // Create the nodes corresponding to a load from this parameter slot.
3237  ISD::ArgFlagsTy Flags = Ins[i].Flags;
3238  bool AlwaysUseMutable = shouldGuaranteeTCO(
3239      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3240  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3241  EVT ValVT;
3242  MVT PtrVT = getPointerTy(DAG.getDataLayout());
3243
  // If the value is passed by pointer, the address is passed instead of the
  // value itself. There is no need to extend a mask value if it and its
  // location share the same size.
3247  bool ExtendedInMem =
3248      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3249      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3250
3251  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3252    ValVT = VA.getLocVT();
3253  else
3254    ValVT = VA.getValVT();
3255
3256  // FIXME: For now, all byval parameter objects are marked mutable. This can be
3257  // changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten by the lowering of arguments in case of a tail call.
3260  if (Flags.isByVal()) {
3261    unsigned Bytes = Flags.getByValSize();
3262    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3263
3264    // FIXME: For now, all byval parameter objects are marked as aliasing. This
3265    // can be improved with deeper analysis.
3266    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3267                                   /*isAliased=*/true);
3268    return DAG.getFrameIndex(FI, PtrVT);
3269  }
3270
3271  EVT ArgVT = Ins[i].ArgVT;
3272
  // If this is a vector that has been split into multiple parts, and the
  // scalar size of the parts doesn't match the vector element size, then we
  // can't elide the copy. The parts will have padding between them instead of
  // being packed like a vector.
3277  bool ScalarizedAndExtendedVector =
3278      ArgVT.isVector() && !VA.getLocVT().isVector() &&
3279      VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3280
3281  // This is an argument in memory. We might be able to perform copy elision.
3282  // If the argument is passed directly in memory without any extension, then we
3283  // can perform copy elision. Large vector types, for example, may be passed
3284  // indirectly by pointer.
3285  if (Flags.isCopyElisionCandidate() &&
3286      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3287      !ScalarizedAndExtendedVector) {
3288    SDValue PartAddr;
3289    if (Ins[i].PartOffset == 0) {
3290      // If this is a one-part value or the first part of a multi-part value,
3291      // create a stack object for the entire argument value type and return a
3292      // load from our portion of it. This assumes that if the first part of an
3293      // argument is in memory, the rest will also be in memory.
3294      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3295                                     /*IsImmutable=*/false);
3296      PartAddr = DAG.getFrameIndex(FI, PtrVT);
3297      return DAG.getLoad(
3298          ValVT, dl, Chain, PartAddr,
3299          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3300    } else {
3301      // This is not the first piece of an argument in memory. See if there is
3302      // already a fixed stack object including this offset. If so, assume it
3303      // was created by the PartOffset == 0 branch above and create a load from
3304      // the appropriate offset into it.
3305      int64_t PartBegin = VA.getLocMemOffset();
3306      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3307      int FI = MFI.getObjectIndexBegin();
3308      for (; MFI.isFixedObjectIndex(FI); ++FI) {
3309        int64_t ObjBegin = MFI.getObjectOffset(FI);
3310        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3311        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3312          break;
3313      }
3314      if (MFI.isFixedObjectIndex(FI)) {
3315        SDValue Addr =
3316            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3317                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3318        return DAG.getLoad(
3319            ValVT, dl, Chain, Addr,
3320            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3321                                              Ins[i].PartOffset));
3322      }
3323    }
3324  }
3325
3326  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3327                                 VA.getLocMemOffset(), isImmutable);
3328
3329  // Set SExt or ZExt flag.
3330  if (VA.getLocInfo() == CCValAssign::ZExt) {
3331    MFI.setObjectZExt(FI, true);
3332  } else if (VA.getLocInfo() == CCValAssign::SExt) {
3333    MFI.setObjectSExt(FI, true);
3334  }
3335
3336  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3337  SDValue Val = DAG.getLoad(
3338      ValVT, dl, Chain, FIN,
3339      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3340  return ExtendedInMem
3341             ? (VA.getValVT().isVector()
3342                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3343                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3344             : Val;
3345}
3346
3347// FIXME: Get this from tablegen.
3348static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3349                                                const X86Subtarget &Subtarget) {
3350  assert(Subtarget.is64Bit());
3351
3352  if (Subtarget.isCallingConvWin64(CallConv)) {
3353    static const MCPhysReg GPR64ArgRegsWin64[] = {
3354      X86::RCX, X86::RDX, X86::R8,  X86::R9
3355    };
3356    return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3357  }
3358
3359  static const MCPhysReg GPR64ArgRegs64Bit[] = {
3360    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3361  };
3362  return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3363}
3364
3365// FIXME: Get this from tablegen.
3366static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3367                                                CallingConv::ID CallConv,
3368                                                const X86Subtarget &Subtarget) {
3369  assert(Subtarget.is64Bit());
3370  if (Subtarget.isCallingConvWin64(CallConv)) {
3371    // The XMM registers which might contain var arg parameters are shadowed
3372    // in their paired GPR.  So we only need to save the GPR to their home
3373    // slots.
3374    // TODO: __vectorcall will change this.
3375    return None;
3376  }
3377
3378  const Function &F = MF.getFunction();
3379  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
3380  bool isSoftFloat = Subtarget.useSoftFloat();
3381  assert(!(isSoftFloat && NoImplicitFloatOps) &&
3382         "SSE register cannot be used when SSE is disabled!");
3383  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
3384    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3385    // registers.
3386    return None;
3387
3388  static const MCPhysReg XMMArgRegs64Bit[] = {
3389    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3390    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3391  };
3392  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3393}
3394
3395#ifndef NDEBUG
3396static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3397  return llvm::is_sorted(
3398      ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3399        return A.getValNo() < B.getValNo();
3400      });
3401}
3402#endif
3403
3404namespace {
/// This is a helper class for lowering variadic (vararg) parameters.
3406class VarArgsLoweringHelper {
3407public:
3408  VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3409                        SelectionDAG &DAG, const X86Subtarget &Subtarget,
3410                        CallingConv::ID CallConv, CCState &CCInfo)
3411      : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3412        TheMachineFunction(DAG.getMachineFunction()),
3413        TheFunction(TheMachineFunction.getFunction()),
3414        FrameInfo(TheMachineFunction.getFrameInfo()),
3415        FrameLowering(*Subtarget.getFrameLowering()),
3416        TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3417        CCInfo(CCInfo) {}
3418
3419  // Lower variable arguments parameters.
3420  void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3421
3422private:
3423  void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3424
3425  void forwardMustTailParameters(SDValue &Chain);
3426
3427  bool is64Bit() const { return Subtarget.is64Bit(); }
3428  bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3429
3430  X86MachineFunctionInfo *FuncInfo;
3431  const SDLoc &DL;
3432  SelectionDAG &DAG;
3433  const X86Subtarget &Subtarget;
3434  MachineFunction &TheMachineFunction;
3435  const Function &TheFunction;
3436  MachineFrameInfo &FrameInfo;
3437  const TargetFrameLowering &FrameLowering;
3438  const TargetLowering &TargLowering;
3439  CallingConv::ID CallConv;
3440  CCState &CCInfo;
3441};
3442} // namespace
3443
3444void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3445    SDValue &Chain, unsigned StackSize) {
  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of llvm.va_start.
  // We can skip this if there are no va_start calls.
3449  if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3450                    CallConv != CallingConv::X86_ThisCall)) {
3451    FuncInfo->setVarArgsFrameIndex(
3452        FrameInfo.CreateFixedObject(1, StackSize, true));
3453  }
3454
3455  // Figure out if XMM registers are in use.
3456  assert(!(Subtarget.useSoftFloat() &&
3457           TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3458         "SSE register cannot be used when SSE is disabled!");
3459
3460  // 64-bit calling conventions support varargs and register parameters, so we
3461  // have to do extra work to spill them in the prologue.
3462  if (is64Bit()) {
3463    // Find the first unallocated argument registers.
3464    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3465    ArrayRef<MCPhysReg> ArgXMMs =
3466        get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3467    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3468    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3469
3470    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3471           "SSE register cannot be used when SSE is disabled!");
3472
3473    if (isWin64()) {
3474      // Get to the caller-allocated home save location.  Add 8 to account
3475      // for the return address.
3476      int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3477      FuncInfo->setRegSaveFrameIndex(
3478          FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fix up the vararg frame index to point at the shadow area (4 x i64).
3480      if (NumIntRegs < 4)
3481        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3482    } else {
3483      // For X86-64, if there are vararg parameters that are passed via
3484      // registers, then we must store them to their spots on the stack so
3485      // they may be loaded by dereferencing the result of va_next.
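      // For the SysV ABI the register save area is 6 GPR slots of 8 bytes
      // followed by 8 XMM slots of 16 bytes (176 bytes total), indexed by the
      // gp_offset and fp_offset fields of the va_list.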
3486      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3487      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3488      FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3489          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3490    }
3491
    // SDValues for the GPR registers that hold live input argument values.
    SmallVector<SDValue, 6> LiveGPRs;
    // SDValues for the XMM registers that hold live input argument values.
    SmallVector<SDValue, 8> LiveXMMRegs;
    SDValue ALVal; // if applicable, holds the SDValue for the %al register
3497
3498    // Gather all the live in physical registers.
3499    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3500      Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3501      LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3502    }
3503    const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3504    if (!AvailableXmms.empty()) {
3505      Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3506      ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3507      for (MCPhysReg Reg : AvailableXmms) {
        // The fast register allocator spills virtual registers at basic block
        // boundaries. That leads to uses of XMM registers outside of the
        // check for %al. Pass physical registers to VASTART_SAVE_XMM_REGS to
        // avoid unnecessary spilling.
3512        TheMachineFunction.getRegInfo().addLiveIn(Reg);
3513        LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3514      }
3515    }
3516
3517    // Store the integer parameter registers.
3518    SmallVector<SDValue, 8> MemOps;
3519    SDValue RSFIN =
3520        DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3521                          TargLowering.getPointerTy(DAG.getDataLayout()));
3522    unsigned Offset = FuncInfo->getVarArgsGPOffset();
3523    for (SDValue Val : LiveGPRs) {
3524      SDValue FIN = DAG.getNode(ISD::ADD, DL,
3525                                TargLowering.getPointerTy(DAG.getDataLayout()),
3526                                RSFIN, DAG.getIntPtrConstant(Offset, DL));
3527      SDValue Store =
3528          DAG.getStore(Val.getValue(1), DL, Val, FIN,
3529                       MachinePointerInfo::getFixedStack(
3530                           DAG.getMachineFunction(),
3531                           FuncInfo->getRegSaveFrameIndex(), Offset));
3532      MemOps.push_back(Store);
3533      Offset += 8;
3534    }
3535
3536    // Now store the XMM (fp + vector) parameter registers.
3537    if (!LiveXMMRegs.empty()) {
3538      SmallVector<SDValue, 12> SaveXMMOps;
3539      SaveXMMOps.push_back(Chain);
3540      SaveXMMOps.push_back(ALVal);
3541      SaveXMMOps.push_back(
3542          DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
3543      SaveXMMOps.push_back(
3544          DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3545      llvm::append_range(SaveXMMOps, LiveXMMRegs);
3546      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
3547                                   MVT::Other, SaveXMMOps));
3548    }
3549
3550    if (!MemOps.empty())
3551      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3552  }
3553}
3554
3555void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3556  // Find the largest legal vector type.
3557  MVT VecVT = MVT::Other;
3558  // FIXME: Only some x86_32 calling conventions support AVX512.
3559  if (Subtarget.useAVX512Regs() &&
3560      (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3561                     CallConv == CallingConv::Intel_OCL_BI)))
3562    VecVT = MVT::v16f32;
3563  else if (Subtarget.hasAVX())
3564    VecVT = MVT::v8f32;
3565  else if (Subtarget.hasSSE2())
3566    VecVT = MVT::v4f32;
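  // The widest legal vector type is chosen so that a single copy captures the
  // full contents of each forwarded vector argument register.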
3567
3568  // We forward some GPRs and some vector types.
3569  SmallVector<MVT, 2> RegParmTypes;
3570  MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3571  RegParmTypes.push_back(IntVT);
3572  if (VecVT != MVT::Other)
3573    RegParmTypes.push_back(VecVT);
3574
3575  // Compute the set of forwarded registers. The rest are scratch.
3576  SmallVectorImpl<ForwardedRegister> &Forwards =
3577      FuncInfo->getForwardedMustTailRegParms();
3578  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3579
3580  // Forward AL for SysV x86_64 targets, since it is used for varargs.
3581  if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3582    Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3583    Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3584  }
3585
3586  // Copy all forwards from physical to virtual registers.
3587  for (ForwardedRegister &FR : Forwards) {
3588    // FIXME: Can we use a less constrained schedule?
3589    SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3590    FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3591        TargLowering.getRegClassFor(FR.VT));
3592    Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3593  }
3594}
3595
3596void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3597                                                   unsigned StackSize) {
3598  // Set FrameIndex to the sentinel value 0xAAAAAAA to mark the unset state.
3599  // If necessary, it will be set to the correct value later.
3600  FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3601  FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3602
3603  if (FrameInfo.hasVAStart())
3604    createVarArgAreaAndStoreRegisters(Chain, StackSize);
3605
3606  if (FrameInfo.hasMustTailInVarArgFunc())
3607    forwardMustTailParameters(Chain);
3608}
3609
3610SDValue X86TargetLowering::LowerFormalArguments(
3611    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3612    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3613    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3614  MachineFunction &MF = DAG.getMachineFunction();
3615  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3616
3617  const Function &F = MF.getFunction();
3618  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3619      F.getName() == "main")
3620    FuncInfo->setForceFramePointer(true);
3621
3622  MachineFrameInfo &MFI = MF.getFrameInfo();
3623  bool Is64Bit = Subtarget.is64Bit();
3624  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3625
3626  assert(
3627      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3628      "Var args not supported with calling convention regcall, fastcc, ghc or hipe");
3629
3630  // Assign locations to all of the incoming arguments.
3631  SmallVector<CCValAssign, 16> ArgLocs;
3632  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3633
3634  // Allocate shadow area for Win64.
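  // The Win64 ABI always reserves 32 bytes of shadow space directly above the
  // return address, where the callee may spill its four register arguments.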
3635  if (IsWin64)
3636    CCInfo.AllocateStack(32, Align(8));
3637
3638  CCInfo.AnalyzeArguments(Ins, CC_X86);
3639
3640  // In vectorcall calling convention a second pass is required for the HVA
3641  // types.
3642  if (CallingConv::X86_VectorCall == CallConv) {
3643    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3644  }
3645
3646  // The next loop assumes that the locations are in the same order as the
3647  // input arguments.
3648  assert(isSortedByValueNo(ArgLocs) &&
3649         "Argument Location list must be sorted before lowering");
3650
3651  SDValue ArgValue;
3652  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3653       ++I, ++InsIndex) {
3654    assert(InsIndex < Ins.size() && "Invalid Ins index");
3655    CCValAssign &VA = ArgLocs[I];
3656
3657    if (VA.isRegLoc()) {
3658      EVT RegVT = VA.getLocVT();
3659      if (VA.needsCustom()) {
3660        assert(
3661            VA.getValVT() == MVT::v64i1 &&
3662            "Currently the only custom case is when we split v64i1 to 2 regs");
3663
3664        // In the regcall calling convention, v64i1 values compiled for a
3665        // 32-bit target are split up into two registers.
3666        ArgValue =
3667            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3668      } else {
3669        const TargetRegisterClass *RC;
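        // Pick the register class that matches the location type; with
        // AVX-512 the extended (X) classes are used where available so that
        // XMM/YMM registers 16-31 remain allocatable.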
3670        if (RegVT == MVT::i8)
3671          RC = &X86::GR8RegClass;
3672        else if (RegVT == MVT::i16)
3673          RC = &X86::GR16RegClass;
3674        else if (RegVT == MVT::i32)
3675          RC = &X86::GR32RegClass;
3676        else if (Is64Bit && RegVT == MVT::i64)
3677          RC = &X86::GR64RegClass;
3678        else if (RegVT == MVT::f32)
3679          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3680        else if (RegVT == MVT::f64)
3681          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3682        else if (RegVT == MVT::f80)
3683          RC = &X86::RFP80RegClass;
3684        else if (RegVT == MVT::f128)
3685          RC = &X86::VR128RegClass;
3686        else if (RegVT.is512BitVector())
3687          RC = &X86::VR512RegClass;
3688        else if (RegVT.is256BitVector())
3689          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3690        else if (RegVT.is128BitVector())
3691          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3692        else if (RegVT == MVT::x86mmx)
3693          RC = &X86::VR64RegClass;
3694        else if (RegVT == MVT::v1i1)
3695          RC = &X86::VK1RegClass;
3696        else if (RegVT == MVT::v8i1)
3697          RC = &X86::VK8RegClass;
3698        else if (RegVT == MVT::v16i1)
3699          RC = &X86::VK16RegClass;
3700        else if (RegVT == MVT::v32i1)
3701          RC = &X86::VK32RegClass;
3702        else if (RegVT == MVT::v64i1)
3703          RC = &X86::VK64RegClass;
3704        else
3705          llvm_unreachable("Unknown argument type!");
3706
3707        Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3708        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3709      }
3710
3711      // If this is an 8 or 16-bit value, it is really passed promoted to 32
3712      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
3713      // right size.
3714      if (VA.getLocInfo() == CCValAssign::SExt)
3715        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3716                               DAG.getValueType(VA.getValVT()));
3717      else if (VA.getLocInfo() == CCValAssign::ZExt)
3718        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3719                               DAG.getValueType(VA.getValVT()));
3720      else if (VA.getLocInfo() == CCValAssign::BCvt)
3721        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3722
3723      if (VA.isExtInLoc()) {
3724        // Handle MMX values passed in XMM regs.
3725        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3726          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3727        else if (VA.getValVT().isVector() &&
3728                 VA.getValVT().getScalarType() == MVT::i1 &&
3729                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3730                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3731          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3732          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3733        } else
3734          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3735      }
3736    } else {
3737      assert(VA.isMemLoc());
3738      ArgValue =
3739          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3740    }
3741
3742    // If the value is passed via a pointer, do a load.
3743    if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3744      ArgValue =
3745          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3746
3747    InVals.push_back(ArgValue);
3748  }
3749
3750  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3751    if (Ins[I].Flags.isSwiftAsync()) {
3752      auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3753      if (Subtarget.is64Bit())
3754        X86FI->setHasSwiftAsyncContext(true);
3755      else {
3756        int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3757        X86FI->setSwiftAsyncContextFrameIdx(FI);
3758        SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3759                                  DAG.getFrameIndex(FI, MVT::i32),
3760                                  MachinePointerInfo::getFixedStack(MF, FI));
3761        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
3762      }
3763    }
3764
3765    // Swift calling convention does not require we copy the sret argument
3766    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3767    if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
3768      continue;
3769
3770    // All x86 ABIs require that for returning structs by value we copy the
3771    // sret argument into %rax/%eax (depending on ABI) for the return. Save
3772    // the argument into a virtual register so that we can access it from the
3773    // return points.
3774    if (Ins[I].Flags.isSRet()) {
3775      Register Reg = FuncInfo->getSRetReturnReg();
3776      if (!Reg) {
3777        MVT PtrTy = getPointerTy(DAG.getDataLayout());
3778        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3779        FuncInfo->setSRetReturnReg(Reg);
3780      }
3781      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3782      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3783      break;
3784    }
3785  }
3786
3787  unsigned StackSize = CCInfo.getNextStackOffset();
3788  // Align stack specially for tail calls.
3789  if (shouldGuaranteeTCO(CallConv,
3790                         MF.getTarget().Options.GuaranteedTailCallOpt))
3791    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3792
3793  if (IsVarArg)
3794    VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
3795        .lowerVarArgsParameters(Chain, StackSize);
3796
3797  // Some CCs need callee pop.
3798  if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
3799                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
3800    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3801  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3802    // X86 interrupts must pop the error code (and the alignment padding) if
3803    // present.
3804    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3805  } else {
3806    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3807    // If this is an sret function, the return should pop the hidden pointer.
3808    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3809        !Subtarget.getTargetTriple().isOSMSVCRT() &&
3810        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3811      FuncInfo->setBytesToPopOnReturn(4);
3812  }
3813
3814  if (!Is64Bit) {
3815    // RegSaveFrameIndex is X86-64 only.
3816    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3817  }
3818
3819  FuncInfo->setArgumentStackSize(StackSize);
3820
3821  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3822    EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3823    if (Personality == EHPersonality::CoreCLR) {
3824      assert(Is64Bit);
3825      // TODO: Add a mechanism to frame lowering that will allow us to indicate
3826      // that we'd prefer this slot be allocated towards the bottom of the frame
3827      // (i.e. near the stack pointer after allocating the frame).  Every
3828      // funclet needs a copy of this slot in its (mostly empty) frame, and the
3829      // offset from the bottom of this and each funclet's frame must be the
3830      // same, so the size of funclets' (mostly empty) frames is dictated by
3831      // how far this slot is from the bottom (since they allocate just enough
3832      // space to accommodate holding this slot at the correct offset).
3833      int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
3834      EHInfo->PSPSymFrameIdx = PSPSymFI;
3835    }
3836  }
3837
3838  if (CallConv == CallingConv::X86_RegCall ||
3839      F.hasFnAttribute("no_caller_saved_registers")) {
3840    MachineRegisterInfo &MRI = MF.getRegInfo();
3841    for (std::pair<Register, Register> Pair : MRI.liveins())
3842      MRI.disableCalleeSavedRegister(Pair.first);
3843  }
3844
3845  return Chain;
3846}
3847
3848SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3849                                            SDValue Arg, const SDLoc &dl,
3850                                            SelectionDAG &DAG,
3851                                            const CCValAssign &VA,
3852                                            ISD::ArgFlagsTy Flags,
3853                                            bool isByVal) const {
3854  unsigned LocMemOffset = VA.getLocMemOffset();
3855  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3856  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3857                       StackPtr, PtrOff);
3858  if (isByVal)
3859    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3860
3861  return DAG.getStore(
3862      Chain, dl, Arg, PtrOff,
3863      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3864}
3865
3866/// Emit a load of return address if tail call
3867/// optimization is performed and it is required.
3868SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3869    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3870    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3871  // Adjust the Return address stack slot.
3872  EVT VT = getPointerTy(DAG.getDataLayout());
3873  OutRetAddr = getReturnAddressFrameIndex(DAG);
3874
3875  // Load the "old" Return address.
3876  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
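  // Hand the loaded address back through OutRetAddr (result #0 of the load)
  // and return the load's chain (result #1) so callers can sequence after it.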
3877  return SDValue(OutRetAddr.getNode(), 1);
3878}
3879
3880/// Emit a store of the return address if tail call
3881/// optimization is performed and it is required (FPDiff!=0).
3882static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3883                                        SDValue Chain, SDValue RetAddrFrIdx,
3884                                        EVT PtrVT, unsigned SlotSize,
3885                                        int FPDiff, const SDLoc &dl) {
3886  // Store the return address to the appropriate stack slot.
3887  if (!FPDiff) return Chain;
3888  // Calculate the new stack slot for the return address.
3889  int NewReturnAddrFI =
3890    MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3891                                         false);
3892  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3893  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3894                       MachinePointerInfo::getFixedStack(
3895                           DAG.getMachineFunction(), NewReturnAddrFI));
3896  return Chain;
3897}
3898
3899/// Returns a vector_shuffle mask for a movs{s|d} or movd
3900/// operation of the specified width.
3901static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3902                       SDValue V2) {
3903  unsigned NumElems = VT.getVectorNumElements();
3904  SmallVector<int, 8> Mask;
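  // Element 0 of the result comes from V2 (index NumElems selects V2[0]);
  // all remaining lanes keep V1's elements, matching MOVSS/MOVSD semantics.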
3905  Mask.push_back(NumElems);
3906  for (unsigned i = 1; i != NumElems; ++i)
3907    Mask.push_back(i);
3908  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3909}
3910
3911SDValue
3912X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3913                             SmallVectorImpl<SDValue> &InVals) const {
3914  SelectionDAG &DAG                     = CLI.DAG;
3915  SDLoc &dl                             = CLI.DL;
3916  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3917  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
3918  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
3919  SDValue Chain                         = CLI.Chain;
3920  SDValue Callee                        = CLI.Callee;
3921  CallingConv::ID CallConv              = CLI.CallConv;
3922  bool &isTailCall                      = CLI.IsTailCall;
3923  bool isVarArg                         = CLI.IsVarArg;
3924  const auto *CB                        = CLI.CB;
3925
3926  MachineFunction &MF = DAG.getMachineFunction();
3927  bool Is64Bit        = Subtarget.is64Bit();
3928  bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
3929  StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3930  bool IsSibcall      = false;
3931  bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
3932      CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
3933  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3934  bool HasNCSR = (CB && isa<CallInst>(CB) &&
3935                  CB->hasFnAttr("no_caller_saved_registers"));
3936  bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
3937  bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
3938  const Module *M = MF.getMMI().getModule();
3939  Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3940
3941  MachineFunction::CallSiteInfo CSInfo;
3942  if (CallConv == CallingConv::X86_INTR)
3943    report_fatal_error("X86 interrupts may not be called directly");
3944
3945  if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
3946    // If we are using a GOT, disable tail calls to external symbols with
3947    // default visibility. Tail calling such a symbol requires using a GOT
3948    // relocation, which forces early binding of the symbol. This breaks code
3949    // that requires lazy function symbol resolution. Using musttail or
3950    // GuaranteedTailCallOpt will override this.
3951    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3952    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3953               G->getGlobal()->hasDefaultVisibility()))
3954      isTailCall = false;
3955  }
3956
3957  bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
3958  if (IsMustTail) {
3959    // Force this to be a tail call.  The verifier rules are enough to ensure
3960    // that we can lower this successfully without moving the return address
3961    // around.
3962    isTailCall = true;
3963  } else if (isTailCall) {
3964    // Check if it's really possible to do a tail call.
3965    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3966                    isVarArg, SR != NotStructReturn,
3967                    MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3968                    Outs, OutVals, Ins, DAG);
3969
3970    // Sibcalls are automatically detected tailcalls which do not require
3971    // ABI changes.
3972    if (!IsGuaranteeTCO && isTailCall)
3973      IsSibcall = true;
3974
3975    if (isTailCall)
3976      ++NumTailCalls;
3977  }
3978
3979  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3980         "Var args not supported with calling convention fastcc, ghc or hipe");
3981
3982  // Analyze operands of the call, assigning locations to each operand.
3983  SmallVector<CCValAssign, 16> ArgLocs;
3984  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3985
3986  // Allocate shadow area for Win64.
3987  if (IsWin64)
3988    CCInfo.AllocateStack(32, Align(8));
3989
3990  CCInfo.AnalyzeArguments(Outs, CC_X86);
3991
3992  // In vectorcall calling convention a second pass is required for the HVA
3993  // types.
3994  if (CallingConv::X86_VectorCall == CallConv) {
3995    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3996  }
3997
3998  // Get a count of how many bytes are to be pushed on the stack.
3999  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4000  if (IsSibcall)
4001    // This is a sibcall. The memory operands are available in the caller's
4002    // own caller's stack.
4003    NumBytes = 0;
4004  else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4005    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4006
4007  int FPDiff = 0;
4008  if (isTailCall && !IsSibcall && !IsMustTail) {
4009    // Lower arguments at fp - stackoffset + fpdiff.
4010    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4011
4012    FPDiff = NumBytesCallerPushed - NumBytes;
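    // FPDiff is negative when the callee needs more argument stack space than
    // the caller reserved; the return address slot then has to move by FPDiff
    // bytes, which EmitTailCallLoadRetAddr/EmitTailCallStoreRetAddr handle.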
4013
4014    // Record the delta by which the return address stack slot moves,
4015    // but only if this call moves it further than any previous one.
4016    if (FPDiff < X86Info->getTCReturnAddrDelta())
4017      X86Info->setTCReturnAddrDelta(FPDiff);
4018  }
4019
4020  unsigned NumBytesToPush = NumBytes;
4021  unsigned NumBytesToPop = NumBytes;
4022
4023  // If we have an inalloca argument, all stack space has already been allocated
4024  // for us and will be right at the top of the stack.  We don't support multiple
4025  // arguments passed in memory when using inalloca.
4026  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4027    NumBytesToPush = 0;
4028    if (!ArgLocs.back().isMemLoc())
4029      report_fatal_error("cannot use inalloca attribute on a register "
4030                         "parameter");
4031    if (ArgLocs.back().getLocMemOffset() != 0)
4032      report_fatal_error("any parameter with the inalloca attribute must be "
4033                         "the only memory argument");
4034  } else if (CLI.IsPreallocated) {
4035    assert(ArgLocs.back().isMemLoc() &&
4036           "cannot use preallocated attribute on a register "
4037           "parameter");
4038    SmallVector<size_t, 4> PreallocatedOffsets;
4039    for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4040      if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4041        PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4042      }
4043    }
4044    auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4045    size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4046    MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4047    MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4048    NumBytesToPush = 0;
4049  }
4050
4051  if (!IsSibcall && !IsMustTail)
4052    Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4053                                 NumBytes - NumBytesToPush, dl);
4054
4055  SDValue RetAddrFrIdx;
4056  // Load return address for tail calls.
4057  if (isTailCall && FPDiff)
4058    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4059                                    Is64Bit, FPDiff, dl);
4060
4061  SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4062  SmallVector<SDValue, 8> MemOpChains;
4063  SDValue StackPtr;
4064
4065  // The next loop assumes that the locations are in the same order as the
4066  // input arguments.
4067  assert(isSortedByValueNo(ArgLocs) &&
4068         "Argument Location list must be sorted before lowering");
4069
4070  // Walk the register/memloc assignments, inserting copies/loads.  In the case
4071  // of tail call optimization, arguments are handled later.
4072  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4073  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4074       ++I, ++OutIndex) {
4075    assert(OutIndex < Outs.size() && "Invalid Out index");
4076    // Skip inalloca/preallocated arguments, they have already been written.
4077    ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4078    if (Flags.isInAlloca() || Flags.isPreallocated())
4079      continue;
4080
4081    CCValAssign &VA = ArgLocs[I];
4082    EVT RegVT = VA.getLocVT();
4083    SDValue Arg = OutVals[OutIndex];
4084    bool isByVal = Flags.isByVal();
4085
4086    // Promote the value if needed.
4087    switch (VA.getLocInfo()) {
4088    default: llvm_unreachable("Unknown loc info!");
4089    case CCValAssign::Full: break;
4090    case CCValAssign::SExt:
4091      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4092      break;
4093    case CCValAssign::ZExt:
4094      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4095      break;
4096    case CCValAssign::AExt:
4097      if (Arg.getValueType().isVector() &&
4098          Arg.getValueType().getVectorElementType() == MVT::i1)
4099        Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4100      else if (RegVT.is128BitVector()) {
4101        // Special case: passing MMX values in XMM registers.
4102        Arg = DAG.getBitcast(MVT::i64, Arg);
4103        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4104        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
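        // The bitcast + SCALAR_TO_VECTOR + MOVL sequence above places the
        // 64-bit MMX value in the low lane of an XMM register and leaves the
        // upper lane undefined.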
4105      } else
4106        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4107      break;
4108    case CCValAssign::BCvt:
4109      Arg = DAG.getBitcast(RegVT, Arg);
4110      break;
4111    case CCValAssign::Indirect: {
4112      if (isByVal) {
4113        // Memcpy the argument to a temporary stack slot to prevent
4114        // the caller from seeing any modifications the callee may make
4115        // as guaranteed by the `byval` attribute.
4116        int FrameIdx = MF.getFrameInfo().CreateStackObject(
4117            Flags.getByValSize(),
4118            std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4119        SDValue StackSlot =
4120            DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4121        Chain =
4122            CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4123        // From now on treat this as a regular pointer
4124        Arg = StackSlot;
4125        isByVal = false;
4126      } else {
4127        // Store the argument.
4128        SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4129        int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4130        Chain = DAG.getStore(
4131            Chain, dl, Arg, SpillSlot,
4132            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4133        Arg = SpillSlot;
4134      }
4135      break;
4136    }
4137    }
4138
4139    if (VA.needsCustom()) {
4140      assert(VA.getValVT() == MVT::v64i1 &&
4141             "Currently the only custom case is when we split v64i1 to 2 regs");
4142      // Split v64i1 value into two registers
4143      Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4144    } else if (VA.isRegLoc()) {
4145      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4146      const TargetOptions &Options = DAG.getTarget().Options;
4147      if (Options.EmitCallSiteInfo)
4148        CSInfo.emplace_back(VA.getLocReg(), I);
4149      if (isVarArg && IsWin64) {
4150        // The Win64 ABI requires an argument XMM register to be copied to the
4151        // corresponding shadow register if the callee is a varargs function.
4152        Register ShadowReg;
4153        switch (VA.getLocReg()) {
4154        case X86::XMM0: ShadowReg = X86::RCX; break;
4155        case X86::XMM1: ShadowReg = X86::RDX; break;
4156        case X86::XMM2: ShadowReg = X86::R8; break;
4157        case X86::XMM3: ShadowReg = X86::R9; break;
4158        }
4159        if (ShadowReg)
4160          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4161      }
4162    } else if (!IsSibcall && (!isTailCall || isByVal)) {
4163      assert(VA.isMemLoc());
4164      if (!StackPtr.getNode())
4165        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4166                                      getPointerTy(DAG.getDataLayout()));
4167      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4168                                             dl, DAG, VA, Flags, isByVal));
4169    }
4170  }
4171
4172  if (!MemOpChains.empty())
4173    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4174
4175  if (Subtarget.isPICStyleGOT()) {
4176    // ELF / PIC requires the GOT pointer in the EBX register before function
4177    // calls made via the PLT (except for regcall).
4178    if (!isTailCall) {
4179      // An indirect call with the RegCall calling convention may use up all
4180      // the general registers, so it is not suitable to bind EBX as the GOT
4181      // address register; just let the register allocator handle it.
4182      if (CallConv != CallingConv::X86_RegCall)
4183        RegsToPass.push_back(std::make_pair(
4184          Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4185                                          getPointerTy(DAG.getDataLayout()))));
4186    } else {
4187      // If we are tail calling and generating PIC/GOT style code load the
4188      // address of the callee into ECX. The value in ecx is used as target of
4189      // the tail jump. This is done to circumvent the ebx/callee-saved problem
4190      // for tail calls on PIC/GOT architectures. Normally we would just put the
4191      // address of GOT into ebx and then call target@PLT. But for tail calls
4192      // ebx would be restored (since ebx is callee saved) before jumping to the
4193      // target@PLT.
4194
4195      // Note: The actual moving to ECX is done further down.
4196      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4197      if (G && !G->getGlobal()->hasLocalLinkage() &&
4198          G->getGlobal()->hasDefaultVisibility())
4199        Callee = LowerGlobalAddress(Callee, DAG);
4200      else if (isa<ExternalSymbolSDNode>(Callee))
4201        Callee = LowerExternalSymbol(Callee, DAG);
4202    }
4203  }
4204
4205  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
4206    // From AMD64 ABI document:
4207    // For calls that may call functions that use varargs or stdargs
4208    // (prototype-less calls or calls to functions containing ellipsis (...) in
4209    // the declaration) %al is used as a hidden argument to specify the number
4210    // of SSE registers used. The contents of %al do not need to match exactly
4211    // the number of registers, but must be an upper bound on the number of SSE
4212    // registers used and is in the range 0 - 8 inclusive.
4213
4214    // Count the number of XMM registers allocated.
4215    static const MCPhysReg XMMArgRegs[] = {
4216      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4217      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4218    };
4219    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4220    assert((Subtarget.hasSSE1() || !NumXMMRegs)
4221           && "SSE registers cannot be used when SSE is disabled");
4222    RegsToPass.push_back(std::make_pair(Register(X86::AL),
4223                                        DAG.getConstant(NumXMMRegs, dl,
4224                                                        MVT::i8)));
4225  }
4226
4227  if (isVarArg && IsMustTail) {
4228    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4229    for (const auto &F : Forwards) {
4230      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4231      RegsToPass.push_back(std::make_pair(F.PReg, Val));
4232    }
4233  }
4234
4235  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
4236  // don't need this because the eligibility check rejects calls that require
4237  // shuffling arguments passed in memory.
4238  if (!IsSibcall && isTailCall) {
4239    // Force all the incoming stack arguments to be loaded from the stack
4240    // before any new outgoing arguments are stored to the stack, because the
4241    // outgoing stack slots may alias the incoming argument stack slots, and
4242    // the alias isn't otherwise explicit. This is slightly more conservative
4243    // than necessary, because it means that each store effectively depends
4244    // on every argument instead of just those arguments it would clobber.
4245    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4246
4247    SmallVector<SDValue, 8> MemOpChains2;
4248    SDValue FIN;
4249    int FI = 0;
4250    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4251         ++I, ++OutsIndex) {
4252      CCValAssign &VA = ArgLocs[I];
4253
4254      if (VA.isRegLoc()) {
4255        if (VA.needsCustom()) {
4256          assert((CallConv == CallingConv::X86_RegCall) &&
4257                 "Expecting custom case only in regcall calling convention");
4258          // This means that we are in a special case where one argument was
4259          // passed through two register locations - skip the next location.
4260          ++I;
4261        }
4262
4263        continue;
4264      }
4265
4266      assert(VA.isMemLoc());
4267      SDValue Arg = OutVals[OutsIndex];
4268      ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4269      // Skip inalloca/preallocated arguments.  They don't require any work.
4270      if (Flags.isInAlloca() || Flags.isPreallocated())
4271        continue;
4272      // Create frame index.
4273      int32_t Offset = VA.getLocMemOffset()+FPDiff;
4274      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4275      FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4276      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4277
4278      if (Flags.isByVal()) {
4279        // Copy relative to framepointer.
4280        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4281        if (!StackPtr.getNode())
4282          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4283                                        getPointerTy(DAG.getDataLayout()));
4284        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4285                             StackPtr, Source);
4286
4287        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4288                                                         ArgChain,
4289                                                         Flags, DAG, dl));
4290      } else {
4291        // Store relative to framepointer.
4292        MemOpChains2.push_back(DAG.getStore(
4293            ArgChain, dl, Arg, FIN,
4294            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4295      }
4296    }
4297
4298    if (!MemOpChains2.empty())
4299      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4300
4301    // Store the return address to the appropriate stack slot.
4302    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4303                                     getPointerTy(DAG.getDataLayout()),
4304                                     RegInfo->getSlotSize(), FPDiff, dl);
4305  }
4306
4307  // Build a sequence of copy-to-reg nodes chained together with token chain
4308  // and flag operands which copy the outgoing args into registers.
4309  SDValue InFlag;
4310  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4311    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4312                             RegsToPass[i].second, InFlag);
4313    InFlag = Chain.getValue(1);
4314  }
4315
4316  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4317    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4318    // In the 64-bit large code model, we have to make all calls
4319    // through a register, since the call instruction's 32-bit
4320    // pc-relative offset may not be large enough to hold the whole
4321    // address.
4322  } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4323             Callee->getOpcode() == ISD::ExternalSymbol) {
4324    // Lower direct calls to global addresses and external symbols. Setting
4325    // ForCall to true here has the effect of removing WrapperRIP when possible
4326    // to allow direct calls to be selected without first materializing the
4327    // address into a register.
4328    Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4329  } else if (Subtarget.isTarget64BitILP32() &&
4330             Callee->getValueType(0) == MVT::i32) {
4331    // Zero-extend the 32-bit Callee address into a 64-bit one, per the x32 ABI.
4332    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4333  }
4334
4335  // Returns a chain & a flag for retval copy to use.
4336  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4337  SmallVector<SDValue, 8> Ops;
4338
4339  if (!IsSibcall && isTailCall && !IsMustTail) {
4340    Chain = DAG.getCALLSEQ_END(Chain,
4341                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4342                               DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4343    InFlag = Chain.getValue(1);
4344  }
4345
4346  Ops.push_back(Chain);
4347  Ops.push_back(Callee);
4348
4349  if (isTailCall)
4350    Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4351
4352  // Add argument registers to the end of the list so that they are known live
4353  // into the call.
4354  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4355    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4356                                  RegsToPass[i].second.getValueType()));
4357
4358  // Add a register mask operand representing the call-preserved registers.
4359  const uint32_t *Mask = [&]() {
4360    auto AdaptedCC = CallConv;
4361    // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4362    // use X86_INTR calling convention because it has the same CSR mask
4363    // (same preserved registers).
4364    if (HasNCSR)
4365      AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4366    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4367    // to use the CSR_NoRegs_RegMask.
4368    if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4369      AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4370    return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4371  }();
4372  assert(Mask && "Missing call preserved mask for calling convention");
4373
4374  // If this is an invoke in a 32-bit function using a funclet-based
4375  // personality, assume the function clobbers all registers. If an exception
4376  // is thrown, the runtime will not restore CSRs.
4377  // FIXME: Model this more precisely so that we can register allocate across
4378  // the normal edge and spill and fill across the exceptional edge.
4379  if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4380    const Function &CallerFn = MF.getFunction();
4381    EHPersonality Pers =
4382        CallerFn.hasPersonalityFn()
4383            ? classifyEHPersonality(CallerFn.getPersonalityFn())
4384            : EHPersonality::Unknown;
4385    if (isFuncletEHPersonality(Pers))
4386      Mask = RegInfo->getNoPreservedMask();
4387  }
4388
4389  // Define a new register mask from the existing mask.
4390  uint32_t *RegMask = nullptr;
4391
4392  // In some calling conventions we need to remove the used physical registers
4393  // from the reg mask.
4394  if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4395    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4396
4397    // Allocate a new Reg Mask and copy Mask.
4398    RegMask = MF.allocateRegMask();
4399    unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4400    memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4401
4402    // Make sure all sub registers of the argument registers are reset
4403    // in the RegMask.
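    // A register mask stores one bit per register (word Reg / 32, bit
    // Reg % 32); a set bit means "preserved", so clearing the bit marks the
    // argument register and its sub-registers as clobbered by this call.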
4404    for (auto const &RegPair : RegsToPass)
4405      for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4406           SubRegs.isValid(); ++SubRegs)
4407        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4408
4409    // Create the RegMask Operand according to our updated mask.
4410    Ops.push_back(DAG.getRegisterMask(RegMask));
4411  } else {
4412    // Create the RegMask Operand according to the static mask.
4413    Ops.push_back(DAG.getRegisterMask(Mask));
4414  }
4415
4416  if (InFlag.getNode())
4417    Ops.push_back(InFlag);
4418
4419  if (isTailCall) {
4420    // We used to do:
4421    //// If this is the first return lowered for this function, add the regs
4422    //// to the liveout set for the function.
4423    // This isn't right, although it's probably harmless on x86; liveouts
4424    // should be computed from returns not tail calls.  Consider a void
4425    // function making a tail call to a function returning int.
4426    MF.getFrameInfo().setHasTailCall();
4427    SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4428    DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4429    return Ret;
4430  }
4431
4432  if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4433    Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4434  } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4435    // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4436    // expanded to the call, directly followed by a special marker sequence and
4437    // a call to an ObjC library function. Use CALL_RVMARKER to do that.
4438    assert(!isTailCall &&
4439           "tail calls cannot be marked with clang.arc.attachedcall");
4440    assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4441
4442    // Add a target constant to select the ObjC runtime call just before the
4443    // call target. RuntimeCallType == 0 selects objc_retainAutoreleasedReturnValue
4444    // and RuntimeCallType == 1 selects objc_unsafeClaimAutoreleasedReturnValue
4445    // when expanding the pseudo.
4446    unsigned RuntimeCallType =
4447        objcarc::hasAttachedCallOpBundle(CLI.CB, true) ? 0 : 1;
4448    Ops.insert(Ops.begin() + 1,
4449               DAG.getTargetConstant(RuntimeCallType, dl, MVT::i32));
4450    Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4451  } else {
4452    Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4453  }
4454
4455  InFlag = Chain.getValue(1);
4456  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4457  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4458
4459  // Save heapallocsite metadata.
4460  if (CLI.CB)
4461    if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4462      DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4463
4464  // Create the CALLSEQ_END node.
4465  unsigned NumBytesForCalleeToPop;
4466  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4467                       DAG.getTarget().Options.GuaranteedTailCallOpt))
4468    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
4469  else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4470           !Subtarget.getTargetTriple().isOSMSVCRT() &&
4471           SR == StackStructReturn)
4472    // If this is a call to a struct-return function, the callee
4473    // pops the hidden struct pointer, so we have to push it back.
4474    // This is common for Darwin/X86, Linux & Mingw32 targets.
4475    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4476    NumBytesForCalleeToPop = 4;
4477  else
4478    NumBytesForCalleeToPop = 0;  // Callee pops nothing.
4479
4480  // Returns a flag for retval copy to use.
4481  if (!IsSibcall) {
4482    Chain = DAG.getCALLSEQ_END(Chain,
4483                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4484                               DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4485                                                     true),
4486                               InFlag, dl);
4487    InFlag = Chain.getValue(1);
4488  }
4489
4490  // Handle result values, copying them out of physregs into vregs that we
4491  // return.
4492  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4493                         InVals, RegMask);
4494}
4495
4496//===----------------------------------------------------------------------===//
4497//                Fast Calling Convention (tail call) implementation
4498//===----------------------------------------------------------------------===//
4499
4500//  Like stdcall, the callee cleans up the arguments, except that ECX is
4501//  reserved for storing the address of the tail-called function. Only 2
4502//  registers are free for argument passing (inreg). Tail call optimization is
4503//  performed provided:
4504//                * tailcallopt is enabled
4505//                * caller/callee are fastcc
4506//  On the X86_64 architecture with GOT-style position-independent code, only
4507//  local (within-module) calls are supported at the moment.
4508//  To keep the stack aligned according to the platform ABI, the function
4509//  GetAlignedArgumentStackSize ensures that the argument delta is always a
4510//  multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld
4511//  for example.) If a tail-called callee has more arguments than the caller,
4512//  the caller needs to make sure that there is room to move the RETADDR to.
4513//  This is achieved by reserving an area the size of the argument delta right
4514//  after the original RETADDR, but before the saved frame pointer or the
4515//  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
4516//  Stack layout:
4517//    arg1
4518//    arg2
4519//    RETADDR
4520//    [ new RETADDR
4521//      move area ]
4522//    (possible EBP)
4523//    ESI
4524//    EDI
4525//    local1 ..
4526
4527/// Align the stack size so that, together with the return address slot, it
4528/// satisfies the stack alignment requirement, e.g. 16n + 12 for 16-byte alignment.
4529unsigned
4530X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4531                                               SelectionDAG &DAG) const {
4532  const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4533  const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4534  assert(StackSize % SlotSize == 0 &&
4535         "StackSize must be a multiple of SlotSize");
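  // Example with a 16-byte stack alignment and 8-byte slots: a StackSize of
  // 40 stays 40 (alignTo(48, 16) - 8), while 48 is bumped to 56
  // (alignTo(56, 16) - 8 = 64 - 8).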
4536  return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4537}
4538
4539/// Return true if the given stack call argument is already available at the
4540/// same (relative) position in the caller's incoming argument stack.
4541static
4542bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4543                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4544                         const X86InstrInfo *TII, const CCValAssign &VA) {
4545  unsigned Bytes = Arg.getValueSizeInBits() / 8;
4546
4547  for (;;) {
4548    // Look through nodes that don't alter the bits of the incoming value.
4549    unsigned Op = Arg.getOpcode();
4550    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4551      Arg = Arg.getOperand(0);
4552      continue;
4553    }
4554    if (Op == ISD::TRUNCATE) {
4555      const SDValue &TruncInput = Arg.getOperand(0);
4556      if (TruncInput.getOpcode() == ISD::AssertZext &&
4557          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4558              Arg.getValueType()) {
4559        Arg = TruncInput.getOperand(0);
4560        continue;
4561      }
4562    }
4563    break;
4564  }
4565
4566  int FI = INT_MAX;
4567  if (Arg.getOpcode() == ISD::CopyFromReg) {
4568    Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4569    if (!VR.isVirtual())
4570      return false;
4571    MachineInstr *Def = MRI->getVRegDef(VR);
4572    if (!Def)
4573      return false;
4574    if (!Flags.isByVal()) {
4575      if (!TII->isLoadFromStackSlot(*Def, FI))
4576        return false;
4577    } else {
4578      unsigned Opcode = Def->getOpcode();
4579      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4580           Opcode == X86::LEA64_32r) &&
4581          Def->getOperand(1).isFI()) {
4582        FI = Def->getOperand(1).getIndex();
4583        Bytes = Flags.getByValSize();
4584      } else
4585        return false;
4586    }
4587  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4588    if (Flags.isByVal())
4589      // ByVal argument is passed in as a pointer but it's now being
4590      // dereferenced. e.g.
4591      // define @foo(%struct.X* %A) {
4592      //   tail call @bar(%struct.X* byval %A)
4593      // }
4594      return false;
4595    SDValue Ptr = Ld->getBasePtr();
4596    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4597    if (!FINode)
4598      return false;
4599    FI = FINode->getIndex();
4600  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4601    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4602    FI = FINode->getIndex();
4603    Bytes = Flags.getByValSize();
4604  } else
4605    return false;
4606
4607  assert(FI != INT_MAX);
4608  if (!MFI.isFixedObjectIndex(FI))
4609    return false;
4610
4611  if (Offset != MFI.getObjectOffset(FI))
4612    return false;
4613
4614  // If this is not byval, check that the argument stack object is immutable.
4615  // inalloca and argument copy elision can create mutable argument stack
4616  // objects. Byval objects can be mutated, but a byval call intends to pass the
4617  // mutated memory.
4618  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4619    return false;
4620
4621  if (VA.getLocVT().getFixedSizeInBits() >
4622      Arg.getValueSizeInBits().getFixedSize()) {
4623    // If the argument location is wider than the argument type, check that any
4624    // extension flags match.
4625    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4626        Flags.isSExt() != MFI.isObjectSExt(FI)) {
4627      return false;
4628    }
4629  }
4630
4631  return Bytes == MFI.getObjectSize(FI);
4632}
4633
4634/// Check whether the call is eligible for tail call optimization. Targets
4635/// that want to do tail call optimization should implement this function.
4636bool X86TargetLowering::IsEligibleForTailCallOptimization(
4637    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4638    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4639    const SmallVectorImpl<ISD::OutputArg> &Outs,
4640    const SmallVectorImpl<SDValue> &OutVals,
4641    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4642  if (!mayTailCallThisCC(CalleeCC))
4643    return false;
4644
4645  // If -tailcallopt is specified, make fastcc functions tail-callable.
4646  MachineFunction &MF = DAG.getMachineFunction();
4647  const Function &CallerF = MF.getFunction();
4648
4649  // If the function return type is x86_fp80 and the callee return type is not,
4650  // then the FP_EXTEND of the call result is not a nop. It's not safe to
4651  // perform a tailcall optimization here.
4652  if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4653    return false;
4654
4655  CallingConv::ID CallerCC = CallerF.getCallingConv();
4656  bool CCMatch = CallerCC == CalleeCC;
4657  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4658  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4659  bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4660      CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4661
4662  // Win64 functions have extra shadow space for argument homing. Don't do the
4663  // sibcall if the caller and callee have mismatched expectations for this
4664  // space.
4665  if (IsCalleeWin64 != IsCallerWin64)
4666    return false;
4667
4668  if (IsGuaranteeTCO) {
4669    if (canGuaranteeTCO(CalleeCC) && CCMatch)
4670      return true;
4671    return false;
4672  }
4673
4674  // Look for obvious safe cases to perform tail call optimization that do not
4675  // require ABI changes. This is what gcc calls sibcall.
4676
4677  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4678  // emit a special epilogue.
4679  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4680  if (RegInfo->hasStackRealignment(MF))
4681    return false;
4682
4683  // Also avoid sibcall optimization if either caller or callee uses struct
4684  // return semantics.
4685  if (isCalleeStructRet || isCallerStructRet)
4686    return false;
4687
4688  // Do not sibcall optimize vararg calls unless all arguments are passed via
4689  // registers.
4690  LLVMContext &C = *DAG.getContext();
4691  if (isVarArg && !Outs.empty()) {
4692    // Optimizing for varargs on Win64 is unlikely to be safe without
4693    // additional testing.
4694    if (IsCalleeWin64 || IsCallerWin64)
4695      return false;
4696
4697    SmallVector<CCValAssign, 16> ArgLocs;
4698    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4699
4700    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4701    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4702      if (!ArgLocs[i].isRegLoc())
4703        return false;
4704  }
4705
4706  // If the call result is in ST0 / ST1, it needs to be popped off the x87
4707  // stack.  Therefore, if it's not used by the call it is not safe to optimize
4708  // this into a sibcall.
4709  bool Unused = false;
4710  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4711    if (!Ins[i].Used) {
4712      Unused = true;
4713      break;
4714    }
4715  }
4716  if (Unused) {
4717    SmallVector<CCValAssign, 16> RVLocs;
4718    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4719    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4720    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4721      CCValAssign &VA = RVLocs[i];
4722      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4723        return false;
4724    }
4725  }
4726
4727  // Check that the call results are passed in the same way.
4728  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4729                                  RetCC_X86, RetCC_X86))
4730    return false;
4731  // The callee has to preserve all registers the caller needs to preserve.
4732  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4733  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4734  if (!CCMatch) {
4735    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4736    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4737      return false;
4738  }
4739
4740  unsigned StackArgsSize = 0;
4741
4742  // If the callee takes no arguments then go on to check the results of the
4743  // call.
4744  if (!Outs.empty()) {
4745    // Check if stack adjustment is needed. For now, do not do this if any
4746    // argument is passed on the stack.
4747    SmallVector<CCValAssign, 16> ArgLocs;
4748    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4749
4750    // Allocate shadow area for Win64
4751    if (IsCalleeWin64)
4752      CCInfo.AllocateStack(32, Align(8));
4753
4754    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4755    StackArgsSize = CCInfo.getNextStackOffset();
4756
4757    if (CCInfo.getNextStackOffset()) {
4758      // Check if the arguments are already laid out in the right way as
4759      // the caller's fixed stack objects.
4760      MachineFrameInfo &MFI = MF.getFrameInfo();
4761      const MachineRegisterInfo *MRI = &MF.getRegInfo();
4762      const X86InstrInfo *TII = Subtarget.getInstrInfo();
4763      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4764        CCValAssign &VA = ArgLocs[i];
4765        SDValue Arg = OutVals[i];
4766        ISD::ArgFlagsTy Flags = Outs[i].Flags;
4767        if (VA.getLocInfo() == CCValAssign::Indirect)
4768          return false;
4769        if (!VA.isRegLoc()) {
4770          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4771                                   MFI, MRI, TII, VA))
4772            return false;
4773        }
4774      }
4775    }
4776
4777    bool PositionIndependent = isPositionIndependent();
4778    // If the tailcall address may be in a register, then make sure it's
4779    // possible to register allocate for it. In 32-bit, the call address can
4780    // only target EAX, EDX, or ECX since the tail call must be scheduled after
4781    // callee-saved registers are restored. These happen to be the same
4782    // registers used to pass 'inreg' arguments so watch out for those.
4783    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4784                                  !isa<ExternalSymbolSDNode>(Callee)) ||
4785                                 PositionIndependent)) {
4786      unsigned NumInRegs = 0;
4787      // In PIC we need an extra register to formulate the address computation
4788      // for the callee.
4789      unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4790
4791      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4792        CCValAssign &VA = ArgLocs[i];
4793        if (!VA.isRegLoc())
4794          continue;
4795        Register Reg = VA.getLocReg();
4796        switch (Reg) {
4797        default: break;
4798        case X86::EAX: case X86::EDX: case X86::ECX:
4799          if (++NumInRegs == MaxInRegs)
4800            return false;
4801          break;
4802        }
4803      }
4804    }
4805
4806    const MachineRegisterInfo &MRI = MF.getRegInfo();
4807    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4808      return false;
4809  }
4810
4811  bool CalleeWillPop =
4812      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4813                       MF.getTarget().Options.GuaranteedTailCallOpt);
4814
4815  if (unsigned BytesToPop =
4816          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4817    // If we have bytes to pop, the callee must pop them.
4818    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4819    if (!CalleePopMatches)
4820      return false;
4821  } else if (CalleeWillPop && StackArgsSize > 0) {
4822    // If we don't have bytes to pop, make sure the callee doesn't pop any.
4823    return false;
4824  }
4825
4826  return true;
4827}
4828
4829FastISel *
4830X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4831                                  const TargetLibraryInfo *libInfo) const {
4832  return X86::createFastISel(funcInfo, libInfo);
4833}
4834
4835//===----------------------------------------------------------------------===//
4836//                           Other Lowering Hooks
4837//===----------------------------------------------------------------------===//
4838
4839static bool MayFoldLoad(SDValue Op) {
4840  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4841}
4842
4843static bool MayFoldIntoStore(SDValue Op) {
4844  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4845}
4846
4847static bool MayFoldIntoZeroExtend(SDValue Op) {
4848  if (Op.hasOneUse()) {
4849    unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4850    return (ISD::ZERO_EXTEND == Opcode);
4851  }
4852  return false;
4853}
4854
4855static bool isTargetShuffle(unsigned Opcode) {
4856  switch(Opcode) {
4857  default: return false;
4858  case X86ISD::BLENDI:
4859  case X86ISD::PSHUFB:
4860  case X86ISD::PSHUFD:
4861  case X86ISD::PSHUFHW:
4862  case X86ISD::PSHUFLW:
4863  case X86ISD::SHUFP:
4864  case X86ISD::INSERTPS:
4865  case X86ISD::EXTRQI:
4866  case X86ISD::INSERTQI:
4867  case X86ISD::VALIGN:
4868  case X86ISD::PALIGNR:
4869  case X86ISD::VSHLDQ:
4870  case X86ISD::VSRLDQ:
4871  case X86ISD::MOVLHPS:
4872  case X86ISD::MOVHLPS:
4873  case X86ISD::MOVSHDUP:
4874  case X86ISD::MOVSLDUP:
4875  case X86ISD::MOVDDUP:
4876  case X86ISD::MOVSS:
4877  case X86ISD::MOVSD:
4878  case X86ISD::UNPCKL:
4879  case X86ISD::UNPCKH:
4880  case X86ISD::VBROADCAST:
4881  case X86ISD::VPERMILPI:
4882  case X86ISD::VPERMILPV:
4883  case X86ISD::VPERM2X128:
4884  case X86ISD::SHUF128:
4885  case X86ISD::VPERMIL2:
4886  case X86ISD::VPERMI:
4887  case X86ISD::VPPERM:
4888  case X86ISD::VPERMV:
4889  case X86ISD::VPERMV3:
4890  case X86ISD::VZEXT_MOVL:
4891    return true;
4892  }
4893}
4894
4895static bool isTargetShuffleVariableMask(unsigned Opcode) {
4896  switch (Opcode) {
4897  default: return false;
4898  // Target Shuffles.
4899  case X86ISD::PSHUFB:
4900  case X86ISD::VPERMILPV:
4901  case X86ISD::VPERMIL2:
4902  case X86ISD::VPPERM:
4903  case X86ISD::VPERMV:
4904  case X86ISD::VPERMV3:
4905    return true;
4906  // 'Faux' Target Shuffles.
4907  case ISD::OR:
4908  case ISD::AND:
4909  case X86ISD::ANDNP:
4910    return true;
4911  }
4912}
4913
4914static bool isTargetShuffleSplat(SDValue Op) {
4915  unsigned Opcode = Op.getOpcode();
4916  if (Opcode == ISD::EXTRACT_SUBVECTOR)
4917    return isTargetShuffleSplat(Op.getOperand(0));
4918  return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
4919}
4920
4921SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4922  MachineFunction &MF = DAG.getMachineFunction();
4923  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4924  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4925  int ReturnAddrIndex = FuncInfo->getRAIndex();
4926
4927  if (ReturnAddrIndex == 0) {
4928    // Set up a frame object for the return address.
4929    unsigned SlotSize = RegInfo->getSlotSize();
4930    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4931                                                          -(int64_t)SlotSize,
4932                                                          false);
4933    FuncInfo->setRAIndex(ReturnAddrIndex);
4934  }
4935
4936  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4937}
4938
4939bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4940                                       bool hasSymbolicDisplacement) {
  // The offset should fit into a 32-bit immediate field.
4942  if (!isInt<32>(Offset))
4943    return false;
4944
4945  // If we don't have a symbolic displacement - we don't have any extra
4946  // restrictions.
4947  if (!hasSymbolicDisplacement)
4948    return true;
4949
4950  // FIXME: Some tweaks might be needed for medium code model.
4951  if (M != CodeModel::Small && M != CodeModel::Kernel)
4952    return false;
4953
  // For the small code model we assume that the last object is 16MB below the
  // end of the 31-bit boundary. We may also accept pretty large negative
  // constants, knowing that all objects are in the positive half of the
  // address space.
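  // For example, a symbolic displacement of 0x00FFFFFF (just under 16MB) is
  // accepted here, while 0x7FFFFFFF is rejected because symbol + offset could
  // then run past the 31-bit boundary.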
4957  if (M == CodeModel::Small && Offset < 16*1024*1024)
4958    return true;
4959
  // For the kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space. We must not accept negative offsets,
  // since they may fall just outside that range, but we may accept pretty
  // large positive ones.
4963  if (M == CodeModel::Kernel && Offset >= 0)
4964    return true;
4965
4966  return false;
4967}
4968
4969/// Determines whether the callee is required to pop its own arguments.
4970/// Callee pop is necessary to support tail calls.
4971bool X86::isCalleePop(CallingConv::ID CallingConv,
4972                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4973  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4974  // can guarantee TCO.
4975  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4976    return true;
4977
4978  switch (CallingConv) {
4979  default:
4980    return false;
4981  case CallingConv::X86_StdCall:
4982  case CallingConv::X86_FastCall:
4983  case CallingConv::X86_ThisCall:
4984  case CallingConv::X86_VectorCall:
4985    return !is64Bit;
4986  }
4987}
4988
/// Return true if the condition is a signed comparison operation.
4990static bool isX86CCSigned(unsigned X86CC) {
4991  switch (X86CC) {
4992  default:
4993    llvm_unreachable("Invalid integer condition!");
4994  case X86::COND_E:
4995  case X86::COND_NE:
4996  case X86::COND_B:
4997  case X86::COND_A:
4998  case X86::COND_BE:
4999  case X86::COND_AE:
5000    return false;
5001  case X86::COND_G:
5002  case X86::COND_GE:
5003  case X86::COND_L:
5004  case X86::COND_LE:
5005    return true;
5006  }
5007}
5008
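// For example, an unsigned ISD::SETULT maps to X86::COND_B ("below") and a
// signed ISD::SETLT maps to X86::COND_L ("less").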
5009static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5010  switch (SetCCOpcode) {
5011  default: llvm_unreachable("Invalid integer condition!");
5012  case ISD::SETEQ:  return X86::COND_E;
5013  case ISD::SETGT:  return X86::COND_G;
5014  case ISD::SETGE:  return X86::COND_GE;
5015  case ISD::SETLT:  return X86::COND_L;
5016  case ISD::SETLE:  return X86::COND_LE;
5017  case ISD::SETNE:  return X86::COND_NE;
5018  case ISD::SETULT: return X86::COND_B;
5019  case ISD::SETUGT: return X86::COND_A;
5020  case ISD::SETULE: return X86::COND_BE;
5021  case ISD::SETUGE: return X86::COND_AE;
5022  }
5023}
5024
/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
5026/// condition code, returning the condition code and the LHS/RHS of the
5027/// comparison to make.
5028static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5029                               bool isFP, SDValue &LHS, SDValue &RHS,
5030                               SelectionDAG &DAG) {
5031  if (!isFP) {
5032    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5033      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
5034        // X > -1   -> X == 0, jump !sign.
5035        RHS = DAG.getConstant(0, DL, RHS.getValueType());
5036        return X86::COND_NS;
5037      }
5038      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
5039        // X < 0   -> X == 0, jump on sign.
5040        return X86::COND_S;
5041      }
5042      if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
5043        // X >= 0   -> X == 0, jump on !sign.
5044        return X86::COND_NS;
5045      }
5046      if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5047        // X < 1   -> X <= 0
5048        RHS = DAG.getConstant(0, DL, RHS.getValueType());
5049        return X86::COND_LE;
5050      }
5051    }
5052
5053    return TranslateIntegerX86CC(SetCCOpcode);
5054  }
5055
5056  // First determine if it is required or is profitable to flip the operands.
5057
5058  // If LHS is a foldable load, but RHS is not, flip the condition.
5059  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5060      !ISD::isNON_EXTLoad(RHS.getNode())) {
5061    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5062    std::swap(LHS, RHS);
5063  }
5064
5065  switch (SetCCOpcode) {
5066  default: break;
5067  case ISD::SETOLT:
5068  case ISD::SETOLE:
5069  case ISD::SETUGT:
5070  case ISD::SETUGE:
5071    std::swap(LHS, RHS);
5072    break;
5073  }
5074
5075  // On a floating point condition, the flags are set as follows:
5076  // ZF  PF  CF   op
5077  //  0 | 0 | 0 | X > Y
5078  //  0 | 0 | 1 | X < Y
5079  //  1 | 0 | 0 | X == Y
5080  //  1 | 1 | 1 | unordered
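  // For example, X86::COND_A tests CF == 0 && ZF == 0, so an unordered
  // compare (which sets ZF, PF and CF) correctly fails the "greater than"
  // checks lowered to COND_A below.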
5081  switch (SetCCOpcode) {
5082  default: llvm_unreachable("Condcode should be pre-legalized away");
5083  case ISD::SETUEQ:
5084  case ISD::SETEQ:   return X86::COND_E;
5085  case ISD::SETOLT:              // flipped
5086  case ISD::SETOGT:
5087  case ISD::SETGT:   return X86::COND_A;
5088  case ISD::SETOLE:              // flipped
5089  case ISD::SETOGE:
5090  case ISD::SETGE:   return X86::COND_AE;
5091  case ISD::SETUGT:              // flipped
5092  case ISD::SETULT:
5093  case ISD::SETLT:   return X86::COND_B;
5094  case ISD::SETUGE:              // flipped
5095  case ISD::SETULE:
5096  case ISD::SETLE:   return X86::COND_BE;
5097  case ISD::SETONE:
5098  case ISD::SETNE:   return X86::COND_NE;
5099  case ISD::SETUO:   return X86::COND_P;
5100  case ISD::SETO:    return X86::COND_NP;
5101  case ISD::SETOEQ:
5102  case ISD::SETUNE:  return X86::COND_INVALID;
5103  }
5104}
5105
5106/// Is there a floating point cmov for the specific X86 condition code?
/// The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5109static bool hasFPCMov(unsigned X86CC) {
5110  switch (X86CC) {
5111  default:
5112    return false;
5113  case X86::COND_B:
5114  case X86::COND_BE:
5115  case X86::COND_E:
5116  case X86::COND_P:
5117  case X86::COND_A:
5118  case X86::COND_AE:
5119  case X86::COND_NE:
5120  case X86::COND_NP:
5121    return true;
5122  }
5123}
5124
5125
5126bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5127                                           const CallInst &I,
5128                                           MachineFunction &MF,
5129                                           unsigned Intrinsic) const {
5130  Info.flags = MachineMemOperand::MONone;
5131  Info.offset = 0;
5132
5133  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5134  if (!IntrData) {
5135    switch (Intrinsic) {
5136    case Intrinsic::x86_aesenc128kl:
5137    case Intrinsic::x86_aesdec128kl:
5138      Info.opc = ISD::INTRINSIC_W_CHAIN;
5139      Info.ptrVal = I.getArgOperand(1);
5140      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5141      Info.align = Align(1);
5142      Info.flags |= MachineMemOperand::MOLoad;
5143      return true;
5144    case Intrinsic::x86_aesenc256kl:
5145    case Intrinsic::x86_aesdec256kl:
5146      Info.opc = ISD::INTRINSIC_W_CHAIN;
5147      Info.ptrVal = I.getArgOperand(1);
5148      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5149      Info.align = Align(1);
5150      Info.flags |= MachineMemOperand::MOLoad;
5151      return true;
5152    case Intrinsic::x86_aesencwide128kl:
5153    case Intrinsic::x86_aesdecwide128kl:
5154      Info.opc = ISD::INTRINSIC_W_CHAIN;
5155      Info.ptrVal = I.getArgOperand(0);
5156      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5157      Info.align = Align(1);
5158      Info.flags |= MachineMemOperand::MOLoad;
5159      return true;
5160    case Intrinsic::x86_aesencwide256kl:
5161    case Intrinsic::x86_aesdecwide256kl:
5162      Info.opc = ISD::INTRINSIC_W_CHAIN;
5163      Info.ptrVal = I.getArgOperand(0);
5164      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5165      Info.align = Align(1);
5166      Info.flags |= MachineMemOperand::MOLoad;
5167      return true;
5168    }
5169    return false;
5170  }
5171
5172  switch (IntrData->Type) {
5173  case TRUNCATE_TO_MEM_VI8:
5174  case TRUNCATE_TO_MEM_VI16:
5175  case TRUNCATE_TO_MEM_VI32: {
5176    Info.opc = ISD::INTRINSIC_VOID;
5177    Info.ptrVal = I.getArgOperand(0);
5178    MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
5179    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5180    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5181      ScalarVT = MVT::i8;
5182    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5183      ScalarVT = MVT::i16;
5184    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5185      ScalarVT = MVT::i32;
5186
5187    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5188    Info.align = Align(1);
5189    Info.flags |= MachineMemOperand::MOStore;
5190    break;
5191  }
5192  case GATHER:
5193  case GATHER_AVX2: {
5194    Info.opc = ISD::INTRINSIC_W_CHAIN;
5195    Info.ptrVal = nullptr;
5196    MVT DataVT = MVT::getVT(I.getType());
5197    MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5198    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5199                                IndexVT.getVectorNumElements());
5200    Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5201    Info.align = Align(1);
5202    Info.flags |= MachineMemOperand::MOLoad;
5203    break;
5204  }
5205  case SCATTER: {
5206    Info.opc = ISD::INTRINSIC_VOID;
5207    Info.ptrVal = nullptr;
5208    MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5209    MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5210    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5211                                IndexVT.getVectorNumElements());
5212    Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5213    Info.align = Align(1);
5214    Info.flags |= MachineMemOperand::MOStore;
5215    break;
5216  }
5217  default:
5218    return false;
5219  }
5220
5221  return true;
5222}
5223
5224/// Returns true if the target can instruction select the
5225/// specified FP immediate natively. If false, the legalizer will
5226/// materialize the FP immediate as a load from a constant pool.
5227bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5228                                     bool ForCodeSize) const {
5229  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5230    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5231      return true;
5232  }
5233  return false;
5234}
5235
5236bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5237                                              ISD::LoadExtType ExtTy,
5238                                              EVT NewVT) const {
5239  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5240
5241  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocations must target a movq or addq instruction: don't let the load
  // shrink.
5243  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5244  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5245    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5246      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5247
  // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
5249  // those uses are extracted directly into a store, then the extract + store
5250  // can be store-folded. Therefore, it's probably not worth splitting the load.
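  // For example, a 256-bit load feeding two extract_subvector nodes that are
  // each only used by a store can keep the wide load and fold the extracts
  // into the stores, so we return false below and leave it alone.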
5251  EVT VT = Load->getValueType(0);
5252  if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5253    for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5254      // Skip uses of the chain value. Result 0 of the node is the load value.
5255      if (UI.getUse().getResNo() != 0)
5256        continue;
5257
5258      // If this use is not an extract + store, it's probably worth splitting.
5259      if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5260          UI->use_begin()->getOpcode() != ISD::STORE)
5261        return true;
5262    }
5263    // All non-chain uses are extract + store.
5264    return false;
5265  }
5266
5267  return true;
5268}
5269
5270/// Returns true if it is beneficial to convert a load of a constant
5271/// to just the constant itself.
5272bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5273                                                          Type *Ty) const {
5274  assert(Ty->isIntegerTy());
5275
5276  unsigned BitSize = Ty->getPrimitiveSizeInBits();
5277  if (BitSize == 0 || BitSize > 64)
5278    return false;
5279  return true;
5280}
5281
5282bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5283  // If we are using XMM registers in the ABI and the condition of the select is
5284  // a floating-point compare and we have blendv or conditional move, then it is
5285  // cheaper to select instead of doing a cross-register move and creating a
5286  // load that depends on the compare result.
5287  bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5288  return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5289}
5290
5291bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5292  // TODO: It might be a win to ease or lift this restriction, but the generic
5293  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5294  if (VT.isVector() && Subtarget.hasAVX512())
5295    return false;
5296
5297  return true;
5298}
5299
5300bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5301                                               SDValue C) const {
5302  // TODO: We handle scalars using custom code, but generic combining could make
5303  // that unnecessary.
5304  APInt MulC;
5305  if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5306    return false;
5307
  // Find the type this will be legalized to. Otherwise we might prematurely
5309  // convert this to shl+add/sub and then still have to type legalize those ops.
5310  // Another choice would be to defer the decision for illegal types until
5311  // after type legalization. But constant splat vectors of i64 can't make it
5312  // through type legalization on 32-bit targets so we would need to special
5313  // case vXi64.
5314  while (getTypeAction(Context, VT) != TypeLegal)
5315    VT = getTypeToTransformTo(Context, VT);
5316
5317  // If vector multiply is legal, assume that's faster than shl + add/sub.
5318  // TODO: Multiply is a complex op with higher latency and lower throughput in
5319  //       most implementations, so this check could be loosened based on type
5320  //       and/or a CPU attribute.
5321  if (isOperationLegal(ISD::MUL, VT))
5322    return false;
5323
5324  // shl+add, shl+sub, shl+add+neg
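  // e.g. a splat multiply by 17 becomes shl+add (16 + 1), and a splat
  // multiply by 7 becomes shl+sub (8 - 1), assuming the vector multiply is
  // not legal for the legalized type.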
5325  return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5326         (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5327}
5328
5329bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5330                                                unsigned Index) const {
5331  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5332    return false;
5333
5334  // Mask vectors support all subregister combinations and operations that
  // extract half of a vector.
5336  if (ResVT.getVectorElementType() == MVT::i1)
5337    return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5338                          (Index == ResVT.getVectorNumElements()));
5339
5340  return (Index % ResVT.getVectorNumElements()) == 0;
5341}
5342
5343bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5344  unsigned Opc = VecOp.getOpcode();
5345
5346  // Assume target opcodes can't be scalarized.
5347  // TODO - do we have any exceptions?
5348  if (Opc >= ISD::BUILTIN_OP_END)
5349    return false;
5350
5351  // If the vector op is not supported, try to convert to scalar.
5352  EVT VecVT = VecOp.getValueType();
5353  if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5354    return true;
5355
5356  // If the vector op is supported, but the scalar op is not, the transform may
5357  // not be worthwhile.
5358  EVT ScalarVT = VecVT.getScalarType();
5359  return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5360}
5361
5362bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5363                                             bool) const {
5364  // TODO: Allow vectors?
5365  if (VT.isVector())
5366    return false;
5367  return VT.isSimple() || !isOperationExpand(Opcode, VT);
5368}
5369
5370bool X86TargetLowering::isCheapToSpeculateCttz() const {
5371  // Speculate cttz only if we can directly use TZCNT.
5372  return Subtarget.hasBMI();
5373}
5374
5375bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5376  // Speculate ctlz only if we can directly use LZCNT.
5377  return Subtarget.hasLZCNT();
5378}
5379
5380bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5381                                                const SelectionDAG &DAG,
5382                                                const MachineMemOperand &MMO) const {
5383  if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5384      BitcastVT.getVectorElementType() == MVT::i1)
5385    return false;
5386
5387  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5388    return false;
5389
5390  // If both types are legal vectors, it's always ok to convert them.
5391  if (LoadVT.isVector() && BitcastVT.isVector() &&
5392      isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5393    return true;
5394
5395  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5396}
5397
5398bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5399                                         const SelectionDAG &DAG) const {
  // Do not merge to float value size (128 bits) if the NoImplicitFloat
  // attribute is set.
5402  bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
5403      Attribute::NoImplicitFloat);
5404
5405  if (NoFloat) {
5406    unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5407    return (MemVT.getSizeInBits() <= MaxIntSize);
5408  }
  // Make sure we don't merge to a size greater than our preferred vector
  // width.
5411  if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5412    return false;
5413
5414  return true;
5415}
5416
5417bool X86TargetLowering::isCtlzFast() const {
5418  return Subtarget.hasFastLZCNT();
5419}
5420
5421bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5422    const Instruction &AndI) const {
5423  return true;
5424}
5425
5426bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5427  EVT VT = Y.getValueType();
5428
5429  if (VT.isVector())
5430    return false;
5431
5432  if (!Subtarget.hasBMI())
5433    return false;
5434
5435  // There are only 32-bit and 64-bit forms for 'andn'.
5436  if (VT != MVT::i32 && VT != MVT::i64)
5437    return false;
5438
5439  return !isa<ConstantSDNode>(Y);
5440}
5441
5442bool X86TargetLowering::hasAndNot(SDValue Y) const {
5443  EVT VT = Y.getValueType();
5444
5445  if (!VT.isVector())
5446    return hasAndNotCompare(Y);
5447
5448  // Vector.
5449
5450  if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5451    return false;
5452
5453  if (VT == MVT::v4i32)
5454    return true;
5455
5456  return Subtarget.hasSSE2();
5457}
5458
5459bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5460  return X.getValueType().isScalarInteger(); // 'bt'
5461}
5462
5463bool X86TargetLowering::
5464    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5465        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5466        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5467        SelectionDAG &DAG) const {
  // Does the baseline implementation recommend not performing the fold by
  // default?
5469  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5470          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5471    return false;
5472  // For scalars this transform is always beneficial.
5473  if (X.getValueType().isScalarInteger())
5474    return true;
5475  // If all the shift amounts are identical, then transform is beneficial even
5476  // with rudimentary SSE2 shifts.
5477  if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5478    return true;
  // If we have AVX2 with its powerful shift operations, then it's also good.
5480  if (Subtarget.hasAVX2())
5481    return true;
5482  // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5483  return NewShiftOpcode == ISD::SHL;
5484}
5485
5486bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5487    const SDNode *N, CombineLevel Level) const {
5488  assert(((N->getOpcode() == ISD::SHL &&
5489           N->getOperand(0).getOpcode() == ISD::SRL) ||
5490          (N->getOpcode() == ISD::SRL &&
5491           N->getOperand(0).getOpcode() == ISD::SHL)) &&
5492         "Expected shift-shift mask");
5493  EVT VT = N->getValueType(0);
5494  if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5495      (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5496    // Only fold if the shift values are equal - so it folds to AND.
5497    // TODO - we should fold if either is a non-uniform vector but we don't do
5498    // the fold for non-splats yet.
5499    return N->getOperand(1) == N->getOperand(0).getOperand(1);
5500  }
5501  return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5502}
5503
5504bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5505  EVT VT = Y.getValueType();
5506
5507  // For vectors, we don't have a preference, but we probably want a mask.
5508  if (VT.isVector())
5509    return false;
5510
5511  // 64-bit shifts on 32-bit targets produce really bad bloated code.
5512  if (VT == MVT::i64 && !Subtarget.is64Bit())
5513    return false;
5514
5515  return true;
5516}
5517
5518bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5519                                          SDNode *N) const {
5520  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5521      !Subtarget.isOSWindows())
5522    return false;
5523  return true;
5524}
5525
5526bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5527  // Any legal vector type can be splatted more efficiently than
5528  // loading/spilling from memory.
5529  return isTypeLegal(VT);
5530}
5531
5532MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5533  MVT VT = MVT::getIntegerVT(NumBits);
5534  if (isTypeLegal(VT))
5535    return VT;
5536
5537  // PMOVMSKB can handle this.
5538  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5539    return MVT::v16i8;
5540
5541  // VPMOVMSKB can handle this.
5542  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5543    return MVT::v32i8;
5544
5545  // TODO: Allow 64-bit type for 32-bit target.
5546  // TODO: 512-bit types should be allowed, but make sure that those
5547  // cases are handled in combineVectorSizedSetCCEquality().
5548
5549  return MVT::INVALID_SIMPLE_VALUE_TYPE;
5550}
5551
5552/// Val is the undef sentinel value or equal to the specified value.
5553static bool isUndefOrEqual(int Val, int CmpVal) {
5554  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5555}
5556
5557/// Return true if every element in Mask is the undef sentinel value or equal to
/// the specified value.
5559static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5560  return llvm::all_of(Mask, [CmpVal](int M) {
5561    return (M == SM_SentinelUndef) || (M == CmpVal);
5562  });
5563}
5564
5565/// Val is either the undef or zero sentinel value.
5566static bool isUndefOrZero(int Val) {
5567  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5568}
5569
5570/// Return true if every element in Mask, beginning from position Pos and ending
5571/// in Pos+Size is the undef sentinel value.
5572static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5573  return llvm::all_of(Mask.slice(Pos, Size),
5574                      [](int M) { return M == SM_SentinelUndef; });
5575}
5576
5577/// Return true if the mask creates a vector whose lower half is undefined.
5578static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5579  unsigned NumElts = Mask.size();
5580  return isUndefInRange(Mask, 0, NumElts / 2);
5581}
5582
5583/// Return true if the mask creates a vector whose upper half is undefined.
5584static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5585  unsigned NumElts = Mask.size();
5586  return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5587}
5588
/// Return true if Val falls within the specified range [Low, Hi).
5590static bool isInRange(int Val, int Low, int Hi) {
5591  return (Val >= Low && Val < Hi);
5592}
5593
5594/// Return true if the value of any element in Mask falls within the specified
/// range [Low, Hi).
5596static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5597  return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5598}
5599
5600/// Return true if the value of any element in Mask is the zero sentinel value.
5601static bool isAnyZero(ArrayRef<int> Mask) {
5602  return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5603}
5604
5605/// Return true if the value of any element in Mask is the zero or undef
5606/// sentinel values.
5607static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5608  return llvm::any_of(Mask, [](int M) {
5609    return M == SM_SentinelZero || M == SM_SentinelUndef;
5610  });
5611}
5612
5613/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
5615static bool isUndefOrInRange(int Val, int Low, int Hi) {
5616  return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5617}
5618
5619/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
5621static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5622  return llvm::all_of(
5623      Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5624}
5625
5626/// Return true if Val is undef, zero or if its value falls within the
/// specified range [Low, Hi).
5628static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5629  return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5630}
5631
5632/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
5634static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5635  return llvm::all_of(
5636      Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5637}
5638
5639/// Return true if every element in Mask, beginning
5640/// from position Pos and ending in Pos + Size, falls within the specified
5641/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
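/// For example, Mask = {4, -1, 6, 7} with Pos = 0, Size = 4, Low = 4 and the
/// default Step of 1 returns true.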
5642static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5643                                       unsigned Size, int Low, int Step = 1) {
5644  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5645    if (!isUndefOrEqual(Mask[i], Low))
5646      return false;
5647  return true;
5648}
5649
5650/// Return true if every element in Mask, beginning
5651/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low + Size), or is undef or is zero.
5653static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5654                                             unsigned Size, int Low,
5655                                             int Step = 1) {
5656  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5657    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5658      return false;
5659  return true;
5660}
5661
5662/// Return true if every element in Mask, beginning
5663/// from position Pos and ending in Pos+Size is undef or is zero.
5664static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5665                                 unsigned Size) {
5666  return llvm::all_of(Mask.slice(Pos, Size),
5667                      [](int M) { return isUndefOrZero(M); });
5668}
5669
5670/// Helper function to test whether a shuffle mask could be
5671/// simplified by widening the elements being shuffled.
5672///
5673/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5674/// leaves it in an unspecified state.
5675///
5676/// NOTE: This must handle normal vector shuffle masks and *target* vector
5677/// shuffle masks. The latter have the special property of a '-2' representing
/// a zeroed lane of a vector.
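/// For example, the v8i16 mask {0, 1, 4, 5, -1, -1, 6, 7} widens to the
/// v4i32 mask {0, 2, -1, 3}.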
5679static bool canWidenShuffleElements(ArrayRef<int> Mask,
5680                                    SmallVectorImpl<int> &WidenedMask) {
5681  WidenedMask.assign(Mask.size() / 2, 0);
5682  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5683    int M0 = Mask[i];
5684    int M1 = Mask[i + 1];
5685
    // If both elements are undef, it's trivial.
5687    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5688      WidenedMask[i / 2] = SM_SentinelUndef;
5689      continue;
5690    }
5691
5692    // Check for an undef mask and a mask value properly aligned to fit with
5693    // a pair of values. If we find such a case, use the non-undef mask's value.
5694    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5695      WidenedMask[i / 2] = M1 / 2;
5696      continue;
5697    }
5698    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5699      WidenedMask[i / 2] = M0 / 2;
5700      continue;
5701    }
5702
5703    // When zeroing, we need to spread the zeroing across both lanes to widen.
5704    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5705      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5706          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5707        WidenedMask[i / 2] = SM_SentinelZero;
5708        continue;
5709      }
5710      return false;
5711    }
5712
5713    // Finally check if the two mask values are adjacent and aligned with
5714    // a pair.
5715    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5716      WidenedMask[i / 2] = M0 / 2;
5717      continue;
5718    }
5719
5720    // Otherwise we can't safely widen the elements used in this shuffle.
5721    return false;
5722  }
5723  assert(WidenedMask.size() == Mask.size() / 2 &&
5724         "Incorrect size of mask after widening the elements!");
5725
5726  return true;
5727}
5728
5729static bool canWidenShuffleElements(ArrayRef<int> Mask,
5730                                    const APInt &Zeroable,
5731                                    bool V2IsZero,
5732                                    SmallVectorImpl<int> &WidenedMask) {
5733  // Create an alternative mask with info about zeroable elements.
5734  // Here we do not set undef elements as zeroable.
5735  SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5736  if (V2IsZero) {
5737    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
5738    for (int i = 0, Size = Mask.size(); i != Size; ++i)
5739      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5740        ZeroableMask[i] = SM_SentinelZero;
5741  }
5742  return canWidenShuffleElements(ZeroableMask, WidenedMask);
5743}
5744
5745static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5746  SmallVector<int, 32> WidenedMask;
5747  return canWidenShuffleElements(Mask, WidenedMask);
5748}
5749
5750// Attempt to narrow/widen shuffle mask until it matches the target number of
5751// elements.
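// For example, scaling the mask {0, 1, 2, 3} up to 8 elements yields
// {0, 1, 2, 3, 4, 5, 6, 7}, while scaling it down to 2 elements yields {0, 1}.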
5752static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
5753                                 SmallVectorImpl<int> &ScaledMask) {
5754  unsigned NumSrcElts = Mask.size();
5755  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
5756         "Illegal shuffle scale factor");
5757
5758  // Narrowing is guaranteed to work.
5759  if (NumDstElts >= NumSrcElts) {
5760    int Scale = NumDstElts / NumSrcElts;
5761    llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
5762    return true;
5763  }
5764
5765  // We have to repeat the widening until we reach the target size, but we can
5766  // split out the first widening as it sets up ScaledMask for us.
5767  if (canWidenShuffleElements(Mask, ScaledMask)) {
5768    while (ScaledMask.size() > NumDstElts) {
5769      SmallVector<int, 16> WidenedMask;
5770      if (!canWidenShuffleElements(ScaledMask, WidenedMask))
5771        return false;
5772      ScaledMask = std::move(WidenedMask);
5773    }
5774    return true;
5775  }
5776
5777  return false;
5778}
5779
5780/// Returns true if Elt is a constant zero or a floating point constant +0.0.
5781bool X86::isZeroNode(SDValue Elt) {
5782  return isNullConstant(Elt) || isNullFPConstant(Elt);
5783}
5784
5785// Build a vector of constants.
5786// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in 32-bit mode.
5788static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5789                              const SDLoc &dl, bool IsMask = false) {
5790
5791  SmallVector<SDValue, 32>  Ops;
5792  bool Split = false;
5793
5794  MVT ConstVecVT = VT;
5795  unsigned NumElts = VT.getVectorNumElements();
5796  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5797  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5798    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5799    Split = true;
5800  }
5801
5802  MVT EltVT = ConstVecVT.getVectorElementType();
5803  for (unsigned i = 0; i < NumElts; ++i) {
5804    bool IsUndef = Values[i] < 0 && IsMask;
5805    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5806      DAG.getConstant(Values[i], dl, EltVT);
5807    Ops.push_back(OpNode);
5808    if (Split)
5809      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5810                    DAG.getConstant(0, dl, EltVT));
5811  }
5812  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5813  if (Split)
5814    ConstsNode = DAG.getBitcast(VT, ConstsNode);
5815  return ConstsNode;
5816}
5817
5818static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5819                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5820  assert(Bits.size() == Undefs.getBitWidth() &&
5821         "Unequal constant and undef arrays");
5822  SmallVector<SDValue, 32> Ops;
5823  bool Split = false;
5824
5825  MVT ConstVecVT = VT;
5826  unsigned NumElts = VT.getVectorNumElements();
5827  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5828  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5829    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5830    Split = true;
5831  }
5832
5833  MVT EltVT = ConstVecVT.getVectorElementType();
5834  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5835    if (Undefs[i]) {
5836      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5837      continue;
5838    }
5839    const APInt &V = Bits[i];
5840    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5841    if (Split) {
5842      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5843      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5844    } else if (EltVT == MVT::f32) {
5845      APFloat FV(APFloat::IEEEsingle(), V);
5846      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5847    } else if (EltVT == MVT::f64) {
5848      APFloat FV(APFloat::IEEEdouble(), V);
5849      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5850    } else {
5851      Ops.push_back(DAG.getConstant(V, dl, EltVT));
5852    }
5853  }
5854
5855  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5856  return DAG.getBitcast(VT, ConstsNode);
5857}
5858
5859/// Returns a vector of specified type with all zero elements.
5860static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5861                             SelectionDAG &DAG, const SDLoc &dl) {
5862  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5863          VT.getVectorElementType() == MVT::i1) &&
5864         "Unexpected vector type");
5865
5866  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5867  // type. This ensures they get CSE'd. But if the integer type is not
5868  // available, use a floating-point +0.0 instead.
5869  SDValue Vec;
5870  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5871    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5872  } else if (VT.isFloatingPoint()) {
5873    Vec = DAG.getConstantFP(+0.0, dl, VT);
5874  } else if (VT.getVectorElementType() == MVT::i1) {
5875    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5876           "Unexpected vector type");
5877    Vec = DAG.getConstant(0, dl, VT);
5878  } else {
5879    unsigned Num32BitElts = VT.getSizeInBits() / 32;
5880    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5881  }
5882  return DAG.getBitcast(VT, Vec);
5883}
5884
5885static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5886                                const SDLoc &dl, unsigned vectorWidth) {
5887  EVT VT = Vec.getValueType();
5888  EVT ElVT = VT.getVectorElementType();
5889  unsigned Factor = VT.getSizeInBits() / vectorWidth;
5890  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5891                                  VT.getVectorNumElements() / Factor);
5892
5893  // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
5894  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5895  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5896
5897  // This is the index of the first element of the vectorWidth-bit chunk
5898  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
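  // e.g. extracting 128 bits from a v8i32 gives ElemsPerChunk == 4, so an
  // IdxVal of 5 is rounded down to 4, the first element of the second chunk.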
5899  IdxVal &= ~(ElemsPerChunk - 1);
5900
5901  // If the input is a buildvector just emit a smaller one.
5902  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5903    return DAG.getBuildVector(ResultVT, dl,
5904                              Vec->ops().slice(IdxVal, ElemsPerChunk));
5905
5906  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5907  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5908}
5909
5910/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
5911/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5912/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5913/// instructions or a simple subregister reference. Idx is an index in the
5914/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
5915/// lowering EXTRACT_VECTOR_ELT operations easier.
5916static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5917                                   SelectionDAG &DAG, const SDLoc &dl) {
5918  assert((Vec.getValueType().is256BitVector() ||
5919          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5920  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5921}
5922
5923/// Generate a DAG to grab 256-bits from a 512-bit vector.
5924static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5925                                   SelectionDAG &DAG, const SDLoc &dl) {
5926  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5927  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5928}
5929
5930static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5931                               SelectionDAG &DAG, const SDLoc &dl,
5932                               unsigned vectorWidth) {
5933  assert((vectorWidth == 128 || vectorWidth == 256) &&
5934         "Unsupported vector width");
  // Inserting UNDEF into Result is a no-op, so just return Result.
5936  if (Vec.isUndef())
5937    return Result;
5938  EVT VT = Vec.getValueType();
5939  EVT ElVT = VT.getVectorElementType();
5940  EVT ResultVT = Result.getValueType();
5941
5942  // Insert the relevant vectorWidth bits.
5943  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5944  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5945
5946  // This is the index of the first element of the vectorWidth-bit chunk
5947  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5948  IdxVal &= ~(ElemsPerChunk - 1);
5949
5950  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5951  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5952}
5953
5954/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
5955/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5956/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5957/// simple superregister reference.  Idx is an index in the 128 bits
5958/// we want.  It need not be aligned to a 128-bit boundary.  That makes
5959/// lowering INSERT_VECTOR_ELT operations easier.
5960static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5961                                  SelectionDAG &DAG, const SDLoc &dl) {
5962  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5963  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5964}
5965
5966/// Widen a vector to a larger size with the same scalar type, with the new
5967/// elements either zero or undef.
5968static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5969                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
5970                              const SDLoc &dl) {
5971  assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
5972         Vec.getValueType().getScalarType() == VT.getScalarType() &&
5973         "Unsupported vector widening type");
5974  SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5975                                : DAG.getUNDEF(VT);
5976  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5977                     DAG.getIntPtrConstant(0, dl));
5978}
5979
5980/// Widen a vector to a larger size with the same scalar type, with the new
5981/// elements either zero or undef.
5982static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
5983                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
5984                              const SDLoc &dl, unsigned WideSizeInBits) {
5985  assert(Vec.getValueSizeInBits() < WideSizeInBits &&
5986         (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
5987         "Unsupported vector widening type");
5988  unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
5989  MVT SVT = Vec.getSimpleValueType().getScalarType();
5990  MVT VT = MVT::getVectorVT(SVT, WideNumElts);
5991  return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
5992}
5993
5994// Helper function to collect subvector ops that are concatenated together,
5995// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
5996// The subvectors in Ops are guaranteed to be the same type.
5997static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
5998  assert(Ops.empty() && "Expected an empty ops vector");
5999
6000  if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6001    Ops.append(N->op_begin(), N->op_end());
6002    return true;
6003  }
6004
6005  if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6006    SDValue Src = N->getOperand(0);
6007    SDValue Sub = N->getOperand(1);
6008    const APInt &Idx = N->getConstantOperandAPInt(2);
6009    EVT VT = Src.getValueType();
6010    EVT SubVT = Sub.getValueType();
6011
6012    // TODO - Handle more general insert_subvector chains.
6013    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
6014        Idx == (VT.getVectorNumElements() / 2)) {
6015      // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6016      if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6017          Src.getOperand(1).getValueType() == SubVT &&
6018          isNullConstant(Src.getOperand(2))) {
6019        Ops.push_back(Src.getOperand(1));
6020        Ops.push_back(Sub);
6021        return true;
6022      }
6023      // insert_subvector(x, extract_subvector(x, lo), hi)
6024      if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6025          Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6026        Ops.append(2, Sub);
6027        return true;
6028      }
6029    }
6030  }
6031
6032  return false;
6033}
6034
6035static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6036                                               const SDLoc &dl) {
6037  EVT VT = Op.getValueType();
6038  unsigned NumElems = VT.getVectorNumElements();
6039  unsigned SizeInBits = VT.getSizeInBits();
6040  assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6041         "Can't split odd sized vector");
6042
6043  SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6044  SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6045  return std::make_pair(Lo, Hi);
6046}
6047
// Split a unary integer op into 2 half sized ops.
6049static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6050  EVT VT = Op.getValueType();
6051
6052  // Make sure we only try to split 256/512-bit types to avoid creating
6053  // narrow vectors.
6054  assert((Op.getOperand(0).getValueType().is256BitVector() ||
6055          Op.getOperand(0).getValueType().is512BitVector()) &&
6056         (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6057  assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6058             VT.getVectorNumElements() &&
6059         "Unexpected VTs!");
6060
6061  SDLoc dl(Op);
6062
6063  // Extract the Lo/Hi vectors
6064  SDValue Lo, Hi;
6065  std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
6066
6067  EVT LoVT, HiVT;
6068  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6069  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6070                     DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
6071                     DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
6072}
6073
6074/// Break a binary integer operation into 2 half sized ops and then
6075/// concatenate the result back.
6076static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6077  EVT VT = Op.getValueType();
6078
6079  // Sanity check that all the types match.
6080  assert(Op.getOperand(0).getValueType() == VT &&
6081         Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6082  assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6083
6084  SDLoc dl(Op);
6085
6086  // Extract the LHS Lo/Hi vectors
6087  SDValue LHS1, LHS2;
6088  std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
6089
6090  // Extract the RHS Lo/Hi vectors
6091  SDValue RHS1, RHS2;
6092  std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
6093
6094  EVT LoVT, HiVT;
6095  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6096  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6097                     DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
6098                     DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
6099}
6100
6101// Helper for splitting operands of an operation to legal target size and
6102// apply a function on each part.
6103// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6104// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6105// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6106// The argument Builder is a function that will be applied on each split part:
6107// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
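// For example, on an AVX2 target without BWI a 512-bit VT is processed as two
// 256-bit halves: each operand is split, Builder is invoked on each half, and
// the results are concatenated back together.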
6108template <typename F>
6109SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6110                         const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6111                         F Builder, bool CheckBWI = true) {
6112  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6113  unsigned NumSubs = 1;
6114  if ((CheckBWI && Subtarget.useBWIRegs()) ||
6115      (!CheckBWI && Subtarget.useAVX512Regs())) {
6116    if (VT.getSizeInBits() > 512) {
6117      NumSubs = VT.getSizeInBits() / 512;
6118      assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6119    }
6120  } else if (Subtarget.hasAVX2()) {
6121    if (VT.getSizeInBits() > 256) {
6122      NumSubs = VT.getSizeInBits() / 256;
6123      assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6124    }
6125  } else {
6126    if (VT.getSizeInBits() > 128) {
6127      NumSubs = VT.getSizeInBits() / 128;
6128      assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6129    }
6130  }
6131
6132  if (NumSubs == 1)
6133    return Builder(DAG, DL, Ops);
6134
6135  SmallVector<SDValue, 4> Subs;
6136  for (unsigned i = 0; i != NumSubs; ++i) {
6137    SmallVector<SDValue, 2> SubOps;
6138    for (SDValue Op : Ops) {
6139      EVT OpVT = Op.getValueType();
6140      unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6141      unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6142      SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6143    }
6144    Subs.push_back(Builder(DAG, DL, SubOps));
6145  }
6146  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6147}
6148
6149/// Insert i1-subvector to i1-vector.
6150static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6151                                const X86Subtarget &Subtarget) {
6152
6153  SDLoc dl(Op);
6154  SDValue Vec = Op.getOperand(0);
6155  SDValue SubVec = Op.getOperand(1);
6156  SDValue Idx = Op.getOperand(2);
6157  unsigned IdxVal = Op.getConstantOperandVal(2);
6158
6159  // Inserting undef is a nop. We can just return the original vector.
6160  if (SubVec.isUndef())
6161    return Vec;
6162
6163  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6164    return Op;
6165
6166  MVT OpVT = Op.getSimpleValueType();
6167  unsigned NumElems = OpVT.getVectorNumElements();
6168  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6169
6170  // Extend to natively supported kshift.
6171  MVT WideOpVT = OpVT;
6172  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6173    WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6174
6175  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6176  // if necessary.
6177  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6178    // May need to promote to a legal type.
6179    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6180                     DAG.getConstant(0, dl, WideOpVT),
6181                     SubVec, Idx);
6182    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6183  }
6184
6185  MVT SubVecVT = SubVec.getSimpleValueType();
6186  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6187  assert(IdxVal + SubVecNumElems <= NumElems &&
6188         IdxVal % SubVecVT.getSizeInBits() == 0 &&
6189         "Unexpected index value in INSERT_SUBVECTOR");
6190
6191  SDValue Undef = DAG.getUNDEF(WideOpVT);
6192
6193  if (IdxVal == 0) {
6194    // Zero lower bits of the Vec
6195    SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6196    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6197                      ZeroIdx);
6198    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6199    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6200    // Merge them together, SubVec should be zero extended.
6201    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6202                         DAG.getConstant(0, dl, WideOpVT),
6203                         SubVec, ZeroIdx);
6204    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6205    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6206  }
6207
6208  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6209                       Undef, SubVec, ZeroIdx);
6210
6211  if (Vec.isUndef()) {
6212    assert(IdxVal != 0 && "Unexpected index");
6213    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6214                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6215    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6216  }
6217
6218  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6219    assert(IdxVal != 0 && "Unexpected index");
6220    NumElems = WideOpVT.getVectorNumElements();
6221    unsigned ShiftLeft = NumElems - SubVecNumElems;
6222    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6223    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6224                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6225    if (ShiftRight != 0)
6226      SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6227                           DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6228    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6229  }
6230
  // Simple case when we put the subvector in the upper part.
6232  if (IdxVal + SubVecNumElems == NumElems) {
6233    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6234                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6235    if (SubVecNumElems * 2 == NumElems) {
      // Special case: use the legal zero-extending insert_subvector. This
      // allows isel to optimize when bits are known zero.
6238      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6239      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6240                        DAG.getConstant(0, dl, WideOpVT),
6241                        Vec, ZeroIdx);
6242    } else {
6243      // Otherwise use explicit shifts to zero the bits.
6244      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6245                        Undef, Vec, ZeroIdx);
6246      NumElems = WideOpVT.getVectorNumElements();
6247      SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6248      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6249      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6250    }
6251    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6252    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6253  }
6254
6255  // Inserting into the middle is more complicated.
6256
6257  NumElems = WideOpVT.getVectorNumElements();
6258
6259  // Widen the vector if needed.
6260  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6261
6262  unsigned ShiftLeft = NumElems - SubVecNumElems;
6263  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6264
  // Do an optimization for the most frequently used types.
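  // Illustrative example: for WideOpVT = v16i1, IdxVal = 4 and
  // SubVecNumElems = 4, Mask0 becomes 0xff0f (clearing bits [4,8) of Vec),
  // while the KSHIFTL/KSHIFTR pair leaves SubVec zero-extended into bits [4,8).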
6266  if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6267    APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6268    Mask0.flipAllBits();
6269    SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6270    SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6271    Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6272    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6273                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6274    SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6275                         DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6276    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6277
6278    // Reduce to original width if needed.
6279    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6280  }
6281
6282  // Clear the upper bits of the subvector and move it to its insert position.
6283  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6284                       DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6285  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6286                       DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6287
6288  // Isolate the bits below the insertion point.
6289  unsigned LowShift = NumElems - IdxVal;
6290  SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6291                            DAG.getTargetConstant(LowShift, dl, MVT::i8));
6292  Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6293                    DAG.getTargetConstant(LowShift, dl, MVT::i8));
6294
6295  // Isolate the bits after the last inserted bit.
6296  unsigned HighShift = IdxVal + SubVecNumElems;
  SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
  High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
                     DAG.getTargetConstant(HighShift, dl, MVT::i8));
6301
6302  // Now OR all 3 pieces together.
6303  Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6304  SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6305
6306  // Reduce to original width if needed.
6307  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6308}
6309
6310static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6311                                const SDLoc &dl) {
6312  assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6313  EVT SubVT = V1.getValueType();
6314  EVT SubSVT = SubVT.getScalarType();
6315  unsigned SubNumElts = SubVT.getVectorNumElements();
6316  unsigned SubVectorWidth = SubVT.getSizeInBits();
6317  EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6318  SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6319  return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6320}
6321
6322/// Returns a vector of specified type with all bits set.
6323/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6324/// Then bitcast to their original type, ensuring they get CSE'd.
6325static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6326  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6327         "Expected a 128/256/512-bit vector type");
6328
6329  APInt Ones = APInt::getAllOnesValue(32);
6330  unsigned NumElts = VT.getSizeInBits() / 32;
6331  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6332  return DAG.getBitcast(VT, Vec);
6333}
6334
6335// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6336static unsigned getOpcode_EXTEND(unsigned Opcode) {
6337  switch (Opcode) {
6338  case ISD::ANY_EXTEND:
6339  case ISD::ANY_EXTEND_VECTOR_INREG:
6340    return ISD::ANY_EXTEND;
6341  case ISD::ZERO_EXTEND:
6342  case ISD::ZERO_EXTEND_VECTOR_INREG:
6343    return ISD::ZERO_EXTEND;
6344  case ISD::SIGN_EXTEND:
6345  case ISD::SIGN_EXTEND_VECTOR_INREG:
6346    return ISD::SIGN_EXTEND;
6347  }
6348  llvm_unreachable("Unknown opcode");
6349}
6350
6351// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6352static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6353  switch (Opcode) {
6354  case ISD::ANY_EXTEND:
6355  case ISD::ANY_EXTEND_VECTOR_INREG:
6356    return ISD::ANY_EXTEND_VECTOR_INREG;
6357  case ISD::ZERO_EXTEND:
6358  case ISD::ZERO_EXTEND_VECTOR_INREG:
6359    return ISD::ZERO_EXTEND_VECTOR_INREG;
6360  case ISD::SIGN_EXTEND:
6361  case ISD::SIGN_EXTEND_VECTOR_INREG:
6362    return ISD::SIGN_EXTEND_VECTOR_INREG;
6363  }
6364  llvm_unreachable("Unknown opcode");
6365}
6366
6367static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6368                                      SDValue In, SelectionDAG &DAG) {
6369  EVT InVT = In.getValueType();
6370  assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6371  assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6372          ISD::ZERO_EXTEND == Opcode) &&
6373         "Unknown extension opcode");
6374
6375  // For 256-bit vectors, we only need the lower (128-bit) input half.
6376  // For 512-bit vectors, we only need the lower input half or quarter.
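  // E.g. for VT = v8i32 and In = v16i16 (both 256-bit) only the low v8i16 half
  // is extracted; the element counts then match, so a plain *_EXTEND node is
  // emitted instead of the *_EXTEND_VECTOR_INREG form.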
6377  if (InVT.getSizeInBits() > 128) {
6378    assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6379           "Expected VTs to be the same size!");
6380    unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6381    In = extractSubVector(In, 0, DAG, DL,
6382                          std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6383    InVT = In.getValueType();
6384  }
6385
6386  if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6387    Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6388
6389  return DAG.getNode(Opcode, DL, VT, In);
6390}
6391
6392// Match (xor X, -1) -> X.
6393// Match extract_subvector(xor X, -1) -> extract_subvector(X).
6394// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6395static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6396  V = peekThroughBitcasts(V);
6397  if (V.getOpcode() == ISD::XOR &&
6398      ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6399    return V.getOperand(0);
6400  if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6401      (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6402    if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6403      Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6404      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6405                         Not, V.getOperand(1));
6406    }
6407  }
6408  SmallVector<SDValue, 2> CatOps;
6409  if (collectConcatOps(V.getNode(), CatOps)) {
6410    for (SDValue &CatOp : CatOps) {
6411      SDValue NotCat = IsNOT(CatOp, DAG);
6412      if (!NotCat) return SDValue();
6413      CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6414    }
6415    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6416  }
6417  return SDValue();
6418}
6419
6420void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6421                                   bool Lo, bool Unary) {
6422  assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
6423         "Illegal vector type to unpack");
6424  assert(Mask.empty() && "Expected an empty shuffle mask vector");
6425  int NumElts = VT.getVectorNumElements();
6426  int NumEltsInLane = 128 / VT.getScalarSizeInBits();
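  // Illustrative example: for v8i32 with Lo=true and Unary=false the loop
  // below produces <0, 8, 1, 9, 4, 12, 5, 13>, i.e. the unpcklo pattern
  // applied to each 128-bit lane independently.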
6427  for (int i = 0; i < NumElts; ++i) {
6428    unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6429    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6430    Pos += (Unary ? 0 : NumElts * (i % 2));
6431    Pos += (Lo ? 0 : NumEltsInLane / 2);
6432    Mask.push_back(Pos);
6433  }
6434}
6435
6436/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6437/// imposed by AVX and specific to the unary pattern. Example:
6438/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6439/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6440void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6441                                   bool Lo) {
6442  assert(Mask.empty() && "Expected an empty shuffle mask vector");
6443  int NumElts = VT.getVectorNumElements();
6444  for (int i = 0; i < NumElts; ++i) {
6445    int Pos = i / 2;
6446    Pos += (Lo ? 0 : NumElts / 2);
6447    Mask.push_back(Pos);
6448  }
6449}
6450
6451/// Returns a vector_shuffle node for an unpackl operation.
6452static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6453                          SDValue V1, SDValue V2) {
6454  SmallVector<int, 8> Mask;
6455  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6456  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6457}
6458
6459/// Returns a vector_shuffle node for an unpackh operation.
6460static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6461                          SDValue V1, SDValue V2) {
6462  SmallVector<int, 8> Mask;
6463  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6464  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6465}
6466
/// Return a vector_shuffle of the specified vector and a zero or undef vector.
6468/// This produces a shuffle where the low element of V2 is swizzled into the
6469/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6471static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6472                                           bool IsZero,
6473                                           const X86Subtarget &Subtarget,
6474                                           SelectionDAG &DAG) {
6475  MVT VT = V2.getSimpleValueType();
6476  SDValue V1 = IsZero
6477    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6478  int NumElems = VT.getVectorNumElements();
6479  SmallVector<int, 16> MaskVec(NumElems);
6480  for (int i = 0; i != NumElems; ++i)
6481    // If this is the insertion idx, put the low elt of V2 here.
6482    MaskVec[i] = (i == Idx) ? NumElems : i;
6483  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6484}
6485
6486static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6487  if (Ptr.getOpcode() == X86ISD::Wrapper ||
6488      Ptr.getOpcode() == X86ISD::WrapperRIP)
6489    Ptr = Ptr.getOperand(0);
6490
6491  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6492  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6493    return nullptr;
6494
6495  return CNode->getConstVal();
6496}
6497
6498static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6499  if (!Load || !ISD::isNormalLoad(Load))
6500    return nullptr;
6501  return getTargetConstantFromBasePtr(Load->getBasePtr());
6502}
6503
6504static const Constant *getTargetConstantFromNode(SDValue Op) {
6505  Op = peekThroughBitcasts(Op);
6506  return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6507}
6508
6509const Constant *
6510X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
6511  assert(LD && "Unexpected null LoadSDNode");
6512  return getTargetConstantFromNode(LD);
6513}
6514
6515// Extract raw constant bits from constant pools.
6516static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6517                                          APInt &UndefElts,
6518                                          SmallVectorImpl<APInt> &EltBits,
6519                                          bool AllowWholeUndefs = true,
6520                                          bool AllowPartialUndefs = true) {
6521  assert(EltBits.empty() && "Expected an empty EltBits vector");
6522
6523  Op = peekThroughBitcasts(Op);
6524
6525  EVT VT = Op.getValueType();
6526  unsigned SizeInBits = VT.getSizeInBits();
6527  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
6528  unsigned NumElts = SizeInBits / EltSizeInBits;
6529
6530  // Bitcast a source array of element bits to the target size.
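  // E.g. four i32 source elements {1, 2, 3, 4} requested at EltSizeInBits=64
  // repack (in little-endian bit order) into the two i64 target elements
  // {0x0000000200000001, 0x0000000400000003}; a target element is only marked
  // undef if every source bit it covers is undef.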
6531  auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6532    unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6533    unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6534    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
6535           "Constant bit sizes don't match");
6536
6537    // Don't split if we don't allow undef bits.
6538    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6539    if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6540      return false;
6541
6542    // If we're already the right size, don't bother bitcasting.
6543    if (NumSrcElts == NumElts) {
6544      UndefElts = UndefSrcElts;
6545      EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6546      return true;
6547    }
6548
6549    // Extract all the undef/constant element data and pack into single bitsets.
6550    APInt UndefBits(SizeInBits, 0);
6551    APInt MaskBits(SizeInBits, 0);
6552
6553    for (unsigned i = 0; i != NumSrcElts; ++i) {
6554      unsigned BitOffset = i * SrcEltSizeInBits;
6555      if (UndefSrcElts[i])
6556        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
6557      MaskBits.insertBits(SrcEltBits[i], BitOffset);
6558    }
6559
6560    // Split the undef/constant single bitset data into the target elements.
6561    UndefElts = APInt(NumElts, 0);
6562    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6563
6564    for (unsigned i = 0; i != NumElts; ++i) {
6565      unsigned BitOffset = i * EltSizeInBits;
6566      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6567
6568      // Only treat an element as UNDEF if all bits are UNDEF.
6569      if (UndefEltBits.isAllOnesValue()) {
6570        if (!AllowWholeUndefs)
6571          return false;
6572        UndefElts.setBit(i);
6573        continue;
6574      }
6575
6576      // If only some bits are UNDEF then treat them as zero (or bail if not
6577      // supported).
6578      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6579        return false;
6580
6581      EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6582    }
6583    return true;
6584  };
6585
6586  // Collect constant bits and insert into mask/undef bit masks.
6587  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6588                                unsigned UndefBitIndex) {
6589    if (!Cst)
6590      return false;
6591    if (isa<UndefValue>(Cst)) {
6592      Undefs.setBit(UndefBitIndex);
6593      return true;
6594    }
6595    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6596      Mask = CInt->getValue();
6597      return true;
6598    }
6599    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6600      Mask = CFP->getValueAPF().bitcastToAPInt();
6601      return true;
6602    }
6603    return false;
6604  };
6605
6606  // Handle UNDEFs.
6607  if (Op.isUndef()) {
6608    APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6609    SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6610    return CastBitData(UndefSrcElts, SrcEltBits);
6611  }
6612
6613  // Extract scalar constant bits.
6614  if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6615    APInt UndefSrcElts = APInt::getNullValue(1);
6616    SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6617    return CastBitData(UndefSrcElts, SrcEltBits);
6618  }
6619  if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6620    APInt UndefSrcElts = APInt::getNullValue(1);
6621    APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6622    SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6623    return CastBitData(UndefSrcElts, SrcEltBits);
6624  }
6625
6626  // Extract constant bits from build vector.
6627  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6628    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6629    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6630
6631    APInt UndefSrcElts(NumSrcElts, 0);
6632    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6633    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6634      const SDValue &Src = Op.getOperand(i);
6635      if (Src.isUndef()) {
6636        UndefSrcElts.setBit(i);
6637        continue;
6638      }
6639      auto *Cst = cast<ConstantSDNode>(Src);
6640      SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6641    }
6642    return CastBitData(UndefSrcElts, SrcEltBits);
6643  }
6644  if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6645    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6646    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6647
6648    APInt UndefSrcElts(NumSrcElts, 0);
6649    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6650    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6651      const SDValue &Src = Op.getOperand(i);
6652      if (Src.isUndef()) {
6653        UndefSrcElts.setBit(i);
6654        continue;
6655      }
6656      auto *Cst = cast<ConstantFPSDNode>(Src);
6657      APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6658      SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6659    }
6660    return CastBitData(UndefSrcElts, SrcEltBits);
6661  }
6662
6663  // Extract constant bits from constant pool vector.
6664  if (auto *Cst = getTargetConstantFromNode(Op)) {
6665    Type *CstTy = Cst->getType();
6666    unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6667    if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6668      return false;
6669
6670    unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6671    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6672
6673    APInt UndefSrcElts(NumSrcElts, 0);
6674    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6675    for (unsigned i = 0; i != NumSrcElts; ++i)
6676      if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6677                               UndefSrcElts, i))
6678        return false;
6679
6680    return CastBitData(UndefSrcElts, SrcEltBits);
6681  }
6682
6683  // Extract constant bits from a broadcasted constant pool scalar.
6684  if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6685      EltSizeInBits <= VT.getScalarSizeInBits()) {
6686    auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6687    if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6688      return false;
6689
6690    SDValue Ptr = MemIntr->getBasePtr();
6691    if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
6692      unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6693      unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6694
6695      APInt UndefSrcElts(NumSrcElts, 0);
6696      SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6697      if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
6698        if (UndefSrcElts[0])
6699          UndefSrcElts.setBits(0, NumSrcElts);
6700        SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6701        return CastBitData(UndefSrcElts, SrcEltBits);
6702      }
6703    }
6704  }
6705
6706  // Extract constant bits from a subvector broadcast.
6707  if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
6708    auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6709    SDValue Ptr = MemIntr->getBasePtr();
6710    if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
6711      Type *CstTy = Cst->getType();
6712      unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6713      if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0)
6714        return false;
6715      unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits();
6716      unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits;
6717      unsigned NumSubVecs = SizeInBits / CstSizeInBits;
6718      APInt UndefSubElts(NumSubElts, 0);
6719      SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
6720                                        APInt(SubEltSizeInBits, 0));
6721      for (unsigned i = 0; i != NumSubElts; ++i) {
6722        if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
6723                                 UndefSubElts, i))
6724          return false;
6725        for (unsigned j = 1; j != NumSubVecs; ++j)
6726          SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
6727      }
6728      UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
6729                                     UndefSubElts);
6730      return CastBitData(UndefSubElts, SubEltBits);
6731    }
6732  }
6733
6734  // Extract a rematerialized scalar constant insertion.
6735  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6736      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6737      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6738    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6739    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6740
6741    APInt UndefSrcElts(NumSrcElts, 0);
6742    SmallVector<APInt, 64> SrcEltBits;
6743    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6744    SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6745    SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6746    return CastBitData(UndefSrcElts, SrcEltBits);
6747  }
6748
6749  // Insert constant bits from a base and sub vector sources.
6750  if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
    // If we bitcast to larger elements we might lose track of undefs, so
    // don't allow any to be safe.
6753    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6754    bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
6755
6756    APInt UndefSrcElts, UndefSubElts;
6757    SmallVector<APInt, 32> EltSrcBits, EltSubBits;
6758    if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
6759                                      UndefSubElts, EltSubBits,
6760                                      AllowWholeUndefs && AllowUndefs,
6761                                      AllowPartialUndefs && AllowUndefs) &&
6762        getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
6763                                      UndefSrcElts, EltSrcBits,
6764                                      AllowWholeUndefs && AllowUndefs,
6765                                      AllowPartialUndefs && AllowUndefs)) {
6766      unsigned BaseIdx = Op.getConstantOperandVal(2);
6767      UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
6768      for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6769        EltSrcBits[BaseIdx + i] = EltSubBits[i];
6770      return CastBitData(UndefSrcElts, EltSrcBits);
6771    }
6772  }
6773
6774  // Extract constant bits from a subvector's source.
6775  if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
6776    // TODO - support extract_subvector through bitcasts.
6777    if (EltSizeInBits != VT.getScalarSizeInBits())
6778      return false;
6779
6780    if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6781                                      UndefElts, EltBits, AllowWholeUndefs,
6782                                      AllowPartialUndefs)) {
6783      EVT SrcVT = Op.getOperand(0).getValueType();
6784      unsigned NumSrcElts = SrcVT.getVectorNumElements();
6785      unsigned NumSubElts = VT.getVectorNumElements();
6786      unsigned BaseIdx = Op.getConstantOperandVal(1);
6787      UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
6788      if ((BaseIdx + NumSubElts) != NumSrcElts)
6789        EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
6790      if (BaseIdx != 0)
6791        EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
6792      return true;
6793    }
6794  }
6795
6796  // Extract constant bits from shuffle node sources.
6797  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
6798    // TODO - support shuffle through bitcasts.
6799    if (EltSizeInBits != VT.getScalarSizeInBits())
6800      return false;
6801
6802    ArrayRef<int> Mask = SVN->getMask();
6803    if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
6804        llvm::any_of(Mask, [](int M) { return M < 0; }))
6805      return false;
6806
6807    APInt UndefElts0, UndefElts1;
6808    SmallVector<APInt, 32> EltBits0, EltBits1;
6809    if (isAnyInRange(Mask, 0, NumElts) &&
6810        !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6811                                       UndefElts0, EltBits0, AllowWholeUndefs,
6812                                       AllowPartialUndefs))
6813      return false;
6814    if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
6815        !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6816                                       UndefElts1, EltBits1, AllowWholeUndefs,
6817                                       AllowPartialUndefs))
6818      return false;
6819
6820    UndefElts = APInt::getNullValue(NumElts);
6821    for (int i = 0; i != (int)NumElts; ++i) {
6822      int M = Mask[i];
6823      if (M < 0) {
6824        UndefElts.setBit(i);
6825        EltBits.push_back(APInt::getNullValue(EltSizeInBits));
6826      } else if (M < (int)NumElts) {
6827        if (UndefElts0[M])
6828          UndefElts.setBit(i);
6829        EltBits.push_back(EltBits0[M]);
6830      } else {
6831        if (UndefElts1[M - NumElts])
6832          UndefElts.setBit(i);
6833        EltBits.push_back(EltBits1[M - NumElts]);
6834      }
6835    }
6836    return true;
6837  }
6838
6839  return false;
6840}
6841
6842namespace llvm {
6843namespace X86 {
6844bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
6845  APInt UndefElts;
6846  SmallVector<APInt, 16> EltBits;
6847  if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
6848                                    UndefElts, EltBits, true,
6849                                    AllowPartialUndefs)) {
6850    int SplatIndex = -1;
6851    for (int i = 0, e = EltBits.size(); i != e; ++i) {
6852      if (UndefElts[i])
6853        continue;
6854      if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
6855        SplatIndex = -1;
6856        break;
6857      }
6858      SplatIndex = i;
6859    }
6860    if (0 <= SplatIndex) {
6861      SplatVal = EltBits[SplatIndex];
6862      return true;
6863    }
6864  }
6865
6866  return false;
6867}
6868} // namespace X86
6869} // namespace llvm
6870
6871static bool getTargetShuffleMaskIndices(SDValue MaskNode,
6872                                        unsigned MaskEltSizeInBits,
6873                                        SmallVectorImpl<uint64_t> &RawMask,
6874                                        APInt &UndefElts) {
6875  // Extract the raw target constant bits.
6876  SmallVector<APInt, 64> EltBits;
6877  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
6878                                     EltBits, /* AllowWholeUndefs */ true,
6879                                     /* AllowPartialUndefs */ false))
6880    return false;
6881
6882  // Insert the extracted elements into the mask.
6883  for (const APInt &Elt : EltBits)
6884    RawMask.push_back(Elt.getZExtValue());
6885
6886  return true;
6887}
6888
6889/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
6890/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
6891/// Note: This ignores saturation, so inputs must be checked first.
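/// Illustrative example: a single-stage binary pack to v16i8 (e.g. PACKUSWB)
/// yields <0, 2, 4, ..., 14, 16, 18, ..., 30>, i.e. the even bytes of the
/// first operand followed by the even bytes of the second operand.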
6892static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6893                                  bool Unary, unsigned NumStages = 1) {
6894  assert(Mask.empty() && "Expected an empty shuffle mask vector");
6895  unsigned NumElts = VT.getVectorNumElements();
6896  unsigned NumLanes = VT.getSizeInBits() / 128;
6897  unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
6898  unsigned Offset = Unary ? 0 : NumElts;
6899  unsigned Repetitions = 1u << (NumStages - 1);
6900  unsigned Increment = 1u << NumStages;
6901  assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
6902
6903  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
6904    for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
6905      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6906        Mask.push_back(Elt + (Lane * NumEltsPerLane));
6907      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6908        Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
6909    }
6910  }
6911}
6912
6913// Split the demanded elts of a PACKSS/PACKUS node between its operands.
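// E.g. for a v16i8 pack of two v8i16 operands, result bytes [0,8) come from
// the LHS and bytes [8,16) from the RHS, so demanding result element 10 maps
// to demanding element 2 of the RHS.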
6914static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
6915                                APInt &DemandedLHS, APInt &DemandedRHS) {
6916  int NumLanes = VT.getSizeInBits() / 128;
6917  int NumElts = DemandedElts.getBitWidth();
6918  int NumInnerElts = NumElts / 2;
6919  int NumEltsPerLane = NumElts / NumLanes;
6920  int NumInnerEltsPerLane = NumInnerElts / NumLanes;
6921
6922  DemandedLHS = APInt::getNullValue(NumInnerElts);
6923  DemandedRHS = APInt::getNullValue(NumInnerElts);
6924
6925  // Map DemandedElts to the packed operands.
6926  for (int Lane = 0; Lane != NumLanes; ++Lane) {
6927    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
6928      int OuterIdx = (Lane * NumEltsPerLane) + Elt;
6929      int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
6930      if (DemandedElts[OuterIdx])
6931        DemandedLHS.setBit(InnerIdx);
6932      if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
6933        DemandedRHS.setBit(InnerIdx);
6934    }
6935  }
6936}
6937
6938// Split the demanded elts of a HADD/HSUB node between its operands.
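// E.g. for a v8i32 HADD (two 128-bit lanes of 4 elements), demanding result
// element 1 demands LHS elements 2-3, while demanding result element 2 demands
// RHS elements 0-1.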
6939static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
6940                                 APInt &DemandedLHS, APInt &DemandedRHS) {
6941  int NumLanes = VT.getSizeInBits() / 128;
6942  int NumElts = DemandedElts.getBitWidth();
6943  int NumEltsPerLane = NumElts / NumLanes;
6944  int HalfEltsPerLane = NumEltsPerLane / 2;
6945
6946  DemandedLHS = APInt::getNullValue(NumElts);
6947  DemandedRHS = APInt::getNullValue(NumElts);
6948
6949  // Map DemandedElts to the horizontal operands.
6950  for (int Idx = 0; Idx != NumElts; ++Idx) {
6951    if (!DemandedElts[Idx])
6952      continue;
6953    int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
6954    int LocalIdx = Idx % NumEltsPerLane;
6955    if (LocalIdx < HalfEltsPerLane) {
6956      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6957      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6958    } else {
6959      LocalIdx -= HalfEltsPerLane;
6960      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6961      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6962    }
6963  }
6964}
6965
6966/// Calculates the shuffle mask corresponding to the target-specific opcode.
6967/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
6968/// operands in \p Ops, and returns true.
6969/// Sets \p IsUnary to true if only one source is used. Note that this will set
6970/// IsUnary for shuffles which use a single input multiple times, and in those
6971/// cases it will adjust the mask to only have indices within that single input.
6972/// It is an error to call this with non-empty Mask/Ops vectors.
6973static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
6974                                 SmallVectorImpl<SDValue> &Ops,
6975                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
6976  unsigned NumElems = VT.getVectorNumElements();
6977  unsigned MaskEltSize = VT.getScalarSizeInBits();
6978  SmallVector<uint64_t, 32> RawMask;
6979  APInt RawUndefs;
6980  uint64_t ImmN;
6981
6982  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
6983  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
6984
6985  IsUnary = false;
6986  bool IsFakeUnary = false;
6987  switch (N->getOpcode()) {
6988  case X86ISD::BLENDI:
6989    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6990    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6991    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6992    DecodeBLENDMask(NumElems, ImmN, Mask);
6993    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6994    break;
6995  case X86ISD::SHUFP:
6996    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6997    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6998    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6999    DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7000    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7001    break;
7002  case X86ISD::INSERTPS:
7003    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7004    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7005    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7006    DecodeINSERTPSMask(ImmN, Mask);
7007    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7008    break;
7009  case X86ISD::EXTRQI:
7010    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7011    if (isa<ConstantSDNode>(N->getOperand(1)) &&
7012        isa<ConstantSDNode>(N->getOperand(2))) {
7013      int BitLen = N->getConstantOperandVal(1);
7014      int BitIdx = N->getConstantOperandVal(2);
7015      DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7016      IsUnary = true;
7017    }
7018    break;
7019  case X86ISD::INSERTQI:
7020    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7021    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7022    if (isa<ConstantSDNode>(N->getOperand(2)) &&
7023        isa<ConstantSDNode>(N->getOperand(3))) {
7024      int BitLen = N->getConstantOperandVal(2);
7025      int BitIdx = N->getConstantOperandVal(3);
7026      DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7027      IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7028    }
7029    break;
7030  case X86ISD::UNPCKH:
7031    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7032    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7033    DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7034    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7035    break;
7036  case X86ISD::UNPCKL:
7037    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7038    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7039    DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7040    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7041    break;
7042  case X86ISD::MOVHLPS:
7043    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7044    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7045    DecodeMOVHLPSMask(NumElems, Mask);
7046    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7047    break;
7048  case X86ISD::MOVLHPS:
7049    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7050    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7051    DecodeMOVLHPSMask(NumElems, Mask);
7052    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7053    break;
7054  case X86ISD::VALIGN:
7055    assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7056           "Only 32-bit and 64-bit elements are supported!");
7057    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7058    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7059    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7060    DecodeVALIGNMask(NumElems, ImmN, Mask);
7061    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7062    Ops.push_back(N->getOperand(1));
7063    Ops.push_back(N->getOperand(0));
7064    break;
7065  case X86ISD::PALIGNR:
7066    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7067    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7068    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7069    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7070    DecodePALIGNRMask(NumElems, ImmN, Mask);
7071    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7072    Ops.push_back(N->getOperand(1));
7073    Ops.push_back(N->getOperand(0));
7074    break;
7075  case X86ISD::VSHLDQ:
7076    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7077    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7078    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7079    DecodePSLLDQMask(NumElems, ImmN, Mask);
7080    IsUnary = true;
7081    break;
7082  case X86ISD::VSRLDQ:
7083    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7084    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7085    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7086    DecodePSRLDQMask(NumElems, ImmN, Mask);
7087    IsUnary = true;
7088    break;
7089  case X86ISD::PSHUFD:
7090  case X86ISD::VPERMILPI:
7091    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7092    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7093    DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7094    IsUnary = true;
7095    break;
7096  case X86ISD::PSHUFHW:
7097    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7098    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7099    DecodePSHUFHWMask(NumElems, ImmN, Mask);
7100    IsUnary = true;
7101    break;
7102  case X86ISD::PSHUFLW:
7103    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7104    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7105    DecodePSHUFLWMask(NumElems, ImmN, Mask);
7106    IsUnary = true;
7107    break;
7108  case X86ISD::VZEXT_MOVL:
7109    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7110    DecodeZeroMoveLowMask(NumElems, Mask);
7111    IsUnary = true;
7112    break;
7113  case X86ISD::VBROADCAST:
    // We only decode broadcasts of same-sized vectors; peeking through to
    // extracted subvectors is likely to cause hasOneUse issues with
    // SimplifyDemandedBits etc.
7117    if (N->getOperand(0).getValueType() == VT) {
7118      DecodeVectorBroadcast(NumElems, Mask);
7119      IsUnary = true;
7120      break;
7121    }
7122    return false;
7123  case X86ISD::VPERMILPV: {
7124    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7125    IsUnary = true;
7126    SDValue MaskNode = N->getOperand(1);
7127    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7128                                    RawUndefs)) {
7129      DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7130      break;
7131    }
7132    return false;
7133  }
7134  case X86ISD::PSHUFB: {
7135    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7136    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7137    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7138    IsUnary = true;
7139    SDValue MaskNode = N->getOperand(1);
7140    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7141      DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7142      break;
7143    }
7144    return false;
7145  }
7146  case X86ISD::VPERMI:
7147    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7148    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7149    DecodeVPERMMask(NumElems, ImmN, Mask);
7150    IsUnary = true;
7151    break;
7152  case X86ISD::MOVSS:
7153  case X86ISD::MOVSD:
7154    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7155    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7156    DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7157    break;
7158  case X86ISD::VPERM2X128:
7159    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7160    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7161    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7162    DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7163    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7164    break;
7165  case X86ISD::SHUF128:
7166    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7167    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7168    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7169    decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7170    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7171    break;
7172  case X86ISD::MOVSLDUP:
7173    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7174    DecodeMOVSLDUPMask(NumElems, Mask);
7175    IsUnary = true;
7176    break;
7177  case X86ISD::MOVSHDUP:
7178    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7179    DecodeMOVSHDUPMask(NumElems, Mask);
7180    IsUnary = true;
7181    break;
7182  case X86ISD::MOVDDUP:
7183    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7184    DecodeMOVDDUPMask(NumElems, Mask);
7185    IsUnary = true;
7186    break;
7187  case X86ISD::VPERMIL2: {
7188    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7189    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7190    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7191    SDValue MaskNode = N->getOperand(2);
7192    SDValue CtrlNode = N->getOperand(3);
7193    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7194      unsigned CtrlImm = CtrlOp->getZExtValue();
7195      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7196                                      RawUndefs)) {
7197        DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7198                            Mask);
7199        break;
7200      }
7201    }
7202    return false;
7203  }
7204  case X86ISD::VPPERM: {
7205    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7206    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7207    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7208    SDValue MaskNode = N->getOperand(2);
7209    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7210      DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7211      break;
7212    }
7213    return false;
7214  }
7215  case X86ISD::VPERMV: {
7216    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7217    IsUnary = true;
7218    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7219    Ops.push_back(N->getOperand(1));
7220    SDValue MaskNode = N->getOperand(0);
7221    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7222                                    RawUndefs)) {
7223      DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7224      break;
7225    }
7226    return false;
7227  }
7228  case X86ISD::VPERMV3: {
7229    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7230    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7231    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7232    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7233    Ops.push_back(N->getOperand(0));
7234    Ops.push_back(N->getOperand(2));
7235    SDValue MaskNode = N->getOperand(1);
7236    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7237                                    RawUndefs)) {
7238      DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7239      break;
7240    }
7241    return false;
7242  }
7243  default: llvm_unreachable("unknown target shuffle node");
7244  }
7245
7246  // Empty mask indicates the decode failed.
7247  if (Mask.empty())
7248    return false;
7249
  // Check if we're getting a shuffle mask with zeroed elements.
7251  if (!AllowSentinelZero && isAnyZero(Mask))
7252    return false;
7253
7254  // If we have a fake unary shuffle, the shuffle mask is spread across two
7255  // inputs that are actually the same node. Re-map the mask to always point
7256  // into the first input.
7257  if (IsFakeUnary)
7258    for (int &M : Mask)
7259      if (M >= (int)Mask.size())
7260        M -= Mask.size();
7261
7262  // If we didn't already add operands in the opcode-specific code, default to
7263  // adding 1 or 2 operands starting at 0.
7264  if (Ops.empty()) {
7265    Ops.push_back(N->getOperand(0));
7266    if (!IsUnary || IsFakeUnary)
7267      Ops.push_back(N->getOperand(1));
7268  }
7269
7270  return true;
7271}
7272
// Wrapper for getTargetShuffleMask that ignores the IsUnary result.
7274static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7275                                 SmallVectorImpl<SDValue> &Ops,
7276                                 SmallVectorImpl<int> &Mask) {
7277  bool IsUnary;
7278  return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7279}
7280
7281/// Compute whether each element of a shuffle is zeroable.
7282///
7283/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7284/// Either it is an undef element in the shuffle mask, the element of the input
7285/// referenced is undef, or the element of the input referenced is known to be
7286/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7287/// as many lanes with this technique as possible to simplify the remaining
7288/// shuffle.
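/// Illustrative example: shuffling a v4i32 V1 against an all-zeros V2 with the
/// mask <0, 1, 4, 4> makes elements 2 and 3 zeroable.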
7289static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7290                                           SDValue V1, SDValue V2,
7291                                           APInt &KnownUndef, APInt &KnownZero) {
7292  int Size = Mask.size();
7293  KnownUndef = KnownZero = APInt::getNullValue(Size);
7294
7295  V1 = peekThroughBitcasts(V1);
7296  V2 = peekThroughBitcasts(V2);
7297
7298  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7299  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7300
7301  int VectorSizeInBits = V1.getValueSizeInBits();
7302  int ScalarSizeInBits = VectorSizeInBits / Size;
7303  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7304
7305  for (int i = 0; i < Size; ++i) {
7306    int M = Mask[i];
7307    // Handle the easy cases.
7308    if (M < 0) {
7309      KnownUndef.setBit(i);
7310      continue;
7311    }
7312    if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7313      KnownZero.setBit(i);
7314      continue;
7315    }
7316
7317    // Determine shuffle input and normalize the mask.
7318    SDValue V = M < Size ? V1 : V2;
7319    M %= Size;
7320
7321    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7322    if (V.getOpcode() != ISD::BUILD_VECTOR)
7323      continue;
7324
    // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
7327    if ((Size % V.getNumOperands()) == 0) {
7328      int Scale = Size / V->getNumOperands();
7329      SDValue Op = V.getOperand(M / Scale);
7330      if (Op.isUndef())
7331        KnownUndef.setBit(i);
7332      if (X86::isZeroNode(Op))
7333        KnownZero.setBit(i);
7334      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7335        APInt Val = Cst->getAPIntValue();
7336        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7337        if (Val == 0)
7338          KnownZero.setBit(i);
7339      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7340        APInt Val = Cst->getValueAPF().bitcastToAPInt();
7341        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7342        if (Val == 0)
7343          KnownZero.setBit(i);
7344      }
7345      continue;
7346    }
7347
    // If the BUILD_VECTOR has more elements, then all the (smaller) source
    // elements must be UNDEF or ZERO.
7350    if ((V.getNumOperands() % Size) == 0) {
7351      int Scale = V->getNumOperands() / Size;
7352      bool AllUndef = true;
7353      bool AllZero = true;
7354      for (int j = 0; j < Scale; ++j) {
7355        SDValue Op = V.getOperand((M * Scale) + j);
7356        AllUndef &= Op.isUndef();
7357        AllZero &= X86::isZeroNode(Op);
7358      }
7359      if (AllUndef)
7360        KnownUndef.setBit(i);
7361      if (AllZero)
7362        KnownZero.setBit(i);
7363      continue;
7364    }
7365  }
7366}
7367
7368/// Decode a target shuffle mask and inputs and see if any values are
7369/// known to be undef or zero from their inputs.
7370/// Returns true if the target shuffle mask was decoded.
7371/// FIXME: Merge this with computeZeroableShuffleElements?
7372static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7373                                         SmallVectorImpl<SDValue> &Ops,
7374                                         APInt &KnownUndef, APInt &KnownZero) {
7375  bool IsUnary;
7376  if (!isTargetShuffle(N.getOpcode()))
7377    return false;
7378
7379  MVT VT = N.getSimpleValueType();
7380  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7381    return false;
7382
7383  int Size = Mask.size();
7384  SDValue V1 = Ops[0];
7385  SDValue V2 = IsUnary ? V1 : Ops[1];
7386  KnownUndef = KnownZero = APInt::getNullValue(Size);
7387
7388  V1 = peekThroughBitcasts(V1);
7389  V2 = peekThroughBitcasts(V2);
7390
7391  assert((VT.getSizeInBits() % Size) == 0 &&
7392         "Illegal split of shuffle value type");
7393  unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7394
7395  // Extract known constant input data.
7396  APInt UndefSrcElts[2];
7397  SmallVector<APInt, 32> SrcEltBits[2];
7398  bool IsSrcConstant[2] = {
7399      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7400                                    SrcEltBits[0], true, false),
7401      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7402                                    SrcEltBits[1], true, false)};
7403
7404  for (int i = 0; i < Size; ++i) {
7405    int M = Mask[i];
7406
7407    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7408    if (M < 0) {
7409      assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7410      if (SM_SentinelUndef == M)
7411        KnownUndef.setBit(i);
7412      if (SM_SentinelZero == M)
7413        KnownZero.setBit(i);
7414      continue;
7415    }
7416
7417    // Determine shuffle input and normalize the mask.
7418    unsigned SrcIdx = M / Size;
7419    SDValue V = M < Size ? V1 : V2;
7420    M %= Size;
7421
7422    // We are referencing an UNDEF input.
7423    if (V.isUndef()) {
7424      KnownUndef.setBit(i);
7425      continue;
7426    }
7427
7428    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7429    // TODO: We currently only set UNDEF for integer types - floats use the same
7430    // registers as vectors and many of the scalar folded loads rely on the
7431    // SCALAR_TO_VECTOR pattern.
7432    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7433        (Size % V.getValueType().getVectorNumElements()) == 0) {
7434      int Scale = Size / V.getValueType().getVectorNumElements();
7435      int Idx = M / Scale;
7436      if (Idx != 0 && !VT.isFloatingPoint())
7437        KnownUndef.setBit(i);
7438      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7439        KnownZero.setBit(i);
7440      continue;
7441    }
7442
7443    // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7444    // base vectors.
7445    if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7446      SDValue Vec = V.getOperand(0);
7447      int NumVecElts = Vec.getValueType().getVectorNumElements();
7448      if (Vec.isUndef() && Size == NumVecElts) {
7449        int Idx = V.getConstantOperandVal(2);
7450        int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7451        if (M < Idx || (Idx + NumSubElts) <= M)
7452          KnownUndef.setBit(i);
7453      }
7454      continue;
7455    }
7456
7457    // Attempt to extract from the source's constant bits.
7458    if (IsSrcConstant[SrcIdx]) {
7459      if (UndefSrcElts[SrcIdx][M])
7460        KnownUndef.setBit(i);
7461      else if (SrcEltBits[SrcIdx][M] == 0)
7462        KnownZero.setBit(i);
7463    }
7464  }
7465
7466  assert(VT.getVectorNumElements() == (unsigned)Size &&
7467         "Different mask size from vector size!");
7468  return true;
7469}
7470
7471// Replace target shuffle mask elements with known undef/zero sentinels.
7472static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7473                                              const APInt &KnownUndef,
7474                                              const APInt &KnownZero,
7475                                              bool ResolveKnownZeros= true) {
7476  unsigned NumElts = Mask.size();
7477  assert(KnownUndef.getBitWidth() == NumElts &&
7478         KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7479
7480  for (unsigned i = 0; i != NumElts; ++i) {
7481    if (KnownUndef[i])
7482      Mask[i] = SM_SentinelUndef;
7483    else if (ResolveKnownZeros && KnownZero[i])
7484      Mask[i] = SM_SentinelZero;
7485  }
7486}
7487
7488// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7489static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7490                                              APInt &KnownUndef,
7491                                              APInt &KnownZero) {
7492  unsigned NumElts = Mask.size();
7493  KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7494
7495  for (unsigned i = 0; i != NumElts; ++i) {
7496    int M = Mask[i];
7497    if (SM_SentinelUndef == M)
7498      KnownUndef.setBit(i);
7499    if (SM_SentinelZero == M)
7500      KnownZero.setBit(i);
7501  }
7502}
7503
7504// Forward declaration (for getFauxShuffleMask recursive check).
7505// TODO: Use DemandedElts variant.
7506static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7507                                   SmallVectorImpl<int> &Mask,
7508                                   const SelectionDAG &DAG, unsigned Depth,
7509                                   bool ResolveKnownElts);
7510
7511// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements than
// the destination value type.
7514static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7515                               SmallVectorImpl<int> &Mask,
7516                               SmallVectorImpl<SDValue> &Ops,
7517                               const SelectionDAG &DAG, unsigned Depth,
7518                               bool ResolveKnownElts) {
7519  Mask.clear();
7520  Ops.clear();
7521
7522  MVT VT = N.getSimpleValueType();
7523  unsigned NumElts = VT.getVectorNumElements();
7524  unsigned NumSizeInBits = VT.getSizeInBits();
7525  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7526  if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7527    return false;
7528  assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
7529  unsigned NumSizeInBytes = NumSizeInBits / 8;
7530  unsigned NumBytesPerElt = NumBitsPerElt / 8;
7531
7532  unsigned Opcode = N.getOpcode();
7533  switch (Opcode) {
7534  case ISD::VECTOR_SHUFFLE: {
    // We don't treat ISD::VECTOR_SHUFFLE as a target shuffle, so decode it here.
7536    ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7537    if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7538      Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7539      Ops.push_back(N.getOperand(0));
7540      Ops.push_back(N.getOperand(1));
7541      return true;
7542    }
7543    return false;
7544  }
7545  case ISD::AND:
7546  case X86ISD::ANDNP: {
7547    // Attempt to decode as a per-byte mask.
7548    APInt UndefElts;
7549    SmallVector<APInt, 32> EltBits;
7550    SDValue N0 = N.getOperand(0);
7551    SDValue N1 = N.getOperand(1);
7552    bool IsAndN = (X86ISD::ANDNP == Opcode);
7553    uint64_t ZeroMask = IsAndN ? 255 : 0;
7554    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7555      return false;
7556    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7557      if (UndefElts[i]) {
7558        Mask.push_back(SM_SentinelUndef);
7559        continue;
7560      }
7561      const APInt &ByteBits = EltBits[i];
7562      if (ByteBits != 0 && ByteBits != 255)
7563        return false;
7564      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7565    }
7566    Ops.push_back(IsAndN ? N1 : N0);
7567    return true;
7568  }
7569  case ISD::OR: {
7570    // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
7571    // is a valid shuffle index.
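    // E.g. with per-element masks Mask0 = {0, Z, 2, Z} and
    // Mask1 = {Z, 1, Z, 3} (Z = SM_SentinelZero), the OR decodes to the
    // blend mask {0, 5, 2, 7} selecting between N0 and N1.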
7572    SDValue N0 = peekThroughBitcasts(N.getOperand(0));
7573    SDValue N1 = peekThroughBitcasts(N.getOperand(1));
7574    if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7575      return false;
7576    SmallVector<int, 64> SrcMask0, SrcMask1;
7577    SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7578    if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7579                                true) ||
7580        !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7581                                true))
7582      return false;
7583
7584    size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7585    SmallVector<int, 64> Mask0, Mask1;
7586    narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7587    narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7588    for (int i = 0; i != (int)MaskSize; ++i) {
      // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
      // loops converting between OR and BLEND shuffles due to
      // canWidenShuffleElements merging away undef elements, meaning we'd
      // fail to recognise the OR because the undef element isn't known zero.
7593      if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7594        Mask.push_back(SM_SentinelZero);
7595      else if (Mask1[i] == SM_SentinelZero)
7596        Mask.push_back(i);
7597      else if (Mask0[i] == SM_SentinelZero)
7598        Mask.push_back(i + MaskSize);
7599      else
7600        return false;
7601    }
7602    Ops.push_back(N0);
7603    Ops.push_back(N1);
7604    return true;
7605  }
7606  case ISD::INSERT_SUBVECTOR: {
7607    SDValue Src = N.getOperand(0);
7608    SDValue Sub = N.getOperand(1);
7609    EVT SubVT = Sub.getValueType();
7610    unsigned NumSubElts = SubVT.getVectorNumElements();
7611    if (!N->isOnlyUserOf(Sub.getNode()))
7612      return false;
7613    uint64_t InsertIdx = N.getConstantOperandVal(2);
7614    // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
7615    if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7616        Sub.getOperand(0).getValueType() == VT) {
7617      uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7618      for (int i = 0; i != (int)NumElts; ++i)
7619        Mask.push_back(i);
7620      for (int i = 0; i != (int)NumSubElts; ++i)
7621        Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7622      Ops.push_back(Src);
7623      Ops.push_back(Sub.getOperand(0));
7624      return true;
7625    }
7626    // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7627    SmallVector<int, 64> SubMask;
7628    SmallVector<SDValue, 2> SubInputs;
7629    if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7630                                SubMask, DAG, Depth + 1, ResolveKnownElts))
7631      return false;
7632
7633    // Subvector shuffle inputs must not be larger than the subvector.
7634    if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
7635          return SubVT.getFixedSizeInBits() <
7636                 SubInput.getValueSizeInBits().getFixedSize();
7637        }))
7638      return false;
7639
7640    if (SubMask.size() != NumSubElts) {
7641      assert(((SubMask.size() % NumSubElts) == 0 ||
7642              (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
7643      if ((NumSubElts % SubMask.size()) == 0) {
7644        int Scale = NumSubElts / SubMask.size();
        SmallVector<int, 64> ScaledSubMask;
7646        narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
7647        SubMask = ScaledSubMask;
7648      } else {
7649        int Scale = SubMask.size() / NumSubElts;
7650        NumSubElts = SubMask.size();
7651        NumElts *= Scale;
7652        InsertIdx *= Scale;
7653      }
7654    }
7655    Ops.push_back(Src);
7656    Ops.append(SubInputs.begin(), SubInputs.end());
7657    if (ISD::isBuildVectorAllZeros(Src.getNode()))
7658      Mask.append(NumElts, SM_SentinelZero);
7659    else
7660      for (int i = 0; i != (int)NumElts; ++i)
7661        Mask.push_back(i);
7662    for (int i = 0; i != (int)NumSubElts; ++i) {
7663      int M = SubMask[i];
7664      if (0 <= M) {
7665        int InputIdx = M / NumSubElts;
7666        M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7667      }
7668      Mask[i + InsertIdx] = M;
7669    }
7670    return true;
7671  }
7672  case X86ISD::PINSRB:
7673  case X86ISD::PINSRW:
7674  case ISD::SCALAR_TO_VECTOR:
7675  case ISD::INSERT_VECTOR_ELT: {
    // Match against an insert_vector_elt/scalar_to_vector of an extract from a
    // vector, for matching src/dst vector types.
7678    SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
7679
7680    unsigned DstIdx = 0;
7681    if (Opcode != ISD::SCALAR_TO_VECTOR) {
7682      // Check we have an in-range constant insertion index.
7683      if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7684          N.getConstantOperandAPInt(2).uge(NumElts))
7685        return false;
7686      DstIdx = N.getConstantOperandVal(2);
7687
7688      // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
7689      if (X86::isZeroNode(Scl)) {
7690        Ops.push_back(N.getOperand(0));
7691        for (unsigned i = 0; i != NumElts; ++i)
7692          Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
7693        return true;
7694      }
7695    }
7696
7697    // Peek through trunc/aext/zext.
7698    // TODO: aext shouldn't require SM_SentinelZero padding.
7699    // TODO: handle shift of scalars.
7700    unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
7701    while (Scl.getOpcode() == ISD::TRUNCATE ||
7702           Scl.getOpcode() == ISD::ANY_EXTEND ||
7703           Scl.getOpcode() == ISD::ZERO_EXTEND) {
7704      Scl = Scl.getOperand(0);
7705      MinBitsPerElt =
7706          std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
7707    }
7708    if ((MinBitsPerElt % 8) != 0)
7709      return false;
7710
7711    // Attempt to find the source vector the scalar was extracted from.
7712    SDValue SrcExtract;
7713    if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
7714         Scl.getOpcode() == X86ISD::PEXTRW ||
7715         Scl.getOpcode() == X86ISD::PEXTRB) &&
7716        Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
7717      SrcExtract = Scl;
7718    }
7719    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7720      return false;
7721
7722    SDValue SrcVec = SrcExtract.getOperand(0);
7723    EVT SrcVT = SrcVec.getValueType();
7724    if (!SrcVT.getScalarType().isByteSized())
7725      return false;
7726    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7727    unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
7728    unsigned DstByte = DstIdx * NumBytesPerElt;
7729    MinBitsPerElt =
7730        std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
7731
7732    // Create 'identity' byte level shuffle mask and then add inserted bytes.
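    // E.g. a v8i16 PINSRW that inserts element 3 extracted from another v8i16
    // into lane 1 becomes the byte mask {16,17, 6,7, 20,21, ..., 30,31} over
    // the inputs {SrcVec, N.getOperand(0)}.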
7733    if (Opcode == ISD::SCALAR_TO_VECTOR) {
7734      Ops.push_back(SrcVec);
7735      Mask.append(NumSizeInBytes, SM_SentinelUndef);
7736    } else {
7737      Ops.push_back(SrcVec);
7738      Ops.push_back(N.getOperand(0));
7739      for (int i = 0; i != (int)NumSizeInBytes; ++i)
7740        Mask.push_back(NumSizeInBytes + i);
7741    }
7742
7743    unsigned MinBytesPerElts = MinBitsPerElt / 8;
7744    MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
7745    for (unsigned i = 0; i != MinBytesPerElts; ++i)
7746      Mask[DstByte + i] = SrcByte + i;
7747    for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
7748      Mask[DstByte + i] = SM_SentinelZero;
7749    return true;
7750  }
7751  case X86ISD::PACKSS:
7752  case X86ISD::PACKUS: {
7753    SDValue N0 = N.getOperand(0);
7754    SDValue N1 = N.getOperand(1);
7755    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
7756           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
7757           "Unexpected input value type");
7758
7759    APInt EltsLHS, EltsRHS;
7760    getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7761
    // If we know input saturation won't happen (or we don't care about
    // particular lanes), we can treat this as a truncation shuffle.
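    // E.g. a PACKUSWB of two v8i16 inputs whose upper bytes are known zero is
    // just a truncating shuffle that takes the low byte of each input element
    // (per 128-bit lane).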
7764    bool Offset0 = false, Offset1 = false;
7765    if (Opcode == X86ISD::PACKSS) {
7766      if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7767           DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7768          (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7769           DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7770        return false;
7771      // We can't easily fold ASHR into a shuffle, but if it was feeding a
7772      // PACKSS then it was likely being used for sign-extension for a
7773      // truncation, so just peek through and adjust the mask accordingly.
7774      if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
7775          N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
7776        Offset0 = true;
7777        N0 = N0.getOperand(0);
7778      }
7779      if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
7780          N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
7781        Offset1 = true;
7782        N1 = N1.getOperand(0);
7783      }
7784    } else {
7785      APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
7786      if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7787           !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
7788          (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7789           !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
7790        return false;
7791    }
7792
7793    bool IsUnary = (N0 == N1);
7794
7795    Ops.push_back(N0);
7796    if (!IsUnary)
7797      Ops.push_back(N1);
7798
7799    createPackShuffleMask(VT, Mask, IsUnary);
7800
7801    if (Offset0 || Offset1) {
7802      for (int &M : Mask)
7803        if ((Offset0 && isInRange(M, 0, NumElts)) ||
7804            (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
7805          ++M;
7806    }
7807    return true;
7808  }
7809  case X86ISD::VTRUNC: {
7810    SDValue Src = N.getOperand(0);
7811    EVT SrcVT = Src.getValueType();
7812    // Truncated source must be a simple vector.
7813    if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7814        (SrcVT.getScalarSizeInBits() % 8) != 0)
7815      return false;
7816    unsigned NumSrcElts = SrcVT.getVectorNumElements();
7817    unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
7818    unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
7819    assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
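    // E.g. a VTRUNC from v2i64 to v4i32 (VPMOVQD) decodes to the i32-level
    // mask {0, 2, Z, Z}, with the upper elements known zero.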
7820    for (unsigned i = 0; i != NumSrcElts; ++i)
7821      Mask.push_back(i * Scale);
7822    Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
7823    Ops.push_back(Src);
7824    return true;
7825  }
7826  case X86ISD::VSHLI:
7827  case X86ISD::VSRLI: {
7828    uint64_t ShiftVal = N.getConstantOperandVal(1);
7829    // Out of range bit shifts are guaranteed to be zero.
7830    if (NumBitsPerElt <= ShiftVal) {
7831      Mask.append(NumElts, SM_SentinelZero);
7832      return true;
7833    }
7834
7835    // We can only decode 'whole byte' bit shifts as shuffles.
7836    if ((ShiftVal % 8) != 0)
7837      break;
7838
7839    uint64_t ByteShift = ShiftVal / 8;
7840    Ops.push_back(N.getOperand(0));
7841
7842    // Clear mask to all zeros and insert the shifted byte indices.
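    // E.g. a VSHLI of v2i64 by 16 bits becomes the byte mask
    // {Z,Z,0,1,2,3,4,5, Z,Z,8,9,10,11,12,13} (Z = SM_SentinelZero).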
7843    Mask.append(NumSizeInBytes, SM_SentinelZero);
7844
7845    if (X86ISD::VSHLI == Opcode) {
7846      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7847        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7848          Mask[i + j] = i + j - ByteShift;
7849    } else {
7850      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7851        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7852          Mask[i + j - ByteShift] = i + j;
7853    }
7854    return true;
7855  }
7856  case X86ISD::VROTLI:
7857  case X86ISD::VROTRI: {
7858    // We can only decode 'whole byte' bit rotates as shuffles.
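    // E.g. a VROTLI of v4i32 by 8 bits gives the per-element byte pattern
    // {3,0,1,2}, i.e. {3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14}.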
7859    uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
7860    if ((RotateVal % 8) != 0)
7861      return false;
7862    Ops.push_back(N.getOperand(0));
7863    int Offset = RotateVal / 8;
7864    Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
7865    for (int i = 0; i != (int)NumElts; ++i) {
7866      int BaseIdx = i * NumBytesPerElt;
7867      for (int j = 0; j != (int)NumBytesPerElt; ++j) {
7868        Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
7869      }
7870    }
7871    return true;
7872  }
7873  case X86ISD::VBROADCAST: {
7874    SDValue Src = N.getOperand(0);
7875    if (!Src.getSimpleValueType().isVector()) {
7876      if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7877          !isNullConstant(Src.getOperand(1)) ||
7878          Src.getOperand(0).getValueType().getScalarType() !=
7879              VT.getScalarType())
7880        return false;
7881      Src = Src.getOperand(0);
7882    }
7883    Ops.push_back(Src);
7884    Mask.append(NumElts, 0);
7885    return true;
7886  }
7887  case ISD::ZERO_EXTEND:
7888  case ISD::ANY_EXTEND:
7889  case ISD::ZERO_EXTEND_VECTOR_INREG:
7890  case ISD::ANY_EXTEND_VECTOR_INREG: {
7891    SDValue Src = N.getOperand(0);
7892    EVT SrcVT = Src.getValueType();
7893
7894    // Extended source must be a simple vector.
7895    if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7896        (SrcVT.getScalarSizeInBits() % 8) != 0)
7897      return false;
7898
7899    bool IsAnyExtend =
7900        (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
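    // E.g. a zero_extend_vector_inreg from v8i16 to v4i32 decodes to the
    // v8i16-level mask {0, Z, 1, Z, 2, Z, 3, Z} (Z = SM_SentinelZero).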
7901    DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
7902                         IsAnyExtend, Mask);
7903    Ops.push_back(Src);
7904    return true;
7905  }
7906  }
7907
7908  return false;
7909}
7910
7911/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
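/// E.g. if Inputs[1] is the same node as Inputs[0], mask references into the
/// second copy are remapped into the first copy and the duplicate input is
/// dropped.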
7912static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
7913                                              SmallVectorImpl<int> &Mask) {
7914  int MaskWidth = Mask.size();
7915  SmallVector<SDValue, 16> UsedInputs;
7916  for (int i = 0, e = Inputs.size(); i < e; ++i) {
7917    int lo = UsedInputs.size() * MaskWidth;
7918    int hi = lo + MaskWidth;
7919
7920    // Strip UNDEF input usage.
7921    if (Inputs[i].isUndef())
7922      for (int &M : Mask)
7923        if ((lo <= M) && (M < hi))
7924          M = SM_SentinelUndef;
7925
7926    // Check for unused inputs.
7927    if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7928      for (int &M : Mask)
7929        if (lo <= M)
7930          M -= MaskWidth;
7931      continue;
7932    }
7933
7934    // Check for repeated inputs.
7935    bool IsRepeat = false;
7936    for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7937      if (UsedInputs[j] != Inputs[i])
7938        continue;
7939      for (int &M : Mask)
7940        if (lo <= M)
7941          M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7942      IsRepeat = true;
7943      break;
7944    }
7945    if (IsRepeat)
7946      continue;
7947
7948    UsedInputs.push_back(Inputs[i]);
7949  }
7950  Inputs = UsedInputs;
7951}
7952
/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's
/// inputs and then sets the SM_SentinelUndef and SM_SentinelZero values,
/// falling back to getFauxShuffleMask if the node isn't a regular target
/// shuffle. Returns true if the target shuffle mask was decoded.
7956static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7957                                   SmallVectorImpl<SDValue> &Inputs,
7958                                   SmallVectorImpl<int> &Mask,
7959                                   APInt &KnownUndef, APInt &KnownZero,
7960                                   const SelectionDAG &DAG, unsigned Depth,
7961                                   bool ResolveKnownElts) {
7962  EVT VT = Op.getValueType();
7963  if (!VT.isSimple() || !VT.isVector())
7964    return false;
7965
7966  if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
7967    if (ResolveKnownElts)
7968      resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7969    return true;
7970  }
7971  if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7972                         ResolveKnownElts)) {
7973    resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7974    return true;
7975  }
7976  return false;
7977}
7978
7979static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7980                                   SmallVectorImpl<int> &Mask,
7981                                   const SelectionDAG &DAG, unsigned Depth = 0,
7982                                   bool ResolveKnownElts = true) {
7983  EVT VT = Op.getValueType();
7984  if (!VT.isSimple() || !VT.isVector())
7985    return false;
7986
7987  APInt KnownUndef, KnownZero;
7988  unsigned NumElts = Op.getValueType().getVectorNumElements();
7989  APInt DemandedElts = APInt::getAllOnesValue(NumElts);
7990  return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7991                                KnownZero, DAG, Depth, ResolveKnownElts);
7992}
7993
/// Returns the scalar element that will make up the Index'th element of the
/// result of the vector shuffle.
7996static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
7997                                   SelectionDAG &DAG, unsigned Depth) {
7998  if (Depth >= SelectionDAG::MaxRecursionDepth)
7999    return SDValue(); // Limit search depth.
8000
8001  EVT VT = Op.getValueType();
8002  unsigned Opcode = Op.getOpcode();
8003  unsigned NumElems = VT.getVectorNumElements();
8004
8005  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8006  if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8007    int Elt = SV->getMaskElt(Index);
8008
8009    if (Elt < 0)
8010      return DAG.getUNDEF(VT.getVectorElementType());
8011
8012    SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8013    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8014  }
8015
8016  // Recurse into target specific vector shuffles to find scalars.
8017  if (isTargetShuffle(Opcode)) {
8018    MVT ShufVT = VT.getSimpleVT();
8019    MVT ShufSVT = ShufVT.getVectorElementType();
8020    int NumElems = (int)ShufVT.getVectorNumElements();
8021    SmallVector<int, 16> ShuffleMask;
8022    SmallVector<SDValue, 16> ShuffleOps;
8023    if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8024                              ShuffleMask))
8025      return SDValue();
8026
8027    int Elt = ShuffleMask[Index];
8028    if (Elt == SM_SentinelZero)
8029      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8030                                 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8031    if (Elt == SM_SentinelUndef)
8032      return DAG.getUNDEF(ShufSVT);
8033
8034    assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8035    SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8036    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8037  }
8038
8039  // Recurse into insert_subvector base/sub vector to find scalars.
8040  if (Opcode == ISD::INSERT_SUBVECTOR) {
8041    SDValue Vec = Op.getOperand(0);
8042    SDValue Sub = Op.getOperand(1);
8043    uint64_t SubIdx = Op.getConstantOperandVal(2);
8044    unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8045
8046    if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8047      return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8048    return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8049  }
8050
8051  // Recurse into concat_vectors sub vector to find scalars.
8052  if (Opcode == ISD::CONCAT_VECTORS) {
8053    EVT SubVT = Op.getOperand(0).getValueType();
8054    unsigned NumSubElts = SubVT.getVectorNumElements();
8055    uint64_t SubIdx = Index / NumSubElts;
8056    uint64_t SubElt = Index % NumSubElts;
8057    return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8058  }
8059
8060  // Recurse into extract_subvector src vector to find scalars.
8061  if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8062    SDValue Src = Op.getOperand(0);
8063    uint64_t SrcIdx = Op.getConstantOperandVal(1);
8064    return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8065  }
8066
8067  // We only peek through bitcasts of the same vector width.
8068  if (Opcode == ISD::BITCAST) {
8069    SDValue Src = Op.getOperand(0);
8070    EVT SrcVT = Src.getValueType();
8071    if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8072      return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8073    return SDValue();
8074  }
8075
  // Actual nodes that may contain scalar elements.
8077
8078  // For insert_vector_elt - either return the index matching scalar or recurse
8079  // into the base vector.
8080  if (Opcode == ISD::INSERT_VECTOR_ELT &&
8081      isa<ConstantSDNode>(Op.getOperand(2))) {
8082    if (Op.getConstantOperandAPInt(2) == Index)
8083      return Op.getOperand(1);
8084    return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8085  }
8086
8087  if (Opcode == ISD::SCALAR_TO_VECTOR)
8088    return (Index == 0) ? Op.getOperand(0)
8089                        : DAG.getUNDEF(VT.getVectorElementType());
8090
8091  if (Opcode == ISD::BUILD_VECTOR)
8092    return Op.getOperand(Index);
8093
8094  return SDValue();
8095}
8096
8097// Use PINSRB/PINSRW/PINSRD to create a build vector.
8098static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8099                                        unsigned NumNonZero, unsigned NumZero,
8100                                        SelectionDAG &DAG,
8101                                        const X86Subtarget &Subtarget) {
8102  MVT VT = Op.getSimpleValueType();
8103  unsigned NumElts = VT.getVectorNumElements();
8104  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8105          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8106         "Illegal vector insertion");
8107
8108  SDLoc dl(Op);
8109  SDValue V;
8110  bool First = true;
8111
8112  for (unsigned i = 0; i < NumElts; ++i) {
8113    bool IsNonZero = NonZeroMask[i];
8114    if (!IsNonZero)
8115      continue;
8116
    // If the build vector contains zeros or our first insertion is not the
    // first index, then insert into a zero vector to break any register
    // dependency; else use SCALAR_TO_VECTOR.
8120    if (First) {
8121      First = false;
8122      if (NumZero || 0 != i)
8123        V = getZeroVector(VT, Subtarget, DAG, dl);
8124      else {
8125        assert(0 == i && "Expected insertion into zero-index");
8126        V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8127        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8128        V = DAG.getBitcast(VT, V);
8129        continue;
8130      }
8131    }
8132    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8133                    DAG.getIntPtrConstant(i, dl));
8134  }
8135
8136  return V;
8137}
8138
8139/// Custom lower build_vector of v16i8.
8140static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8141                                     unsigned NumNonZero, unsigned NumZero,
8142                                     SelectionDAG &DAG,
8143                                     const X86Subtarget &Subtarget) {
8144  if (NumNonZero > 8 && !Subtarget.hasSSE41())
8145    return SDValue();
8146
8147  // SSE4.1 - use PINSRB to insert each byte directly.
8148  if (Subtarget.hasSSE41())
8149    return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8150                                    Subtarget);
8151
8152  SDLoc dl(Op);
8153  SDValue V;
8154
8155  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
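  // Each pair of adjacent bytes (lo, hi) is combined as (lo | (hi << 8)) and
  // inserted as a single 16-bit element.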
8156  for (unsigned i = 0; i < 16; i += 2) {
8157    bool ThisIsNonZero = NonZeroMask[i];
8158    bool NextIsNonZero = NonZeroMask[i + 1];
8159    if (!ThisIsNonZero && !NextIsNonZero)
8160      continue;
8161
    // FIXME: Investigate combining the first 4 bytes as an i32 instead.
8163    SDValue Elt;
8164    if (ThisIsNonZero) {
8165      if (NumZero || NextIsNonZero)
8166        Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8167      else
8168        Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8169    }
8170
8171    if (NextIsNonZero) {
8172      SDValue NextElt = Op.getOperand(i + 1);
8173      if (i == 0 && NumZero)
8174        NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8175      else
8176        NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8177      NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8178                            DAG.getConstant(8, dl, MVT::i8));
8179      if (ThisIsNonZero)
8180        Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8181      else
8182        Elt = NextElt;
8183    }
8184
8185    // If our first insertion is not the first index or zeros are needed, then
8186    // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8187    // elements undefined).
8188    if (!V) {
8189      if (i != 0 || NumZero)
8190        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8191      else {
8192        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8193        V = DAG.getBitcast(MVT::v8i16, V);
8194        continue;
8195      }
8196    }
8197    Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8198    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8199                    DAG.getIntPtrConstant(i / 2, dl));
8200  }
8201
8202  return DAG.getBitcast(MVT::v16i8, V);
8203}
8204
8205/// Custom lower build_vector of v8i16.
8206static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8207                                     unsigned NumNonZero, unsigned NumZero,
8208                                     SelectionDAG &DAG,
8209                                     const X86Subtarget &Subtarget) {
8210  if (NumNonZero > 4 && !Subtarget.hasSSE41())
8211    return SDValue();
8212
  // Use PINSRW to insert each 16-bit element directly.
8214  return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8215                                  Subtarget);
8216}
8217
8218/// Custom lower build_vector of v4i32 or v4f32.
8219static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8220                                     const X86Subtarget &Subtarget) {
8221  // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8222  // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8223  // Because we're creating a less complicated build vector here, we may enable
8224  // further folding of the MOVDDUP via shuffle transforms.
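  // E.g. build_vector <a,b,a,b> becomes a MOVDDUP of the v2f64 bitcast of
  // build_vector <a,b,undef,undef>.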
8225  if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8226      Op.getOperand(0) == Op.getOperand(2) &&
8227      Op.getOperand(1) == Op.getOperand(3) &&
8228      Op.getOperand(0) != Op.getOperand(1)) {
8229    SDLoc DL(Op);
8230    MVT VT = Op.getSimpleValueType();
8231    MVT EltVT = VT.getVectorElementType();
8232    // Create a new build vector with the first 2 elements followed by undef
8233    // padding, bitcast to v2f64, duplicate, and bitcast back.
8234    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8235                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8236    SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8237    SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8238    return DAG.getBitcast(VT, Dup);
8239  }
8240
8241  // Find all zeroable elements.
8242  std::bitset<4> Zeroable, Undefs;
8243  for (int i = 0; i < 4; ++i) {
8244    SDValue Elt = Op.getOperand(i);
8245    Undefs[i] = Elt.isUndef();
8246    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8247  }
8248  assert(Zeroable.size() - Zeroable.count() > 1 &&
8249         "We expect at least two non-zero elements!");
8250
8251  // We only know how to deal with build_vector nodes where elements are either
8252  // zeroable or extract_vector_elt with constant index.
8253  SDValue FirstNonZero;
8254  unsigned FirstNonZeroIdx;
8255  for (unsigned i = 0; i < 4; ++i) {
8256    if (Zeroable[i])
8257      continue;
8258    SDValue Elt = Op.getOperand(i);
8259    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8260        !isa<ConstantSDNode>(Elt.getOperand(1)))
8261      return SDValue();
8262    // Make sure that this node is extracting from a 128-bit vector.
8263    MVT VT = Elt.getOperand(0).getSimpleValueType();
8264    if (!VT.is128BitVector())
8265      return SDValue();
8266    if (!FirstNonZero.getNode()) {
8267      FirstNonZero = Elt;
8268      FirstNonZeroIdx = i;
8269    }
8270  }
8271
8272  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8273  SDValue V1 = FirstNonZero.getOperand(0);
8274  MVT VT = V1.getSimpleValueType();
8275
8276  // See if this build_vector can be lowered as a blend with zero.
8277  SDValue Elt;
8278  unsigned EltMaskIdx, EltIdx;
8279  int Mask[4];
8280  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8281    if (Zeroable[EltIdx]) {
8282      // The zero vector will be on the right hand side.
8283      Mask[EltIdx] = EltIdx+4;
8284      continue;
8285    }
8286
8287    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
8289    EltMaskIdx = Elt.getConstantOperandVal(1);
8290    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8291      break;
8292    Mask[EltIdx] = EltIdx;
8293  }
8294
8295  if (EltIdx == 4) {
8296    // Let the shuffle legalizer deal with blend operations.
8297    SDValue VZeroOrUndef = (Zeroable == Undefs)
8298                               ? DAG.getUNDEF(VT)
8299                               : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8300    if (V1.getSimpleValueType() != VT)
8301      V1 = DAG.getBitcast(VT, V1);
8302    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8303  }
8304
  // See if we can lower this build_vector to an INSERTPS.
8306  if (!Subtarget.hasSSE41())
8307    return SDValue();
8308
8309  SDValue V2 = Elt.getOperand(0);
8310  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8311    V1 = SDValue();
8312
8313  bool CanFold = true;
8314  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8315    if (Zeroable[i])
8316      continue;
8317
8318    SDValue Current = Op->getOperand(i);
8319    SDValue SrcVector = Current->getOperand(0);
8320    if (!V1.getNode())
8321      V1 = SrcVector;
8322    CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8323  }
8324
8325  if (!CanFold)
8326    return SDValue();
8327
8328  assert(V1.getNode() && "Expected at least two non-zero elements!");
8329  if (V1.getSimpleValueType() != MVT::v4f32)
8330    V1 = DAG.getBitcast(MVT::v4f32, V1);
8331  if (V2.getSimpleValueType() != MVT::v4f32)
8332    V2 = DAG.getBitcast(MVT::v4f32, V2);
8333
8334  // Ok, we can emit an INSERTPS instruction.
8335  unsigned ZMask = Zeroable.to_ulong();
8336
8337  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8338  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8339  SDLoc DL(Op);
8340  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8341                               DAG.getIntPtrConstant(InsertPSMask, DL, true));
8342  return DAG.getBitcast(VT, Result);
8343}
8344
8345/// Return a vector logical shift node.
8346static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8347                         SelectionDAG &DAG, const TargetLowering &TLI,
8348                         const SDLoc &dl) {
8349  assert(VT.is128BitVector() && "Unknown type for VShift");
8350  MVT ShVT = MVT::v16i8;
8351  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8352  SrcOp = DAG.getBitcast(ShVT, SrcOp);
8353  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8354  SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8355  return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8356}
8357
8358static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8359                                      SelectionDAG &DAG) {
8360
  // Check if the scalar load can be widened into a vector load and, if the
  // address is "base + cst", see if the cst can be "absorbed" into the
  // shuffle mask.
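  // E.g. a 32-bit scalar load from a 16-byte aligned stack slot at offset 8
  // can be widened to a 16-byte vector load of the slot followed by a
  // <2,2,2,2> splat shuffle.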
8364  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8365    SDValue Ptr = LD->getBasePtr();
8366    if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8367      return SDValue();
8368    EVT PVT = LD->getValueType(0);
8369    if (PVT != MVT::i32 && PVT != MVT::f32)
8370      return SDValue();
8371
8372    int FI = -1;
8373    int64_t Offset = 0;
8374    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8375      FI = FINode->getIndex();
8376      Offset = 0;
8377    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8378               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8379      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8380      Offset = Ptr.getConstantOperandVal(1);
8381      Ptr = Ptr.getOperand(0);
8382    } else {
8383      return SDValue();
8384    }
8385
    // FIXME: 256-bit vector instructions don't require a strict alignment;
    // improve this code to support it better.
8388    Align RequiredAlign(VT.getSizeInBits() / 8);
8389    SDValue Chain = LD->getChain();
8390    // Make sure the stack object alignment is at least 16 or 32.
8391    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8392    MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8393    if (!InferredAlign || *InferredAlign < RequiredAlign) {
8394      if (MFI.isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute the
        // exact stack offset and reference FI + adjust offset instead, if
        // someone *really* cares about this. That's the way to implement it.
8398        return SDValue();
8399      } else {
8400        MFI.setObjectAlignment(FI, RequiredAlign);
8401      }
8402    }
8403
    // (Offset % 16 or 32) must be a multiple of 4. The address is then
    // Ptr + (Offset & ~15).
8406    if (Offset < 0)
8407      return SDValue();
8408    if ((Offset % RequiredAlign.value()) & 3)
8409      return SDValue();
8410    int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8411    if (StartOffset) {
8412      SDLoc DL(Ptr);
8413      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8414                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8415    }
8416
8417    int EltNo = (Offset - StartOffset) >> 2;
8418    unsigned NumElems = VT.getVectorNumElements();
8419
8420    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8421    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8422                             LD->getPointerInfo().getWithOffset(StartOffset));
8423
8424    SmallVector<int, 8> Mask(NumElems, EltNo);
8425
8426    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8427  }
8428
8429  return SDValue();
8430}
8431
// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
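// E.g. (trunc (srl (load i64 %p), 32)) yields the i64 load with ByteOffset 4.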
8433static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8434  if (ISD::isNON_EXTLoad(Elt.getNode())) {
8435    auto *BaseLd = cast<LoadSDNode>(Elt);
8436    if (!BaseLd->isSimple())
8437      return false;
8438    Ld = BaseLd;
8439    ByteOffset = 0;
8440    return true;
8441  }
8442
8443  switch (Elt.getOpcode()) {
8444  case ISD::BITCAST:
8445  case ISD::TRUNCATE:
8446  case ISD::SCALAR_TO_VECTOR:
8447    return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8448  case ISD::SRL:
8449    if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8450      uint64_t Idx = IdxC->getZExtValue();
8451      if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8452        ByteOffset += Idx / 8;
8453        return true;
8454      }
8455    }
8456    break;
8457  case ISD::EXTRACT_VECTOR_ELT:
8458    if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8459      SDValue Src = Elt.getOperand(0);
8460      unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8461      unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8462      if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8463          findEltLoadSrc(Src, Ld, ByteOffset)) {
8464        uint64_t Idx = IdxC->getZExtValue();
8465        ByteOffset += Idx * (SrcSizeInBits / 8);
8466        return true;
8467      }
8468    }
8469    break;
8470  }
8471
8472  return false;
8473}
8474
8475/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8476/// elements can be replaced by a single large load which has the same value as
8477/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8478///
8479/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8480static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8481                                        const SDLoc &DL, SelectionDAG &DAG,
8482                                        const X86Subtarget &Subtarget,
8483                                        bool isAfterLegalize) {
8484  if ((VT.getScalarSizeInBits() % 8) != 0)
8485    return SDValue();
8486
8487  unsigned NumElems = Elts.size();
8488
8489  int LastLoadedElt = -1;
8490  APInt LoadMask = APInt::getNullValue(NumElems);
8491  APInt ZeroMask = APInt::getNullValue(NumElems);
8492  APInt UndefMask = APInt::getNullValue(NumElems);
8493
8494  SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8495  SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8496
8497  // For each element in the initializer, see if we've found a load, zero or an
8498  // undef.
8499  for (unsigned i = 0; i < NumElems; ++i) {
8500    SDValue Elt = peekThroughBitcasts(Elts[i]);
8501    if (!Elt.getNode())
8502      return SDValue();
8503    if (Elt.isUndef()) {
8504      UndefMask.setBit(i);
8505      continue;
8506    }
8507    if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8508      ZeroMask.setBit(i);
8509      continue;
8510    }
8511
8512    // Each loaded element must be the correct fractional portion of the
8513    // requested vector load.
8514    unsigned EltSizeInBits = Elt.getValueSizeInBits();
8515    if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8516      return SDValue();
8517
8518    if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8519      return SDValue();
8520    unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8521    if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8522      return SDValue();
8523
8524    LoadMask.setBit(i);
8525    LastLoadedElt = i;
8526  }
8527  assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
8528          LoadMask.countPopulation()) == NumElems &&
8529         "Incomplete element masks");
8530
8531  // Handle Special Cases - all undef or undef/zero.
8532  if (UndefMask.countPopulation() == NumElems)
8533    return DAG.getUNDEF(VT);
8534  if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8535    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8536                          : DAG.getConstantFP(0.0, DL, VT);
8537
8538  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8539  int FirstLoadedElt = LoadMask.countTrailingZeros();
8540  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
8541  EVT EltBaseVT = EltBase.getValueType();
8542  assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
8543         "Register/Memory size mismatch");
8544  LoadSDNode *LDBase = Loads[FirstLoadedElt];
8545  assert(LDBase && "Did not find base load for merging consecutive loads");
8546  unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
8547  unsigned BaseSizeInBytes = BaseSizeInBits / 8;
8548  int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
8549  int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
8550  assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
8551
8552  // TODO: Support offsetting the base load.
8553  if (ByteOffsets[FirstLoadedElt] != 0)
8554    return SDValue();
8555
8556  // Check to see if the element's load is consecutive to the base load
8557  // or offset from a previous (already checked) load.
8558  auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
8559    LoadSDNode *Ld = Loads[EltIdx];
8560    int64_t ByteOffset = ByteOffsets[EltIdx];
8561    if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
8562      int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
8563      return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
8564              Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
8565    }
8566    return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
8567                                              EltIdx - FirstLoadedElt);
8568  };
8569
  // Consecutive loads can contain UNDEFs but not ZERO elements.
  // Consecutive loads with UNDEF and ZERO elements require an
  // additional shuffle stage to clear the ZERO elements.
8573  bool IsConsecutiveLoad = true;
8574  bool IsConsecutiveLoadWithZeros = true;
8575  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
8576    if (LoadMask[i]) {
8577      if (!CheckConsecutiveLoad(LDBase, i)) {
8578        IsConsecutiveLoad = false;
8579        IsConsecutiveLoadWithZeros = false;
8580        break;
8581      }
8582    } else if (ZeroMask[i]) {
8583      IsConsecutiveLoad = false;
8584    }
8585  }
8586
8587  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
8588    auto MMOFlags = LDBase->getMemOperand()->getFlags();
8589    assert(LDBase->isSimple() &&
8590           "Cannot merge volatile or atomic loads.");
8591    SDValue NewLd =
8592        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
8593                    LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
8594                    MMOFlags);
8595    for (auto *LD : Loads)
8596      if (LD)
8597        DAG.makeEquivalentMemoryOrdering(LD, NewLd);
8598    return NewLd;
8599  };
8600
8601  // Check if the base load is entirely dereferenceable.
8602  bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8603      VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8604
8605  // LOAD - all consecutive load/undefs (must start/end with a load or be
8606  // entirely dereferenceable). If we have found an entire vector of loads and
8607  // undefs, then return a large load of the entire vector width starting at the
8608  // base pointer. If the vector contains zeros, then attempt to shuffle those
8609  // elements.
8610  if (FirstLoadedElt == 0 &&
8611      (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
8612      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8613    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8614      return SDValue();
8615
8616    // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8617    // will lower to regular temporal loads and use the cache.
8618    if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8619        VT.is256BitVector() && !Subtarget.hasInt256())
8620      return SDValue();
8621
8622    if (NumElems == 1)
8623      return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8624
8625    if (!ZeroMask)
8626      return CreateLoad(VT, LDBase);
8627
8628    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8629    // vector and a zero vector to clear out the zero elements.
8630    if (!isAfterLegalize && VT.isVector()) {
8631      unsigned NumMaskElts = VT.getVectorNumElements();
8632      if ((NumMaskElts % NumElems) == 0) {
8633        unsigned Scale = NumMaskElts / NumElems;
8634        SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8635        for (unsigned i = 0; i < NumElems; ++i) {
8636          if (UndefMask[i])
8637            continue;
8638          int Offset = ZeroMask[i] ? NumMaskElts : 0;
8639          for (unsigned j = 0; j != Scale; ++j)
8640            ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8641        }
8642        SDValue V = CreateLoad(VT, LDBase);
8643        SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8644                                   : DAG.getConstantFP(0.0, DL, VT);
8645        return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8646      }
8647    }
8648  }
8649
8650  // If the upper half of a ymm/zmm load is undef then just load the lower half.
8651  if (VT.is256BitVector() || VT.is512BitVector()) {
8652    unsigned HalfNumElems = NumElems / 2;
8653    if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8654      EVT HalfVT =
8655          EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8656      SDValue HalfLD =
8657          EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8658                                   DAG, Subtarget, isAfterLegalize);
8659      if (HalfLD)
8660        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8661                           HalfLD, DAG.getIntPtrConstant(0, DL));
8662    }
8663  }
8664
8665  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
8666  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8667      (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
8668      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8669    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8670                                      : MVT::getIntegerVT(LoadSizeInBits);
8671    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
8672    // Allow v4f32 on SSE1 only targets.
8673    // FIXME: Add more isel patterns so we can just use VT directly.
8674    if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
8675      VecVT = MVT::v4f32;
8676    if (TLI.isTypeLegal(VecVT)) {
8677      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8678      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8679      SDValue ResNode = DAG.getMemIntrinsicNode(
8680          X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
8681          LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
8682      for (auto *LD : Loads)
8683        if (LD)
8684          DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8685      return DAG.getBitcast(VT, ResNode);
8686    }
8687  }
8688
8689  // BROADCAST - match the smallest possible repetition pattern, load that
8690  // scalar/subvector element and then broadcast to the entire vector.
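  // E.g. a v4i32 build_vector <load a, load a+4, load a, load a+4> repeats
  // every 64 bits, so we try a single 64-bit load that is then broadcast to
  // the full vector width.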
8691  if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8692      (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8693    for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8694      unsigned RepeatSize = SubElems * BaseSizeInBits;
8695      unsigned ScalarSize = std::min(RepeatSize, 64u);
8696      if (!Subtarget.hasAVX2() && ScalarSize < 32)
8697        continue;
8698
      // Don't attempt a 1:N subvector broadcast - it should be caught by
      // combineConcatVectorOps, else it will cause infinite loops.
8701      if (RepeatSize > ScalarSize && SubElems == 1)
8702        continue;
8703
8704      bool Match = true;
8705      SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8706      for (unsigned i = 0; i != NumElems && Match; ++i) {
8707        if (!LoadMask[i])
8708          continue;
8709        SDValue Elt = peekThroughBitcasts(Elts[i]);
8710        if (RepeatedLoads[i % SubElems].isUndef())
8711          RepeatedLoads[i % SubElems] = Elt;
8712        else
8713          Match &= (RepeatedLoads[i % SubElems] == Elt);
8714      }
8715
8716      // We must have loads at both ends of the repetition.
8717      Match &= !RepeatedLoads.front().isUndef();
8718      Match &= !RepeatedLoads.back().isUndef();
8719      if (!Match)
8720        continue;
8721
8722      EVT RepeatVT =
8723          VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8724              ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8725              : EVT::getFloatingPointVT(ScalarSize);
8726      if (RepeatSize > ScalarSize)
8727        RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8728                                    RepeatSize / ScalarSize);
8729      EVT BroadcastVT =
8730          EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8731                           VT.getSizeInBits() / ScalarSize);
8732      if (TLI.isTypeLegal(BroadcastVT)) {
8733        if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8734                RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
8735          SDValue Broadcast = RepeatLoad;
8736          if (RepeatSize > ScalarSize) {
8737            while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
8738              Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
8739          } else {
8740            Broadcast =
8741                DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
8742          }
8743          return DAG.getBitcast(VT, Broadcast);
8744        }
8745      }
8746    }
8747  }
8748
8749  return SDValue();
8750}
8751
// Combine a vector op (shuffle etc.) that is equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
8755static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
8756                                         SelectionDAG &DAG,
8757                                         const X86Subtarget &Subtarget,
8758                                         bool isAfterLegalize) {
8759  SmallVector<SDValue, 64> Elts;
8760  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8761    if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
8762      Elts.push_back(Elt);
8763      continue;
8764    }
8765    return SDValue();
8766  }
8767  assert(Elts.size() == VT.getVectorNumElements());
8768  return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8769                                  isAfterLegalize);
8770}
8771
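// Split SplatValue into SplatBitSize / ScalarSize scalar pieces (starting
// from the low bits) and return them as a ConstantVector, e.g. a 64-bit
// splat value for a v8i32 type becomes a <2 x i32> constant vector.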
8772static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8773                                   unsigned SplatBitSize, LLVMContext &C) {
8774  unsigned ScalarSize = VT.getScalarSizeInBits();
8775  unsigned NumElm = SplatBitSize / ScalarSize;
8776
8777  SmallVector<Constant *, 32> ConstantVec;
8778  for (unsigned i = 0; i < NumElm; i++) {
8779    APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
8780    Constant *Const;
8781    if (VT.isFloatingPoint()) {
8782      if (ScalarSize == 32) {
8783        Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8784      } else {
8785        assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8786        Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8787      }
8788    } else
8789      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8790    ConstantVec.push_back(Const);
8791  }
8792  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8793}
8794
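// Check whether N is used (possibly via a bitcast) by a target shuffle that
// could fold a constant pool load, in which case keeping the constant
// BUILD_VECTOR is preferable to forming a broadcast. VPERMV/VPERMV3 index
// operands are never foldable.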
8795static bool isFoldableUseOfShuffle(SDNode *N) {
8796  for (auto *U : N->uses()) {
8797    unsigned Opc = U->getOpcode();
8798    // VPERMV/VPERMV3 shuffles can never fold their index operands.
8799    if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8800      return false;
8801    if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8802      return false;
8803    if (isTargetShuffle(Opc))
8804      return true;
8805    if (Opc == ISD::BITCAST) // Ignore bitcasts
8806      return isFoldableUseOfShuffle(U);
8807    if (N->hasOneUse())
8808      return true;
8809  }
8810  return false;
8811}
8812
8813/// Attempt to use the vbroadcast instruction to generate a splat value
8814/// from a splat BUILD_VECTOR which uses:
8815///  a. A single scalar load, or a constant.
8816///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8817///
8818/// The VBROADCAST node is returned when a pattern is found,
8819/// or SDValue() otherwise.
8820static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
8821                                           const X86Subtarget &Subtarget,
8822                                           SelectionDAG &DAG) {
8823  // VBROADCAST requires AVX.
8824  // TODO: Splats could be generated for non-AVX CPUs using SSE
8825  // instructions, but there's less potential gain for only 128-bit vectors.
8826  if (!Subtarget.hasAVX())
8827    return SDValue();
8828
8829  MVT VT = BVOp->getSimpleValueType(0);
8830  unsigned NumElts = VT.getVectorNumElements();
8831  SDLoc dl(BVOp);
8832
8833  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8834         "Unsupported vector type for broadcast.");
8835
8836  // See if the build vector is a repeating sequence of scalars (inc. splat).
8837  SDValue Ld;
8838  BitVector UndefElements;
8839  SmallVector<SDValue, 16> Sequence;
8840  if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
8841    assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
8842    if (Sequence.size() == 1)
8843      Ld = Sequence[0];
8844  }
8845
8846  // Attempt to use VBROADCASTM
8847  // From this pattern:
8848  // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8849  // b. t1 = (build_vector t0 t0)
8850  //
8851  // Create (VBROADCASTM v2i1 X)
8852  if (!Sequence.empty() && Subtarget.hasCDI()) {
8853    // If not a splat, are the upper sequence values zeroable?
8854    unsigned SeqLen = Sequence.size();
8855    bool UpperZeroOrUndef =
8856        SeqLen == 1 ||
8857        llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
8858          return !V || V.isUndef() || isNullConstant(V);
8859        });
8860    SDValue Op0 = Sequence[0];
8861    if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
8862                             (Op0.getOpcode() == ISD::ZERO_EXTEND &&
8863                              Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
8864      SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
8865                             ? Op0.getOperand(0)
8866                             : Op0.getOperand(0).getOperand(0);
8867      MVT MaskVT = BOperand.getSimpleValueType();
8868      MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
8869      if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||  // for broadcastmb2q
8870          (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8871        MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
8872        if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
8873          unsigned Scale = 512 / VT.getSizeInBits();
8874          BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
8875        }
8876        SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8877        if (BcstVT.getSizeInBits() != VT.getSizeInBits())
8878          Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
8879        return DAG.getBitcast(VT, Bcst);
8880      }
8881    }
8882  }
8883
8884  unsigned NumUndefElts = UndefElements.count();
8885  if (!Ld || (NumElts - NumUndefElts) <= 1) {
8886    APInt SplatValue, Undef;
8887    unsigned SplatBitSize;
8888    bool HasUndef;
8889    // Check if this is a repeated constant pattern suitable for broadcasting.
8890    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8891        SplatBitSize > VT.getScalarSizeInBits() &&
8892        SplatBitSize < VT.getSizeInBits()) {
      // Avoid replacing with a broadcast when the value is used by a shuffle
      // instruction, to preserve the existing custom lowering of shuffles.
      if (isFoldableUseOfShuffle(BVOp))
        return SDValue();
      // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
8898      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8899      LLVMContext *Ctx = DAG.getContext();
8900      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8901      if (Subtarget.hasAVX()) {
8902        if (SplatBitSize == 32 || SplatBitSize == 64 ||
8903            (SplatBitSize < 32 && Subtarget.hasAVX2())) {
          // The splatted value can fit in one integer constant-pool entry.
8905          // Load the constant and broadcast it.
8906          MVT CVT = MVT::getIntegerVT(SplatBitSize);
8907          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
8908          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
8909          SDValue CP = DAG.getConstantPool(C, PVT);
8910          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8911
8912          Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8913          SDVTList Tys =
8914              DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
8915          SDValue Ops[] = {DAG.getEntryNode(), CP};
8916          MachinePointerInfo MPI =
8917              MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8918          SDValue Brdcst = DAG.getMemIntrinsicNode(
8919              X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
8920              MachineMemOperand::MOLoad);
8921          return DAG.getBitcast(VT, Brdcst);
8922        }
8923        if (SplatBitSize > 64) {
8924          // Load the vector of constants and broadcast it.
8925          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
8926                                             *Ctx);
8927          SDValue VCP = DAG.getConstantPool(VecC, PVT);
8928          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8929          MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
8930          Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
8931          SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8932          SDValue Ops[] = {DAG.getEntryNode(), VCP};
8933          MachinePointerInfo MPI =
8934              MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8935          return DAG.getMemIntrinsicNode(
8936              X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
8937              MachineMemOperand::MOLoad);
8938        }
8939      }
8940    }
8941
8942    // If we are moving a scalar into a vector (Ld must be set and all elements
8943    // but 1 are undef) and that operation is not obviously supported by
8944    // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8945    // That's better than general shuffling and may eliminate a load to GPR and
8946    // move from scalar to vector register.
8947    if (!Ld || NumElts - NumUndefElts != 1)
8948      return SDValue();
8949    unsigned ScalarSize = Ld.getValueSizeInBits();
8950    if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8951      return SDValue();
8952  }
8953
8954  bool ConstSplatVal =
8955      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8956  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8957
8958  // TODO: Handle broadcasts of non-constant sequences.
8959
8960  // Make sure that all of the users of a non-constant load are from the
8961  // BUILD_VECTOR node.
8962  // FIXME: Is the use count needed for non-constant, non-load case?
8963  if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
8964    return SDValue();
8965
8966  unsigned ScalarSize = Ld.getValueSizeInBits();
8967  bool IsGE256 = (VT.getSizeInBits() >= 256);
8968
8969  // When optimizing for size, generate up to 5 extra bytes for a broadcast
8970  // instruction to save 8 or more bytes of constant pool data.
8971  // TODO: If multiple splats are generated to load the same constant,
8972  // it may be detrimental to overall size. There needs to be a way to detect
8973  // that condition to know if this is truly a size win.
8974  bool OptForSize = DAG.shouldOptForSize();
8975
8976  // Handle broadcasting a single constant scalar from the constant pool
8977  // into a vector.
8978  // On Sandybridge (no AVX2), it is still better to load a constant vector
8979  // from the constant pool and not to broadcast it from a scalar.
8980  // But override that restriction when optimizing for size.
8981  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
8982  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
8983    EVT CVT = Ld.getValueType();
8984    assert(!CVT.isVector() && "Must not broadcast a vector type");
8985
8986    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
8987    // For size optimization, also splat v2f64 and v2i64, and for size opt
8988    // with AVX2, also splat i8 and i16.
8989    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
8990    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8991        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
8992      const Constant *C = nullptr;
8993      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
8994        C = CI->getConstantIntValue();
8995      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
8996        C = CF->getConstantFPValue();
8997
8998      assert(C && "Invalid constant type");
8999
9000      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9001      SDValue CP =
9002          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9003      Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9004
9005      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9006      SDValue Ops[] = {DAG.getEntryNode(), CP};
9007      MachinePointerInfo MPI =
9008          MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9009      return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9010                                     MPI, Alignment, MachineMemOperand::MOLoad);
9011    }
9012  }
9013
9014  // Handle AVX2 in-register broadcasts.
9015  if (!IsLoad && Subtarget.hasInt256() &&
9016      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9017    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9018
9019  // The scalar source must be a normal load.
9020  if (!IsLoad)
9021    return SDValue();
9022
9023  // Make sure the non-chain result is only used by this build vector.
9024  if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9025    return SDValue();
9026
9027  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9028      (Subtarget.hasVLX() && ScalarSize == 64)) {
9029    auto *LN = cast<LoadSDNode>(Ld);
9030    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9031    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9032    SDValue BCast =
9033        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9034                                LN->getMemoryVT(), LN->getMemOperand());
9035    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9036    return BCast;
9037  }
9038
  // The integer check is needed for the 64-bit into 128-bit case so that this
  // doesn't match f64, since there is no xmm form of vbroadcastsd.
9041  if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9042      (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9043    auto *LN = cast<LoadSDNode>(Ld);
9044    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9045    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9046    SDValue BCast =
9047        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9048                                LN->getMemoryVT(), LN->getMemOperand());
9049    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9050    return BCast;
9051  }
9052
9053  // Unsupported broadcast.
9054  return SDValue();
9055}
9056
9057/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9058/// underlying vector and index.
9059///
9060/// Modifies \p ExtractedFromVec to the real vector and returns the real
9061/// index.
9062static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9063                                         SDValue ExtIdx) {
9064  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9065  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9066    return Idx;
9067
9068  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9069  // lowered this:
9070  //   (extract_vector_elt (v8f32 %1), Constant<6>)
9071  // to:
9072  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
9073  //                           (extract_subvector (v8f32 %0), Constant<4>),
9074  //                           undef)
9075  //                       Constant<0>)
9076  // In this case the vector is the extract_subvector expression and the index
9077  // is 2, as specified by the shuffle.
9078  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9079  SDValue ShuffleVec = SVOp->getOperand(0);
9080  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9081  assert(ShuffleVecVT.getVectorElementType() ==
9082         ExtractedFromVec.getSimpleValueType().getVectorElementType());
9083
9084  int ShuffleIdx = SVOp->getMaskElt(Idx);
9085  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9086    ExtractedFromVec = ShuffleVec;
9087    return ShuffleIdx;
9088  }
9089  return Idx;
9090}
9091
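/// Try to lower a BUILD_VECTOR composed mostly of extract_vector_elt nodes
/// taken from at most two source vectors into a vector_shuffle of those
/// sources, with the few remaining operands inserted afterwards via
/// insert_vector_elt. For example, (build_vector (extract_vector_elt A, 0),
/// undef, (extract_vector_elt A, 2), x) can become
/// (insert_vector_elt (vector_shuffle<0,u,2,u> A, undef), x, 3).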
9092static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9093  MVT VT = Op.getSimpleValueType();
9094
9095  // Skip if insert_vec_elt is not supported.
9096  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9097  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9098    return SDValue();
9099
9100  SDLoc DL(Op);
9101  unsigned NumElems = Op.getNumOperands();
9102
9103  SDValue VecIn1;
9104  SDValue VecIn2;
9105  SmallVector<unsigned, 4> InsertIndices;
9106  SmallVector<int, 8> Mask(NumElems, -1);
9107
9108  for (unsigned i = 0; i != NumElems; ++i) {
9109    unsigned Opc = Op.getOperand(i).getOpcode();
9110
9111    if (Opc == ISD::UNDEF)
9112      continue;
9113
9114    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than 1 element needs inserting.
9116      if (InsertIndices.size() > 1)
9117        return SDValue();
9118
9119      InsertIndices.push_back(i);
9120      continue;
9121    }
9122
9123    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9124    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9125
9126    // Quit if non-constant index.
9127    if (!isa<ConstantSDNode>(ExtIdx))
9128      return SDValue();
9129    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9130
9131    // Quit if extracted from vector of different type.
9132    if (ExtractedFromVec.getValueType() != VT)
9133      return SDValue();
9134
9135    if (!VecIn1.getNode())
9136      VecIn1 = ExtractedFromVec;
9137    else if (VecIn1 != ExtractedFromVec) {
9138      if (!VecIn2.getNode())
9139        VecIn2 = ExtractedFromVec;
9140      else if (VecIn2 != ExtractedFromVec)
        // Quit if there are more than 2 vectors to shuffle.
9142        return SDValue();
9143    }
9144
9145    if (ExtractedFromVec == VecIn1)
9146      Mask[i] = Idx;
9147    else if (ExtractedFromVec == VecIn2)
9148      Mask[i] = Idx + NumElems;
9149  }
9150
9151  if (!VecIn1.getNode())
9152    return SDValue();
9153
9154  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9155  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9156
9157  for (unsigned Idx : InsertIndices)
9158    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9159                     DAG.getIntPtrConstant(Idx, DL));
9160
9161  return NV;
9162}
9163
// Lower a BUILD_VECTOR operation for vXi1 mask types such as v8i1 and v16i1.
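// For example, a constant v8i1 build_vector <1,0,1,1,0,0,0,0> is materialized
// by bitcasting the i8 immediate 0x0D to v8i1; any non-constant elements are
// then inserted one by one.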
9165static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9166                                     const X86Subtarget &Subtarget) {
9168  MVT VT = Op.getSimpleValueType();
9169  assert((VT.getVectorElementType() == MVT::i1) &&
9170         "Unexpected type in LowerBUILD_VECTORvXi1!");
9171
9172  SDLoc dl(Op);
9173  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9174      ISD::isBuildVectorAllOnes(Op.getNode()))
9175    return Op;
9176
9177  uint64_t Immediate = 0;
9178  SmallVector<unsigned, 16> NonConstIdx;
9179  bool IsSplat = true;
9180  bool HasConstElts = false;
9181  int SplatIdx = -1;
9182  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9183    SDValue In = Op.getOperand(idx);
9184    if (In.isUndef())
9185      continue;
9186    if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9187      Immediate |= (InC->getZExtValue() & 0x1) << idx;
9188      HasConstElts = true;
9189    } else {
9190      NonConstIdx.push_back(idx);
9191    }
9192    if (SplatIdx < 0)
9193      SplatIdx = idx;
9194    else if (In != Op.getOperand(SplatIdx))
9195      IsSplat = false;
9196  }
9197
  // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
9199  if (IsSplat) {
9200    // The build_vector allows the scalar element to be larger than the vector
9201    // element type. We need to mask it to use as a condition unless we know
9202    // the upper bits are zero.
9203    // FIXME: Use computeKnownBits instead of checking specific opcode?
9204    SDValue Cond = Op.getOperand(SplatIdx);
9205    assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9206    if (Cond.getOpcode() != ISD::SETCC)
9207      Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9208                         DAG.getConstant(1, dl, MVT::i8));
9209
9210    // Perform the select in the scalar domain so we can use cmov.
9211    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9212      SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9213                                     DAG.getAllOnesConstant(dl, MVT::i32),
9214                                     DAG.getConstant(0, dl, MVT::i32));
9215      Select = DAG.getBitcast(MVT::v32i1, Select);
9216      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9217    } else {
9218      MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9219      SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9220                                     DAG.getAllOnesConstant(dl, ImmVT),
9221                                     DAG.getConstant(0, dl, ImmVT));
9222      MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9223      Select = DAG.getBitcast(VecVT, Select);
9224      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9225                         DAG.getIntPtrConstant(0, dl));
9226    }
9227  }
9228
  // Insert the non-constant elements one by one.
9230  SDValue DstVec;
9231  if (HasConstElts) {
9232    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9233      SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9234      SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9235      ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9236      ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9237      DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9238    } else {
9239      MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9240      SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9241      MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9242      DstVec = DAG.getBitcast(VecVT, Imm);
9243      DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9244                           DAG.getIntPtrConstant(0, dl));
9245    }
9246  } else
9247    DstVec = DAG.getUNDEF(VT);
9248
9249  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9250    unsigned InsertIdx = NonConstIdx[i];
9251    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9252                         Op.getOperand(InsertIdx),
9253                         DAG.getIntPtrConstant(InsertIdx, dl));
9254  }
9255  return DstVec;
9256}
9257
9258LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
9259  switch (Opcode) {
9260  case X86ISD::PACKSS:
9261  case X86ISD::PACKUS:
9262  case X86ISD::FHADD:
9263  case X86ISD::FHSUB:
9264  case X86ISD::HADD:
9265  case X86ISD::HSUB:
9266    return true;
9267  }
9268  return false;
9269}
9270
9271/// This is a helper function of LowerToHorizontalOp().
/// This function checks whether the input build_vector \p N implements a
/// 128-bit partial horizontal operation on a 256-bit vector, but that
/// operation may not match the layout of an x86 256-bit horizontal instruction.
9275/// In other words, if this returns true, then some extraction/insertion will
9276/// be required to produce a valid horizontal instruction.
9277///
9278/// Parameter \p Opcode defines the kind of horizontal operation to match.
9279/// For example, if \p Opcode is equal to ISD::ADD, then this function
9280/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9281/// is equal to ISD::SUB, then this function checks if this is a horizontal
9282/// arithmetic sub.
9283///
9284/// This function only analyzes elements of \p N whose indices are
9285/// in range [BaseIdx, LastIdx).
9286///
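/// For example, with \p Opcode == ISD::ADD, \p BaseIdx == 0 and
/// \p LastIdx == 4, the sequence of elements
///   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
///   (add (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
///   (add (extract_vector_elt B, 2), (extract_vector_elt B, 3))
/// is matched, and A and B are returned in \p V0 and \p V1.
///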
9287/// TODO: This function was originally used to match both real and fake partial
9288/// horizontal operations, but the index-matching logic is incorrect for that.
9289/// See the corrected implementation in isHopBuildVector(). Can we reduce this
9290/// code because it is only used for partial h-op matching now?
9291static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9292                                  SelectionDAG &DAG,
9293                                  unsigned BaseIdx, unsigned LastIdx,
9294                                  SDValue &V0, SDValue &V1) {
9295  EVT VT = N->getValueType(0);
9296  assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9297  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9298  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9299         "Invalid Vector in input!");
9300
9301  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9302  bool CanFold = true;
9303  unsigned ExpectedVExtractIdx = BaseIdx;
9304  unsigned NumElts = LastIdx - BaseIdx;
9305  V0 = DAG.getUNDEF(VT);
9306  V1 = DAG.getUNDEF(VT);
9307
9308  // Check if N implements a horizontal binop.
9309  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9310    SDValue Op = N->getOperand(i + BaseIdx);
9311
9312    // Skip UNDEFs.
9313    if (Op->isUndef()) {
9314      // Update the expected vector extract index.
9315      if (i * 2 == NumElts)
9316        ExpectedVExtractIdx = BaseIdx;
9317      ExpectedVExtractIdx += 2;
9318      continue;
9319    }
9320
9321    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9322
9323    if (!CanFold)
9324      break;
9325
9326    SDValue Op0 = Op.getOperand(0);
9327    SDValue Op1 = Op.getOperand(1);
9328
9329    // Try to match the following pattern:
9330    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9331    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9332        Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9333        Op0.getOperand(0) == Op1.getOperand(0) &&
9334        isa<ConstantSDNode>(Op0.getOperand(1)) &&
9335        isa<ConstantSDNode>(Op1.getOperand(1)));
9336    if (!CanFold)
9337      break;
9338
9339    unsigned I0 = Op0.getConstantOperandVal(1);
9340    unsigned I1 = Op1.getConstantOperandVal(1);
9341
9342    if (i * 2 < NumElts) {
9343      if (V0.isUndef()) {
9344        V0 = Op0.getOperand(0);
9345        if (V0.getValueType() != VT)
9346          return false;
9347      }
9348    } else {
9349      if (V1.isUndef()) {
9350        V1 = Op0.getOperand(0);
9351        if (V1.getValueType() != VT)
9352          return false;
9353      }
9354      if (i * 2 == NumElts)
9355        ExpectedVExtractIdx = BaseIdx;
9356    }
9357
9358    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9359    if (I0 == ExpectedVExtractIdx)
9360      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9361    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9362      // Try to match the following dag sequence:
9363      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9364      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9365    } else
9366      CanFold = false;
9367
9368    ExpectedVExtractIdx += 2;
9369  }
9370
9371  return CanFold;
9372}
9373
9374/// Emit a sequence of two 128-bit horizontal add/sub followed by
9375/// a concat_vector.
9376///
9377/// This is a helper function of LowerToHorizontalOp().
9378/// This function expects two 256-bit vectors called V0 and V1.
9379/// At first, each vector is split into two separate 128-bit vectors.
9380/// Then, the resulting 128-bit vectors are used to implement two
9381/// horizontal binary operations.
9382///
9383/// The kind of horizontal binary operation is defined by \p X86Opcode.
9384///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed to the two
/// new horizontal binops.
/// When Mode is set, the first horizontal binop DAG node takes as input the
/// lower 128 bits of V0 and the upper 128 bits of V0. The second horizontal
/// binop DAG node takes as input the lower 128 bits of V1 and the upper
/// 128 bits of V1.
9391///   Example:
9392///     HADD V0_LO, V0_HI
9393///     HADD V1_LO, V1_HI
9394///
/// Otherwise, the first horizontal binop DAG node takes as input the lower
/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
/// binop DAG node takes the upper 128 bits of V0 and the upper 128 bits of V1.
9398///   Example:
9399///     HADD V0_LO, V1_LO
9400///     HADD V0_HI, V1_HI
9401///
9402/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9403/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9404/// the upper 128-bits of the result.
9405static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9406                                     const SDLoc &DL, SelectionDAG &DAG,
9407                                     unsigned X86Opcode, bool Mode,
9408                                     bool isUndefLO, bool isUndefHI) {
9409  MVT VT = V0.getSimpleValueType();
9410  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9411         "Invalid nodes in input!");
9412
9413  unsigned NumElts = VT.getVectorNumElements();
9414  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9415  SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9416  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9417  SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9418  MVT NewVT = V0_LO.getSimpleValueType();
9419
9420  SDValue LO = DAG.getUNDEF(NewVT);
9421  SDValue HI = DAG.getUNDEF(NewVT);
9422
9423  if (Mode) {
9424    // Don't emit a horizontal binop if the result is expected to be UNDEF.
9425    if (!isUndefLO && !V0->isUndef())
9426      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9427    if (!isUndefHI && !V1->isUndef())
9428      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9429  } else {
9430    // Don't emit a horizontal binop if the result is expected to be UNDEF.
9431    if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9432      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9433
9434    if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9435      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9436  }
9437
9438  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9439}
9440
/// Returns true iff \p BV builds a vector whose result is equivalent to
/// the result of an ADDSUB/SUBADD operation.
9443/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9444/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9445/// \p Opnd0 and \p Opnd1.
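///
/// For example, the v4f32 build_vector
///   ((fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0)),
///    (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1)),
///    (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2)),
///    (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3)))
/// is recognized as ADDSUB with \p Opnd0 == A, \p Opnd1 == B and
/// \p IsSubAdd == false.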
9446static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9447                             const X86Subtarget &Subtarget, SelectionDAG &DAG,
9448                             SDValue &Opnd0, SDValue &Opnd1,
9449                             unsigned &NumExtracts,
9450                             bool &IsSubAdd) {
9452  MVT VT = BV->getSimpleValueType(0);
9453  if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9454    return false;
9455
9456  unsigned NumElts = VT.getVectorNumElements();
9457  SDValue InVec0 = DAG.getUNDEF(VT);
9458  SDValue InVec1 = DAG.getUNDEF(VT);
9459
9460  NumExtracts = 0;
9461
9462  // Odd-numbered elements in the input build vector are obtained from
9463  // adding/subtracting two integer/float elements.
9464  // Even-numbered elements in the input build vector are obtained from
9465  // subtracting/adding two integer/float elements.
9466  unsigned Opc[2] = {0, 0};
9467  for (unsigned i = 0, e = NumElts; i != e; ++i) {
9468    SDValue Op = BV->getOperand(i);
9469
9470    // Skip 'undef' values.
9471    unsigned Opcode = Op.getOpcode();
9472    if (Opcode == ISD::UNDEF)
9473      continue;
9474
9475    // Early exit if we found an unexpected opcode.
9476    if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9477      return false;
9478
9479    SDValue Op0 = Op.getOperand(0);
9480    SDValue Op1 = Op.getOperand(1);
9481
9482    // Try to match the following pattern:
9483    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9484    // Early exit if we cannot match that sequence.
9485    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9486        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9487        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9488        Op0.getOperand(1) != Op1.getOperand(1))
9489      return false;
9490
9491    unsigned I0 = Op0.getConstantOperandVal(1);
9492    if (I0 != i)
9493      return false;
9494
    // We found a valid add/sub node; make sure it is the same opcode as the
    // previous elements for this parity.
9497    if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9498      return false;
9499    Opc[i % 2] = Opcode;
9500
9501    // Update InVec0 and InVec1.
9502    if (InVec0.isUndef()) {
9503      InVec0 = Op0.getOperand(0);
9504      if (InVec0.getSimpleValueType() != VT)
9505        return false;
9506    }
9507    if (InVec1.isUndef()) {
9508      InVec1 = Op1.getOperand(0);
9509      if (InVec1.getSimpleValueType() != VT)
9510        return false;
9511    }
9512
    // Make sure that the input operands of each add/sub node always come
    // from the same pair of vectors.
9515    if (InVec0 != Op0.getOperand(0)) {
9516      if (Opcode == ISD::FSUB)
9517        return false;
9518
9519      // FADD is commutable. Try to commute the operands
9520      // and then test again.
9521      std::swap(Op0, Op1);
9522      if (InVec0 != Op0.getOperand(0))
9523        return false;
9524    }
9525
9526    if (InVec1 != Op1.getOperand(0))
9527      return false;
9528
9529    // Increment the number of extractions done.
9530    ++NumExtracts;
9531  }
9532
9533  // Ensure we have found an opcode for both parities and that they are
9534  // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
9535  // inputs are undef.
9536  if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
9537      InVec0.isUndef() || InVec1.isUndef())
9538    return false;
9539
9540  IsSubAdd = Opc[0] == ISD::FADD;
9541
9542  Opnd0 = InVec0;
9543  Opnd1 = InVec1;
9544  return true;
9545}
9546
/// Returns true if it is possible to fold a MUL and an idiom that has already
/// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1
/// and \p Opnd2.
9551///
9552/// Prior to calling this function it should be known that there is some
9553/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9554/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9555/// before replacement of such SDNode with ADDSUB operation. Thus the number
9556/// of \p Opnd0 uses is expected to be equal to 2.
9557/// For example, this function may be called for the following IR:
9558///    %AB = fmul fast <2 x double> %A, %B
9559///    %Sub = fsub fast <2 x double> %AB, %C
9560///    %Add = fadd fast <2 x double> %AB, %C
9561///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9562///                            <2 x i32> <i32 0, i32 3>
9563/// There is a def for %Addsub here, which potentially can be replaced by
9564/// X86ISD::ADDSUB operation:
9565///    %Addsub = X86ISD::ADDSUB %AB, %C
9566/// and such ADDSUB can further be replaced with FMADDSUB:
9567///    %Addsub = FMADDSUB %A, %B, %C.
9568///
9569/// The main reason why this method is called before the replacement of the
9570/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9571/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9572/// FMADDSUB is.
9573static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9574                                 SelectionDAG &DAG,
9575                                 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9576                                 unsigned ExpectedUses) {
9577  if (Opnd0.getOpcode() != ISD::FMUL ||
9578      !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9579    return false;
9580
9581  // FIXME: These checks must match the similar ones in
9582  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9583  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9584  // or MUL + ADDSUB to FMADDSUB.
9585  const TargetOptions &Options = DAG.getTarget().Options;
9586  bool AllowFusion =
9587      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9588  if (!AllowFusion)
9589    return false;
9590
9591  Opnd2 = Opnd1;
9592  Opnd1 = Opnd0.getOperand(1);
9593  Opnd0 = Opnd0.getOperand(0);
9594
9595  return true;
9596}
9597
/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
/// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node.
9601static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9602                                       const X86Subtarget &Subtarget,
9603                                       SelectionDAG &DAG) {
9604  SDValue Opnd0, Opnd1;
9605  unsigned NumExtracts;
9606  bool IsSubAdd;
9607  if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9608                        IsSubAdd))
9609    return SDValue();
9610
9611  MVT VT = BV->getSimpleValueType(0);
9612  SDLoc DL(BV);
9613
9614  // Try to generate X86ISD::FMADDSUB node here.
9615  SDValue Opnd2;
9616  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9617    unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9618    return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9619  }
9620
9621  // We only support ADDSUB.
9622  if (IsSubAdd)
9623    return SDValue();
9624
9625  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9626  // the ADDSUB idiom has been successfully recognized. There are no known
9627  // X86 targets with 512-bit ADDSUB instructions!
9628  // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9629  // recognition.
9630  if (VT.is512BitVector())
9631    return SDValue();
9632
9633  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9634}
9635
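/// Match a build_vector whose elements are horizontal add/sub results of
/// pairs of adjacent elements taken from (at most) two source vectors. On
/// success, the equivalent X86 horizontal opcode and the two sources are
/// returned in \p HOpcode, \p V0 and \p V1. For example, the v4i32
/// build_vector
///   ((add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///    (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
///    (add (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
///    (add (extract_vector_elt B, 2), (extract_vector_elt B, 3)))
/// matches with \p HOpcode == X86ISD::HADD, \p V0 == A and \p V1 == B.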
9636static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9637                             unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9638  // Initialize outputs to known values.
9639  MVT VT = BV->getSimpleValueType(0);
9640  HOpcode = ISD::DELETED_NODE;
9641  V0 = DAG.getUNDEF(VT);
9642  V1 = DAG.getUNDEF(VT);
9643
9644  // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9645  // half of the result is calculated independently from the 128-bit halves of
9646  // the inputs, so that makes the index-checking logic below more complicated.
9647  unsigned NumElts = VT.getVectorNumElements();
9648  unsigned GenericOpcode = ISD::DELETED_NODE;
9649  unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9650  unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9651  unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9652  for (unsigned i = 0; i != Num128BitChunks; ++i) {
9653    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9654      // Ignore undef elements.
9655      SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9656      if (Op.isUndef())
9657        continue;
9658
9659      // If there's an opcode mismatch, we're done.
9660      if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9661        return false;
9662
9663      // Initialize horizontal opcode.
9664      if (HOpcode == ISD::DELETED_NODE) {
9665        GenericOpcode = Op.getOpcode();
9666        switch (GenericOpcode) {
9667        case ISD::ADD: HOpcode = X86ISD::HADD; break;
9668        case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9669        case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9670        case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9671        default: return false;
9672        }
9673      }
9674
9675      SDValue Op0 = Op.getOperand(0);
9676      SDValue Op1 = Op.getOperand(1);
9677      if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9678          Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9679          Op0.getOperand(0) != Op1.getOperand(0) ||
9680          !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9681          !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9682        return false;
9683
9684      // The source vector is chosen based on which 64-bit half of the
9685      // destination vector is being calculated.
9686      if (j < NumEltsIn64Bits) {
9687        if (V0.isUndef())
9688          V0 = Op0.getOperand(0);
9689      } else {
9690        if (V1.isUndef())
9691          V1 = Op0.getOperand(0);
9692      }
9693
9694      SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9695      if (SourceVec != Op0.getOperand(0))
9696        return false;
9697
9698      // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9699      unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9700      unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9701      unsigned ExpectedIndex = i * NumEltsIn128Bits +
9702                               (j % NumEltsIn64Bits) * 2;
9703      if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9704        continue;
9705
9706      // If this is not a commutative op, this does not match.
9707      if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9708        return false;
9709
9710      // Addition is commutative, so try swapping the extract indexes.
9711      // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9712      if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9713        continue;
9714
9715      // Extract indexes do not match horizontal requirement.
9716      return false;
9717    }
9718  }
9719  // We matched. Opcode and operands are returned by reference as arguments.
9720  return true;
9721}
9722
9723static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9724                                    SelectionDAG &DAG, unsigned HOpcode,
9725                                    SDValue V0, SDValue V1) {
9726  // If either input vector is not the same size as the build vector,
9727  // extract/insert the low bits to the correct size.
9728  // This is free (examples: zmm --> xmm, xmm --> ymm).
9729  MVT VT = BV->getSimpleValueType(0);
9730  unsigned Width = VT.getSizeInBits();
9731  if (V0.getValueSizeInBits() > Width)
9732    V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9733  else if (V0.getValueSizeInBits() < Width)
9734    V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9735
9736  if (V1.getValueSizeInBits() > Width)
9737    V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9738  else if (V1.getValueSizeInBits() < Width)
9739    V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9740
9741  unsigned NumElts = VT.getVectorNumElements();
9742  APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9743  for (unsigned i = 0; i != NumElts; ++i)
9744    if (BV->getOperand(i).isUndef())
9745      DemandedElts.clearBit(i);
9746
9747  // If we don't need the upper xmm, then perform as a xmm hop.
9748  unsigned HalfNumElts = NumElts / 2;
9749  if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9750    MVT HalfVT = VT.getHalfNumVectorElementsVT();
9751    V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
9752    V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
9753    SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
9754    return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
9755  }
9756
9757  return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
9758}
9759
9760/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
9761static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
9762                                   const X86Subtarget &Subtarget,
9763                                   SelectionDAG &DAG) {
9764  // We need at least 2 non-undef elements to make this worthwhile by default.
9765  unsigned NumNonUndefs =
9766      count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9767  if (NumNonUndefs < 2)
9768    return SDValue();
9769
9770  // There are 4 sets of horizontal math operations distinguished by type:
9771  // int/FP at 128-bit/256-bit. Each type was introduced with a different
9772  // subtarget feature. Try to match those "native" patterns first.
9773  MVT VT = BV->getSimpleValueType(0);
9774  if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9775      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9776      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9777      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9778    unsigned HOpcode;
9779    SDValue V0, V1;
9780    if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9781      return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
9782  }
9783
9784  // Try harder to match 256-bit ops by using extract/concat.
9785  if (!Subtarget.hasAVX() || !VT.is256BitVector())
9786    return SDValue();
9787
  // Count the number of UNDEF operands in the input build_vector.
9789  unsigned NumElts = VT.getVectorNumElements();
9790  unsigned Half = NumElts / 2;
9791  unsigned NumUndefsLO = 0;
9792  unsigned NumUndefsHI = 0;
9793  for (unsigned i = 0, e = Half; i != e; ++i)
9794    if (BV->getOperand(i)->isUndef())
9795      NumUndefsLO++;
9796
9797  for (unsigned i = Half, e = NumElts; i != e; ++i)
9798    if (BV->getOperand(i)->isUndef())
9799      NumUndefsHI++;
9800
9801  SDLoc DL(BV);
9802  SDValue InVec0, InVec1;
9803  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9804    SDValue InVec2, InVec3;
9805    unsigned X86Opcode;
9806    bool CanFold = true;
9807
9808    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
9809        isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
9810                              InVec3) &&
9811        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9812        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9813      X86Opcode = X86ISD::HADD;
9814    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
9815                                   InVec1) &&
9816             isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
9817                                   InVec3) &&
9818             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9819             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9820      X86Opcode = X86ISD::HSUB;
9821    else
9822      CanFold = false;
9823
9824    if (CanFold) {
9825      // Do not try to expand this build_vector into a pair of horizontal
9826      // add/sub if we can emit a pair of scalar add/sub.
9827      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9828        return SDValue();
9829
9830      // Convert this build_vector into a pair of horizontal binops followed by
9831      // a concat vector. We must adjust the outputs from the partial horizontal
9832      // matching calls above to account for undefined vector halves.
9833      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9834      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9835      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
9836      bool isUndefLO = NumUndefsLO == Half;
9837      bool isUndefHI = NumUndefsHI == Half;
9838      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9839                                   isUndefHI);
9840    }
9841  }
9842
9843  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9844      VT == MVT::v16i16) {
9845    unsigned X86Opcode;
9846    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
9847      X86Opcode = X86ISD::HADD;
9848    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
9849                                   InVec1))
9850      X86Opcode = X86ISD::HSUB;
9851    else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
9852                                   InVec1))
9853      X86Opcode = X86ISD::FHADD;
9854    else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
9855                                   InVec1))
9856      X86Opcode = X86ISD::FHSUB;
9857    else
9858      return SDValue();
9859
9860    // Don't try to expand this build_vector into a pair of horizontal add/sub
9861    // if we can simply emit a pair of scalar add/sub.
9862    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9863      return SDValue();
9864
9865    // Convert this build_vector into two horizontal add/sub followed by
9866    // a concat vector.
9867    bool isUndefLO = NumUndefsLO == Half;
9868    bool isUndefHI = NumUndefsHI == Half;
9869    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9870                                 isUndefLO, isUndefHI);
9871  }
9872
9873  return SDValue();
9874}
9875
9876static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
9877                          SelectionDAG &DAG);
9878
9879/// If a BUILD_VECTOR's source elements all apply the same bit operation and
9880/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
9881/// just apply the bit to the vectors.
/// NOTE: It's not in our interest to start making a general-purpose vectorizer
/// from this, but enough scalar bit operations are created by the later
/// legalization + scalarization stages to need basic support.
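///
/// For example, the v4i32 build_vector
///   ((and X0, 1), (and X1, 2), (and X2, 4), (and X3, 8))
/// can be lowered to
///   (and (build_vector X0, X1, X2, X3), (build_vector 1, 2, 4, 8)).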
9885static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
9886                                       const X86Subtarget &Subtarget,
9887                                       SelectionDAG &DAG) {
9888  SDLoc DL(Op);
9889  MVT VT = Op->getSimpleValueType(0);
9890  unsigned NumElems = VT.getVectorNumElements();
9891  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9892
9893  // Check that all elements have the same opcode.
9894  // TODO: Should we allow UNDEFS and if so how many?
9895  unsigned Opcode = Op->getOperand(0).getOpcode();
9896  for (unsigned i = 1; i < NumElems; ++i)
9897    if (Opcode != Op->getOperand(i).getOpcode())
9898      return SDValue();
9899
9900  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
9901  bool IsShift = false;
9902  switch (Opcode) {
9903  default:
9904    return SDValue();
9905  case ISD::SHL:
9906  case ISD::SRL:
9907  case ISD::SRA:
9908    IsShift = true;
9909    break;
9910  case ISD::AND:
9911  case ISD::XOR:
9912  case ISD::OR:
9913    // Don't do this if the buildvector is a splat - we'd replace one
9914    // constant with an entire vector.
9915    if (Op->getSplatValue())
9916      return SDValue();
9917    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9918      return SDValue();
9919    break;
9920  }
9921
9922  SmallVector<SDValue, 4> LHSElts, RHSElts;
9923  for (SDValue Elt : Op->ops()) {
9924    SDValue LHS = Elt.getOperand(0);
9925    SDValue RHS = Elt.getOperand(1);
9926
9927    // We expect the canonicalized RHS operand to be the constant.
9928    if (!isa<ConstantSDNode>(RHS))
9929      return SDValue();
9930
9931    // Extend shift amounts.
9932    if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
9933      if (!IsShift)
9934        return SDValue();
9935      RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
9936    }
9937
9938    LHSElts.push_back(LHS);
9939    RHSElts.push_back(RHS);
9940  }
9941
9942  // Limit to shifts by uniform immediates.
9943  // TODO: Only accept vXi8/vXi64 special cases?
9944  // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9945  if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9946    return SDValue();
9947
9948  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
9949  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
9950  SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
9951
9952  if (!IsShift)
9953    return Res;
9954
9955  // Immediately lower the shift to ensure the constant build vector doesn't
9956  // get converted to a constant pool before the shift is lowered.
9957  return LowerShift(Res, Subtarget, DAG);
9958}
9959
9960/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9961/// functionality to do this, so it's all zeros, all ones, or some derivation
9962/// that is cheap to calculate.
9963static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
9964                                         const X86Subtarget &Subtarget) {
9965  SDLoc DL(Op);
9966  MVT VT = Op.getSimpleValueType();
9967
9968  // Vectors containing all zeros can be matched by pxor and xorps.
9969  if (ISD::isBuildVectorAllZeros(Op.getNode()))
9970    return Op;
9971
9972  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9973  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9974  // vpcmpeqd on 256-bit vectors.
9975  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
9976    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9977      return Op;
9978
9979    return getOnesVector(VT, DAG, DL);
9980  }
9981
9982  return SDValue();
9983}
9984
9985/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
9986/// from a vector of source values and a vector of extraction indices.
9987/// The vectors might be manipulated to match the type of the permute op.
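/// For example, a v16i8 source vector and a v16i8 vector of byte indices can
/// be lowered on SSSE3 targets to a single (PSHUFB SrcVec, IndicesVec) node.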
9988static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
9989                                     SDLoc &DL, SelectionDAG &DAG,
9990                                     const X86Subtarget &Subtarget) {
9991  MVT ShuffleVT = VT;
9992  EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9993  unsigned NumElts = VT.getVectorNumElements();
9994  unsigned SizeInBits = VT.getSizeInBits();
9995
9996  // Adjust IndicesVec to match VT size.
9997  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
9998         "Illegal variable permute mask size");
9999  if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10000    // Narrow/widen the indices vector to the correct size.
10001    if (IndicesVec.getValueSizeInBits() > SizeInBits)
10002      IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10003                                    NumElts * VT.getScalarSizeInBits());
10004    else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10005      IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10006                                  SDLoc(IndicesVec), SizeInBits);
10007    // Zero-extend the index elements within the vector.
10008    if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10009      IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10010                               IndicesVT, IndicesVec);
10011  }
10012  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10013
  // Handle a SrcVec whose size doesn't match the VT size.
10015  if (SrcVec.getValueSizeInBits() != SizeInBits) {
10016    if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10017      // Handle larger SrcVec by treating it as a larger permute.
10018      unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10019      VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10020      IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10021      IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10022                                  Subtarget, DAG, SDLoc(IndicesVec));
10023      SDValue NewSrcVec =
10024          createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10025      if (NewSrcVec)
10026        return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10027      return SDValue();
10028    } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10029      // Widen smaller SrcVec to match VT.
10030      SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10031    } else
10032      return SDValue();
10033  }
10034
10035  auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10036    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10037    EVT SrcVT = Idx.getValueType();
10038    unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10039    uint64_t IndexScale = 0;
10040    uint64_t IndexOffset = 0;
10041
10042    // If we're scaling a smaller permute op, then we need to repeat the
10043    // indices, scaling and offsetting them as well.
10044    // e.g. v4i32 -> v16i8 (Scale = 4)
10045    // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10046    // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10047    for (uint64_t i = 0; i != Scale; ++i) {
10048      IndexScale |= Scale << (i * NumDstBits);
10049      IndexOffset |= i << (i * NumDstBits);
10050    }
10051
10052    Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10053                      DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10054    Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10055                      DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10056    return Idx;
10057  };
10058
10059  unsigned Opcode = 0;
10060  switch (VT.SimpleTy) {
10061  default:
10062    break;
10063  case MVT::v16i8:
10064    if (Subtarget.hasSSSE3())
10065      Opcode = X86ISD::PSHUFB;
10066    break;
10067  case MVT::v8i16:
10068    if (Subtarget.hasVLX() && Subtarget.hasBWI())
10069      Opcode = X86ISD::VPERMV;
10070    else if (Subtarget.hasSSSE3()) {
10071      Opcode = X86ISD::PSHUFB;
10072      ShuffleVT = MVT::v16i8;
10073    }
10074    break;
10075  case MVT::v4f32:
10076  case MVT::v4i32:
10077    if (Subtarget.hasAVX()) {
10078      Opcode = X86ISD::VPERMILPV;
10079      ShuffleVT = MVT::v4f32;
10080    } else if (Subtarget.hasSSSE3()) {
10081      Opcode = X86ISD::PSHUFB;
10082      ShuffleVT = MVT::v16i8;
10083    }
10084    break;
10085  case MVT::v2f64:
10086  case MVT::v2i64:
10087    if (Subtarget.hasAVX()) {
10088      // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10089      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10090      Opcode = X86ISD::VPERMILPV;
10091      ShuffleVT = MVT::v2f64;
10092    } else if (Subtarget.hasSSE41()) {
10093      // SSE41 can compare v2i64 - select between indices 0 and 1.
10094      return DAG.getSelectCC(
10095          DL, IndicesVec,
10096          getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10097          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10098          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10099          ISD::CondCode::SETEQ);
10100    }
10101    break;
10102  case MVT::v32i8:
10103    if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10104      Opcode = X86ISD::VPERMV;
10105    else if (Subtarget.hasXOP()) {
10106      SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10107      SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10108      SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10109      SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10110      return DAG.getNode(
10111          ISD::CONCAT_VECTORS, DL, VT,
10112          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10113          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10114    } else if (Subtarget.hasAVX()) {
10115      SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10116      SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10117      SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10118      SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10119      auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10120                              ArrayRef<SDValue> Ops) {
10121        // Permute Lo and Hi and then select based on index range.
        // This works as PSHUFB uses bits[3:0] to permute elements and we
        // don't care about bit[7] as it's just an index vector.
10124        SDValue Idx = Ops[2];
10125        EVT VT = Idx.getValueType();
10126        return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10127                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10128                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10129                               ISD::CondCode::SETGT);
10130      };
10131      SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10132      return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10133                              PSHUFBBuilder);
10134    }
10135    break;
10136  case MVT::v16i16:
10137    if (Subtarget.hasVLX() && Subtarget.hasBWI())
10138      Opcode = X86ISD::VPERMV;
10139    else if (Subtarget.hasAVX()) {
10140      // Scale to v32i8 and perform as v32i8.
10141      IndicesVec = ScaleIndices(IndicesVec, 2);
10142      return DAG.getBitcast(
10143          VT, createVariablePermute(
10144                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10145                  DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10146    }
10147    break;
10148  case MVT::v8f32:
10149  case MVT::v8i32:
10150    if (Subtarget.hasAVX2())
10151      Opcode = X86ISD::VPERMV;
10152    else if (Subtarget.hasAVX()) {
10153      SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10154      SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10155                                          {0, 1, 2, 3, 0, 1, 2, 3});
10156      SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10157                                          {4, 5, 6, 7, 4, 5, 6, 7});
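           // With XOP, VPERMIL2 can index into both LoLo and HiHi in a single
           // instruction, covering the whole 0-7 index range without a select.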
10158      if (Subtarget.hasXOP())
10159        return DAG.getBitcast(
10160            VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10161                            IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10162      // Permute Lo and Hi and then select based on index range.
10163      // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10164      SDValue Res = DAG.getSelectCC(
10165          DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10166          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10167          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10168          ISD::CondCode::SETGT);
10169      return DAG.getBitcast(VT, Res);
10170    }
10171    break;
10172  case MVT::v4i64:
10173  case MVT::v4f64:
10174    if (Subtarget.hasAVX512()) {
10175      if (!Subtarget.hasVLX()) {
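             // Without VLX the 256-bit variable permute isn't available, so
             // widen the source and indices to 512 bits, permute there, and
             // extract the low 256-bit subvector of the result.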
10176        MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10177        SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10178                                SDLoc(SrcVec));
10179        IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10180                                    DAG, SDLoc(IndicesVec));
10181        SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10182                                            DAG, Subtarget);
10183        return extract256BitVector(Res, 0, DAG, DL);
10184      }
10185      Opcode = X86ISD::VPERMV;
10186    } else if (Subtarget.hasAVX()) {
10187      SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10188      SDValue LoLo =
10189          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10190      SDValue HiHi =
10191          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10192      // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10193      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10194      if (Subtarget.hasXOP())
10195        return DAG.getBitcast(
10196            VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10197                            IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10198      // Permute Lo and Hi and then select based on index range.
10199      // This works as VPERMILPD only uses index bit[1] to permute elements.
10200      SDValue Res = DAG.getSelectCC(
10201          DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10202          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10203          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10204          ISD::CondCode::SETGT);
10205      return DAG.getBitcast(VT, Res);
10206    }
10207    break;
10208  case MVT::v64i8:
10209    if (Subtarget.hasVBMI())
10210      Opcode = X86ISD::VPERMV;
10211    break;
10212  case MVT::v32i16:
10213    if (Subtarget.hasBWI())
10214      Opcode = X86ISD::VPERMV;
10215    break;
10216  case MVT::v16f32:
10217  case MVT::v16i32:
10218  case MVT::v8f64:
10219  case MVT::v8i64:
10220    if (Subtarget.hasAVX512())
10221      Opcode = X86ISD::VPERMV;
10222    break;
10223  }
10224  if (!Opcode)
10225    return SDValue();
10226
10227  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
10228         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
10229         "Illegal variable permute shuffle type");
10230
10231  uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10232  if (Scale > 1)
10233    IndicesVec = ScaleIndices(IndicesVec, Scale);
10234
10235  EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10236  IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10237
10238  SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10239  SDValue Res = Opcode == X86ISD::VPERMV
10240                    ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10241                    : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10242  return DAG.getBitcast(VT, Res);
10243}
10244
10245// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10246// reasoned to be a permutation of a vector by indices in a non-constant vector.
10247// (build_vector (extract_elt V, (extract_elt I, 0)),
10248//               (extract_elt V, (extract_elt I, 1)),
10249//                    ...
10250// ->
10251// (vpermv I, V)
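     // e.g. for v8i32 on AVX2 this becomes a single VPERMD (X86ISD::VPERMV)
     // node permuting V by the indices in I.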
10252//
10253// TODO: Handle undefs
10254// TODO: Utilize pshufb and zero mask blending to support more efficient
10255// construction of vectors with constant-0 elements.
10256static SDValue
10257LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10258                                   const X86Subtarget &Subtarget) {
10259  SDValue SrcVec, IndicesVec;
10260  // Check for a match of the permute source vector and permute index elements.
10261  // This is done by checking that the i-th build_vector operand is of the form:
10262  // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10263  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10264    SDValue Op = V.getOperand(Idx);
10265    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10266      return SDValue();
10267
10268    // If this is the first extract encountered in V, set the source vector,
10269    // otherwise verify the extract is from the previously defined source
10270    // vector.
10271    if (!SrcVec)
10272      SrcVec = Op.getOperand(0);
10273    else if (SrcVec != Op.getOperand(0))
10274      return SDValue();
10275    SDValue ExtractedIndex = Op->getOperand(1);
10276    // Peek through extends.
10277    if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10278        ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10279      ExtractedIndex = ExtractedIndex.getOperand(0);
10280    if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10281      return SDValue();
10282
10283    // If this is the first extract from the index vector candidate, set the
10284    // indices vector, otherwise verify the extract is from the previously
10285    // defined indices vector.
10286    if (!IndicesVec)
10287      IndicesVec = ExtractedIndex.getOperand(0);
10288    else if (IndicesVec != ExtractedIndex.getOperand(0))
10289      return SDValue();
10290
10291    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10292    if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10293      return SDValue();
10294  }
10295
10296  SDLoc DL(V);
10297  MVT VT = V.getSimpleValueType();
10298  return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10299}
10300
10301SDValue
10302X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10303  SDLoc dl(Op);
10304
10305  MVT VT = Op.getSimpleValueType();
10306  MVT EltVT = VT.getVectorElementType();
10307  unsigned NumElems = Op.getNumOperands();
10308
10309  // Generate vectors for predicate vectors.
10310  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10311    return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10312
10313  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10314    return VectorConstant;
10315
10316  unsigned EVTBits = EltVT.getSizeInBits();
10317  APInt UndefMask = APInt::getNullValue(NumElems);
10318  APInt ZeroMask = APInt::getNullValue(NumElems);
10319  APInt NonZeroMask = APInt::getNullValue(NumElems);
10320  bool IsAllConstants = true;
10321  SmallSet<SDValue, 8> Values;
10322  unsigned NumConstants = NumElems;
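       // Classify each operand: record which elements are undef, zero and
       // non-zero, collect the distinct values and count the constant elements.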
10323  for (unsigned i = 0; i < NumElems; ++i) {
10324    SDValue Elt = Op.getOperand(i);
10325    if (Elt.isUndef()) {
10326      UndefMask.setBit(i);
10327      continue;
10328    }
10329    Values.insert(Elt);
10330    if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10331      IsAllConstants = false;
10332      NumConstants--;
10333    }
10334    if (X86::isZeroNode(Elt)) {
10335      ZeroMask.setBit(i);
10336    } else {
10337      NonZeroMask.setBit(i);
10338    }
10339  }
10340
10341  // All undef vector. Return an UNDEF. All zero vectors were handled above.
10342  if (NonZeroMask == 0) {
10343    assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
10344    return DAG.getUNDEF(VT);
10345  }
10346
10347  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10348
10349  // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10350  // lowering to a smaller build vector and padding with undef/zero.
10351  if ((VT.is256BitVector() || VT.is512BitVector()) &&
10352      !isFoldableUseOfShuffle(BV)) {
10353    unsigned UpperElems = NumElems / 2;
10354    APInt UndefOrZeroMask = UndefMask | ZeroMask;
10355    unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10356    if (NumUpperUndefsOrZeros >= UpperElems) {
10357      if (VT.is512BitVector() &&
10358          NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10359        UpperElems = NumElems - (NumElems / 4);
10360      bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10361      MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10362      SDValue NewBV =
10363          DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10364      return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10365    }
10366  }
10367
10368  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10369    return AddSub;
10370  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10371    return HorizontalOp;
10372  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10373    return Broadcast;
10374  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10375    return BitOp;
10376
10377  unsigned NumZero = ZeroMask.countPopulation();
10378  unsigned NumNonZero = NonZeroMask.countPopulation();
10379
10380  // If we are inserting one variable into a vector of non-zero constants, try
10381  // to avoid loading each constant element as a scalar. Load the constants as a
10382  // vector and then insert the variable scalar element. If insertion is not
10383  // supported, fall back to a shuffle to get the scalar blended with the
10384  // constants. Insertion into a zero vector is handled as a special-case
10385  // somewhere below here.
10386  if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10387      (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10388       isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10389    // Create an all-constant vector. The variable element in the old
10390    // build vector is replaced by undef in the constant vector. Save the
10391    // variable scalar element and its index for use in the insertelement.
10392    LLVMContext &Context = *DAG.getContext();
10393    Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10394    SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10395    SDValue VarElt;
10396    SDValue InsIndex;
10397    for (unsigned i = 0; i != NumElems; ++i) {
10398      SDValue Elt = Op.getOperand(i);
10399      if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10400        ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10401      else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10402        ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10403      else if (!Elt.isUndef()) {
10404        assert(!VarElt.getNode() && !InsIndex.getNode() &&
10405               "Expected one variable element in this vector");
10406        VarElt = Elt;
10407        InsIndex = DAG.getVectorIdxConstant(i, dl);
10408      }
10409    }
10410    Constant *CV = ConstantVector::get(ConstVecOps);
10411    SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10412
10413    // The constants we just created may not be legal (e.g. floating point). We
10414    // must lower the vector right here because we cannot guarantee that we'll
10415    // legalize it before loading it. This is also why we could not just create
10416    // a new build vector here. If the build vector contains illegal constants,
10417    // it could get split back up into a series of insert elements.
10418    // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10419    SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10420    MachineFunction &MF = DAG.getMachineFunction();
10421    MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10422    SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10423    unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10424    unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10425    if (InsertC < NumEltsInLow128Bits)
10426      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10427
10428    // There's no good way to insert into the high elements of a >128-bit
10429    // vector, so use shuffles to avoid an extract/insert sequence.
10430    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10431    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
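         // Blend the variable element into the constant vector load: each lane
         // takes its element from the load, except lane InsertC which takes
         // element 0 of the SCALAR_TO_VECTOR node (mask index NumElts).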
10432    SmallVector<int, 8> ShuffleMask;
10433    unsigned NumElts = VT.getVectorNumElements();
10434    for (unsigned i = 0; i != NumElts; ++i)
10435      ShuffleMask.push_back(i == InsertC ? NumElts : i);
10436    SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10437    return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10438  }
10439
10440  // Special case for single non-zero, non-undef, element.
10441  if (NumNonZero == 1) {
10442    unsigned Idx = NonZeroMask.countTrailingZeros();
10443    SDValue Item = Op.getOperand(Idx);
10444
10445    // If we have a constant or non-constant insertion into the low element of
10446    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10447    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
10448    // depending on what the source datatype is.
10449    if (Idx == 0) {
10450      if (NumZero == 0)
10451        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10452
10453      if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
10454          (EltVT == MVT::i64 && Subtarget.is64Bit())) {
10455        assert((VT.is128BitVector() || VT.is256BitVector() ||
10456                VT.is512BitVector()) &&
10457               "Expected an SSE value type!");
10458        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10459        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
10460        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10461      }
10462
10463      // We can't directly insert an i8 or i16 into a vector, so zero extend
10464      // it to i32 first.
10465      if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10466        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10467        MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10468        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10469        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10470        return DAG.getBitcast(VT, Item);
10471      }
10472    }
10473
10474    // Is it a vector logical left shift?
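         // e.g. (v2i64 build_vector 0, X) is (scalar_to_vector X) shifted left
         // by half the vector width (64 bits), placing X in the upper element.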
10475    if (NumElems == 2 && Idx == 1 &&
10476        X86::isZeroNode(Op.getOperand(0)) &&
10477        !X86::isZeroNode(Op.getOperand(1))) {
10478      unsigned NumBits = VT.getSizeInBits();
10479      return getVShift(true, VT,
10480                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10481                                   VT, Op.getOperand(1)),
10482                       NumBits/2, DAG, *this, dl);
10483    }
10484
10485    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10486      return SDValue();
10487
10488    // Otherwise, if this is a vector with i32 or f32 elements, and the element
10489    // is a non-constant being inserted into an element other than the low one,
10490    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
10491    // movd/movss) to move this into the low element, then shuffle it into
10492    // place.
10493    if (EVTBits == 32) {
10494      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10495      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10496    }
10497  }
10498
10499  // Splat is obviously ok. Let legalizer expand it to a shuffle.
10500  if (Values.size() == 1) {
10501    if (EVTBits == 32) {
10502      // Instead of a shuffle like this:
10503      //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10504      // check if it's possible to issue this instead:
10505      //   shuffle (vload ptr), undef, <1, 1, 1, 1>
10506      unsigned Idx = NonZeroMask.countTrailingZeros();
10507      SDValue Item = Op.getOperand(Idx);
10508      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10509        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10510    }
10511    return SDValue();
10512  }
10513
10514  // A vector full of immediates; various special cases are already
10515  // handled, so this is best done with a single constant-pool load.
10516  if (IsAllConstants)
10517    return SDValue();
10518
10519  if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
10520    return V;
10521
10522  // See if we can use a vector load to get all of the elements.
10523  {
10524    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
10525    if (SDValue LD =
10526            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10527      return LD;
10528  }
10529
10530  // If this is a splat of pairs of 32-bit elements, we can use a narrower
10531  // build_vector and broadcast it.
10532  // TODO: We could probably generalize this more.
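       // e.g. <a,b,a,b,a,b,a,b> becomes a 4-element build_vector <a,b,u,u>,
       // bitcast to v2i64/v2f64 and then broadcast to the full vector width.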
10533  if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10534    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10535                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10536    auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10537      // Make sure all the even/odd operands match.
10538      for (unsigned i = 2; i != NumElems; ++i)
10539        if (Ops[i % 2] != Op.getOperand(i))
10540          return false;
10541      return true;
10542    };
10543    if (CanSplat(Op, NumElems, Ops)) {
10544      MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10545      MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10546      // Create a new build vector and cast to v2i64/v2f64.
10547      SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10548                                     DAG.getBuildVector(NarrowVT, dl, Ops));
10549      // Broadcast from v2i64/v2f64 and cast to final VT.
10550      MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
10551      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10552                                            NewBV));
10553    }
10554  }
10555
10556  // For AVX-length vectors, build the individual 128-bit pieces and use
10557  // shuffles to put them in place.
10558  if (VT.getSizeInBits() > 128) {
10559    MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
10560
10561    // Build both the lower and upper subvector.
10562    SDValue Lower =
10563        DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10564    SDValue Upper = DAG.getBuildVector(
10565        HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
10566
10567    // Recreate the wider vector with the lower and upper part.
10568    return concatSubVectors(Lower, Upper, DAG, dl);
10569  }
10570
10571  // Let legalizer expand 2-wide build_vectors.
10572  if (EVTBits == 64) {
10573    if (NumNonZero == 1) {
10574      // One half is zero or undef.
10575      unsigned Idx = NonZeroMask.countTrailingZeros();
10576      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10577                               Op.getOperand(Idx));
10578      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10579    }
10580    return SDValue();
10581  }
10582
10583  // If element VT is < 32 bits, convert it to inserts into a zero vector.
10584  if (EVTBits == 8 && NumElems == 16)
10585    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
10586                                          DAG, Subtarget))
10587      return V;
10588
10589  if (EVTBits == 16 && NumElems == 8)
10590    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
10591                                          DAG, Subtarget))
10592      return V;
10593
10594  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10595  if (EVTBits == 32 && NumElems == 4)
10596    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
10597      return V;
10598
10599  // If element VT is == 32 bits, turn it into a number of shuffles.
10600  if (NumElems == 4 && NumZero > 0) {
10601    SmallVector<SDValue, 8> Ops(NumElems);
10602    for (unsigned i = 0; i < 4; ++i) {
10603      bool isZero = !NonZeroMask[i];
10604      if (isZero)
10605        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10606      else
10607        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10608    }
10609
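         // Combine adjacent element pairs based on their 2-bit non-zero pattern:
         // 00 keeps the zero vector, 01/10 MOVL the non-zero element into the
         // zero vector (a '10' pair is swapped by the final shuffle mask below),
         // and 11 unpacks both elements together.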
10610    for (unsigned i = 0; i < 2; ++i) {
10611      switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
10612        default: llvm_unreachable("Unexpected NonZero count");
10613        case 0:
10614          Ops[i] = Ops[i*2];  // Must be a zero vector.
10615          break;
10616        case 1:
10617          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10618          break;
10619        case 2:
10620          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10621          break;
10622        case 3:
10623          Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10624          break;
10625      }
10626    }
10627
10628    bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
10629    bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
10630    int MaskVec[] = {
10631      Reverse1 ? 1 : 0,
10632      Reverse1 ? 0 : 1,
10633      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10634      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
10635    };
10636    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10637  }
10638
10639  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10640
10641  // Check for a build vector that is mostly a shuffle plus a few insertions.
10642  if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
10643    return Sh;
10644
10645  // For SSE 4.1, use insertps to insert each element into the result in turn.
10646  if (Subtarget.hasSSE41()) {
10647    SDValue Result;
10648    if (!Op.getOperand(0).isUndef())
10649      Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10650    else
10651      Result = DAG.getUNDEF(VT);
10652
10653    for (unsigned i = 1; i < NumElems; ++i) {
10654      if (Op.getOperand(i).isUndef()) continue;
10655      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10656                           Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10657    }
10658    return Result;
10659  }
10660
10661  // Otherwise, expand into a number of unpckl*, start by extending each of
10662  // our (non-undef) elements to the full vector width with the element in the
10663  // bottom slot of the vector (which generates no code for SSE).
10664  SmallVector<SDValue, 8> Ops(NumElems);
10665  for (unsigned i = 0; i < NumElems; ++i) {
10666    if (!Op.getOperand(i).isUndef())
10667      Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10668    else
10669      Ops[i] = DAG.getUNDEF(VT);
10670  }
10671
10672  // Next, we iteratively mix elements, e.g. for v4f32:
10673  //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10674  //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10675  //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
10676  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10677    // Generate scaled UNPCKL shuffle mask.
10678    SmallVector<int, 16> Mask;
10679    for (unsigned i = 0; i != Scale; ++i)
10680      Mask.push_back(i);
10681    for (unsigned i = 0; i != Scale; ++i)
10682      Mask.push_back(NumElems+i);
10683    Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10684
10685    for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10686      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10687  }
10688  return Ops[0];
10689}
10690
10691// 256-bit AVX can use the vinsertf128 instruction
10692// to create 256-bit vectors from two other 128-bit ones.
10693// TODO: Detect subvector broadcast here instead of DAG combine?
10694static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10695                                      const X86Subtarget &Subtarget) {
10696  SDLoc dl(Op);
10697  MVT ResVT = Op.getSimpleValueType();
10698
10699  assert((ResVT.is256BitVector() ||
10700          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
10701
10702  unsigned NumOperands = Op.getNumOperands();
10703  unsigned NumZero = 0;
10704  unsigned NumNonZero = 0;
10705  unsigned NonZeros = 0;
10706  for (unsigned i = 0; i != NumOperands; ++i) {
10707    SDValue SubVec = Op.getOperand(i);
10708    if (SubVec.isUndef())
10709      continue;
10710    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10711      ++NumZero;
10712    else {
10713      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10714      NonZeros |= 1 << i;
10715      ++NumNonZero;
10716    }
10717  }
10718
10719  // If we have more than 2 non-zeros, build each half separately.
10720  if (NumNonZero > 2) {
10721    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10722    ArrayRef<SDUse> Ops = Op->ops();
10723    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10724                             Ops.slice(0, NumOperands/2));
10725    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10726                             Ops.slice(NumOperands/2));
10727    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10728  }
10729
10730  // Otherwise, build it up through insert_subvectors.
10731  SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10732                        : DAG.getUNDEF(ResVT);
10733
10734  MVT SubVT = Op.getOperand(0).getSimpleValueType();
10735  unsigned NumSubElems = SubVT.getVectorNumElements();
10736  for (unsigned i = 0; i != NumOperands; ++i) {
10737    if ((NonZeros & (1 << i)) == 0)
10738      continue;
10739
10740    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10741                      Op.getOperand(i),
10742                      DAG.getIntPtrConstant(i * NumSubElems, dl));
10743  }
10744
10745  return Vec;
10746}
10747
10748// Lower a vXi1 CONCAT_VECTORS node by classifying its operands into zero and
10749// non-zero subvectors and emitting the cheapest k-register sequence (KSHIFTL,
10750// INSERT_SUBVECTOR or KUNPCK) that produces the result.
10751// TODO: Merge this with LowerAVXCONCAT_VECTORS?
10752static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
10753                                       const X86Subtarget &Subtarget,
10754                                       SelectionDAG & DAG) {
10755  SDLoc dl(Op);
10756  MVT ResVT = Op.getSimpleValueType();
10757  unsigned NumOperands = Op.getNumOperands();
10758
10759  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
10760         "Unexpected number of operands in CONCAT_VECTORS");
10761
10762  uint64_t Zeros = 0;
10763  uint64_t NonZeros = 0;
10764  for (unsigned i = 0; i != NumOperands; ++i) {
10765    SDValue SubVec = Op.getOperand(i);
10766    if (SubVec.isUndef())
10767      continue;
10768    assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10769    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10770      Zeros |= (uint64_t)1 << i;
10771    else
10772      NonZeros |= (uint64_t)1 << i;
10773  }
10774
10775  unsigned NumElems = ResVT.getVectorNumElements();
10776
10777  // If we are inserting a non-zero vector and there are zeros in the LSBs and
10778  // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
10779  // insert_subvector will give us two kshifts.
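       // e.g. concat(zero, X, undef, undef) becomes (X inserted into an undef
       // vector at bit 0) shifted left by one subvector's worth of mask bits.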
10780  if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10781      Log2_64(NonZeros) != NumOperands - 1) {
10782    MVT ShiftVT = ResVT;
10783    if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
10784      ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
10785    unsigned Idx = Log2_64(NonZeros);
10786    SDValue SubVec = Op.getOperand(Idx);
10787    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10788    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
10789                         DAG.getUNDEF(ShiftVT), SubVec,
10790                         DAG.getIntPtrConstant(0, dl));
10791    Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
10792                     DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10793    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10794                       DAG.getIntPtrConstant(0, dl));
10795  }
10796
10797  // If there are zero or one non-zeros we can handle this very simply.
10798  if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10799    SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10800    if (!NonZeros)
10801      return Vec;
10802    unsigned Idx = Log2_64(NonZeros);
10803    SDValue SubVec = Op.getOperand(Idx);
10804    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10805    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10806                       DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
10807  }
10808
10809  if (NumOperands > 2) {
10810    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10811    ArrayRef<SDUse> Ops = Op->ops();
10812    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10813                             Ops.slice(0, NumOperands/2));
10814    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10815                             Ops.slice(NumOperands/2));
10816    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10817  }
10818
10819  assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
10820
10821  if (ResVT.getVectorNumElements() >= 16)
10822    return Op; // The operation is legal with KUNPCK
10823
10824  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
10825                            DAG.getUNDEF(ResVT), Op.getOperand(0),
10826                            DAG.getIntPtrConstant(0, dl));
10827  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10828                     DAG.getIntPtrConstant(NumElems/2, dl));
10829}
10830
10831static SDValue LowerCONCAT_VECTORS(SDValue Op,
10832                                   const X86Subtarget &Subtarget,
10833                                   SelectionDAG &DAG) {
10834  MVT VT = Op.getSimpleValueType();
10835  if (VT.getVectorElementType() == MVT::i1)
10836    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
10837
10838  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10839         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
10840          Op.getNumOperands() == 4)));
10841
10842  // AVX can use the vinsertf128 instruction to create 256-bit vectors
10843  // from two other 128-bit ones.
10844
10845  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10846  return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
10847}
10848
10849//===----------------------------------------------------------------------===//
10850// Vector shuffle lowering
10851//
10852// This is an experimental code path for lowering vector shuffles on x86. It is
10853// designed to handle arbitrary vector shuffles and blends, gracefully
10854// degrading performance as necessary. It works hard to recognize idiomatic
10855// shuffles and lower them to optimal instruction patterns without leaving
10856// a framework that allows reasonably efficient handling of all vector shuffle
10857// patterns.
10858//===----------------------------------------------------------------------===//
10859
10860/// Tiny helper function to identify a no-op mask.
10861///
10862/// This is a somewhat boring predicate function. It checks whether the mask
10863/// array input, which is assumed to be a single-input shuffle mask of the kind
10864/// used by the X86 shuffle instructions (not a fully general
10865/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
10866/// in-place shuffle are 'no-op's.
10867static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10868  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10869    assert(Mask[i] >= -1 && "Out of bound mask element!");
10870    if (Mask[i] >= 0 && Mask[i] != i)
10871      return false;
10872  }
10873  return true;
10874}
10875
10876/// Test whether there are elements crossing LaneSizeInBits lanes in this
10877/// shuffle mask.
10878///
10879/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10880/// and we routinely test for these.
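     /// For example, with v8f32 (two 128-bit lanes of four elements), a mask
     /// entry Mask[1] == 6 pulls from lane 1 into a lane 0 slot and so crosses
     /// lanes.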
10881static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10882                                      unsigned ScalarSizeInBits,
10883                                      ArrayRef<int> Mask) {
10884  assert(LaneSizeInBits && ScalarSizeInBits &&
10885         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10886         "Illegal shuffle lane size");
10887  int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10888  int Size = Mask.size();
10889  for (int i = 0; i < Size; ++i)
10890    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10891      return true;
10892  return false;
10893}
10894
10895/// Test whether there are elements crossing 128-bit lanes in this
10896/// shuffle mask.
10897static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10898  return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10899}
10900
10901/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
10902/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
10903/// better support 'repeated mask + lane permute' style shuffles.
10904static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10905                                   unsigned ScalarSizeInBits,
10906                                   ArrayRef<int> Mask) {
10907  assert(LaneSizeInBits && ScalarSizeInBits &&
10908         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10909         "Illegal shuffle lane size");
10910  int NumElts = Mask.size();
10911  int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10912  int NumLanes = NumElts / NumEltsPerLane;
10913  if (NumLanes > 1) {
10914    for (int i = 0; i != NumLanes; ++i) {
10915      int SrcLane = -1;
10916      for (int j = 0; j != NumEltsPerLane; ++j) {
10917        int M = Mask[(i * NumEltsPerLane) + j];
10918        if (M < 0)
10919          continue;
10920        int Lane = (M % NumElts) / NumEltsPerLane;
10921        if (SrcLane >= 0 && SrcLane != Lane)
10922          return true;
10923        SrcLane = Lane;
10924      }
10925    }
10926  }
10927  return false;
10928}
10929
10930/// Test whether a shuffle mask is equivalent within each sub-lane.
10931///
10932/// This checks a shuffle mask to see if it is performing the same
10933/// lane-relative shuffle in each sub-lane. This trivially implies
10934/// that it is also not lane-crossing. It may however involve a blend from the
10935/// same lane of a second vector.
10936///
10937/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10938/// non-trivial to compute in the face of undef lanes. The representation is
10939/// suitable for use with existing 128-bit shuffles as entries from the second
10940/// vector have been remapped to [LaneSize, 2*LaneSize).
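     /// For example, the v8f32 mask <0,8,1,9, 4,12,5,13> (an unpcklps of both
     /// lanes) repeats as <0,4,1,5> once the second-vector entries are remapped.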
10941static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10942                                  ArrayRef<int> Mask,
10943                                  SmallVectorImpl<int> &RepeatedMask) {
10944  auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10945  RepeatedMask.assign(LaneSize, -1);
10946  int Size = Mask.size();
10947  for (int i = 0; i < Size; ++i) {
10948    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10949    if (Mask[i] < 0)
10950      continue;
10951    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10952      // This entry crosses lanes, so there is no way to model this shuffle.
10953      return false;
10954
10955    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10956    // Adjust second vector indices to start at LaneSize instead of Size.
10957    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10958                                : Mask[i] % LaneSize + LaneSize;
10959    if (RepeatedMask[i % LaneSize] < 0)
10960      // This is the first non-undef entry in this slot of a 128-bit lane.
10961      RepeatedMask[i % LaneSize] = LocalM;
10962    else if (RepeatedMask[i % LaneSize] != LocalM)
10963      // Found a mismatch with the repeated mask.
10964      return false;
10965  }
10966  return true;
10967}
10968
10969/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10970static bool
10971is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10972                                SmallVectorImpl<int> &RepeatedMask) {
10973  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10974}
10975
10976static bool
10977is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10978  SmallVector<int, 32> RepeatedMask;
10979  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10980}
10981
10982/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10983static bool
10984is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10985                                SmallVectorImpl<int> &RepeatedMask) {
10986  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10987}
10988
10989/// Test whether a target shuffle mask is equivalent within each sub-lane.
10990/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10991static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10992                                        unsigned EltSizeInBits,
10993                                        ArrayRef<int> Mask,
10994                                        SmallVectorImpl<int> &RepeatedMask) {
10995  int LaneSize = LaneSizeInBits / EltSizeInBits;
10996  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10997  int Size = Mask.size();
10998  for (int i = 0; i < Size; ++i) {
10999    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11000    if (Mask[i] == SM_SentinelUndef)
11001      continue;
11002    if (Mask[i] == SM_SentinelZero) {
11003      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11004        return false;
11005      RepeatedMask[i % LaneSize] = SM_SentinelZero;
11006      continue;
11007    }
11008    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11009      // This entry crosses lanes, so there is no way to model this shuffle.
11010      return false;
11011
11012    // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11013    // later vector indices to start at multiples of LaneSize instead of Size.
11014    int LaneM = Mask[i] / Size;
11015    int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11016    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11017      // This is the first non-undef entry in this slot of a 128-bit lane.
11018      RepeatedMask[i % LaneSize] = LocalM;
11019    else if (RepeatedMask[i % LaneSize] != LocalM)
11020      // Found a mismatch with the repeated mask.
11021      return false;
11022  }
11023  return true;
11024}
11025
11026/// Test whether a target shuffle mask is equivalent within each sub-lane.
11027/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11028static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11029                                        ArrayRef<int> Mask,
11030                                        SmallVectorImpl<int> &RepeatedMask) {
11031  return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11032                                     Mask, RepeatedMask);
11033}
11034
11035/// Checks whether the vector elements referenced by two shuffle masks are
11036/// equivalent.
11037static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11038                                int Idx, int ExpectedIdx) {
11039  assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11040         ExpectedIdx < MaskSize && "Out of range element index");
11041  if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11042    return false;
11043
11044  switch (Op.getOpcode()) {
11045  case ISD::BUILD_VECTOR:
11046    // If the values are build vectors, we can look through them to find
11047    // equivalent inputs that make the shuffles equivalent.
11048    // TODO: Handle MaskSize != Op.getNumOperands()?
11049    if (MaskSize == (int)Op.getNumOperands() &&
11050        MaskSize == (int)ExpectedOp.getNumOperands())
11051      return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11052    break;
11053  case X86ISD::VBROADCAST:
11054  case X86ISD::VBROADCAST_LOAD:
11055    // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11056    return (Op == ExpectedOp &&
11057            (int)Op.getValueType().getVectorNumElements() == MaskSize);
11058  case X86ISD::HADD:
11059  case X86ISD::HSUB:
11060  case X86ISD::FHADD:
11061  case X86ISD::FHSUB:
11062  case X86ISD::PACKSS:
11063  case X86ISD::PACKUS:
11064    // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11065    // TODO: Handle MaskSize != NumElts?
11066    // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11067    if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11068      MVT VT = Op.getSimpleValueType();
11069      int NumElts = VT.getVectorNumElements();
11070      if (MaskSize == NumElts) {
11071        int NumLanes = VT.getSizeInBits() / 128;
11072        int NumEltsPerLane = NumElts / NumLanes;
11073        int NumHalfEltsPerLane = NumEltsPerLane / 2;
11074        bool SameLane =
11075            (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11076        bool SameElt =
11077            (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11078        return SameLane && SameElt;
11079      }
11080    }
11081    break;
11082  }
11083
11084  return false;
11085}
11086
11087/// Checks whether a shuffle mask is equivalent to an explicit list of
11088/// arguments.
11089///
11090/// This is a fast way to test a shuffle mask against a fixed pattern:
11091///
11092///   if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
11093///
11094/// It returns true if the mask is exactly as wide as the argument list, and
11095/// each element of the mask is either -1 (signifying undef) or the value given
11096/// in the argument.
11097static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11098                                SDValue V1 = SDValue(),
11099                                SDValue V2 = SDValue()) {
11100  int Size = Mask.size();
11101  if (Size != (int)ExpectedMask.size())
11102    return false;
11103
11104  for (int i = 0; i < Size; ++i) {
11105    assert(Mask[i] >= -1 && "Out of bound mask element!");
11106    int MaskIdx = Mask[i];
11107    int ExpectedIdx = ExpectedMask[i];
11108    if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11109      SDValue MaskV = MaskIdx < Size ? V1 : V2;
11110      SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11111      MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11112      ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11113      if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11114        return false;
11115    }
11116  }
11117  return true;
11118}
11119
11120/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11121///
11122/// The masks must be exactly the same width.
11123///
11124/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11125/// value in ExpectedMask is always accepted. Otherwise the indices must match.
11126///
11127/// SM_SentinelZero is accepted as a valid negative index but must match in
11128/// both.
11129static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11130                                      ArrayRef<int> ExpectedMask,
11131                                      SDValue V1 = SDValue(),
11132                                      SDValue V2 = SDValue()) {
11133  int Size = Mask.size();
11134  if (Size != (int)ExpectedMask.size())
11135    return false;
11136  assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11137         "Illegal target shuffle mask");
11138
11139  // Check for out-of-range target shuffle mask indices.
11140  if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11141    return false;
11142
11143  // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11144  if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11145    V1 = SDValue();
11146  if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11147    V2 = SDValue();
11148
11149  for (int i = 0; i < Size; ++i) {
11150    int MaskIdx = Mask[i];
11151    int ExpectedIdx = ExpectedMask[i];
11152    if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11153      continue;
11154    if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11155      SDValue MaskV = MaskIdx < Size ? V1 : V2;
11156      SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11157      MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11158      ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11159      if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11160        continue;
11161    }
11162    // TODO - handle SM_Sentinel equivalences.
11163    return false;
11164  }
11165  return true;
11166}
11167
11168// Attempt to create a shuffle mask from a VSELECT condition mask.
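     // e.g. a v4i32 condition of <-1, 0, -1, 0> gives the shuffle mask
     // <0, 5, 2, 7>: all-ones lanes pick from the 'true' operand and zero (or
     // undef) lanes pick from the 'false' operand.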
11169static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11170                                         SDValue Cond) {
11171  EVT CondVT = Cond.getValueType();
11172  unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11173  unsigned NumElts = CondVT.getVectorNumElements();
11174
11175  APInt UndefElts;
11176  SmallVector<APInt, 32> EltBits;
11177  if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11178                                     true, false))
11179    return false;
11180
11181  Mask.resize(NumElts, SM_SentinelUndef);
11182
11183  for (int i = 0; i != (int)NumElts; ++i) {
11184    Mask[i] = i;
11185    // Arbitrarily choose from the 2nd operand if the select condition element
11186    // is undef.
11187    // TODO: Can we do better by matching patterns such as even/odd?
11188    if (UndefElts[i] || EltBits[i].isNullValue())
11189      Mask[i] += NumElts;
11190  }
11191
11192  return true;
11193}
11194
11195// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11196// instructions.
11197static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11198  if (VT != MVT::v8i32 && VT != MVT::v8f32)
11199    return false;
11200
11201  SmallVector<int, 8> Unpcklwd;
11202  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11203                          /* Unary = */ false);
11204  SmallVector<int, 8> Unpckhwd;
11205  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11206                          /* Unary = */ false);
11207  bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11208                         isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11209  return IsUnpackwdMask;
11210}
11211
11212static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11213  // Create 128-bit vector type based on mask size.
11214  MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11215  MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11216
11217  // We can't assume a canonical shuffle mask, so try the commuted version too.
11218  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11219  ShuffleVectorSDNode::commuteMask(CommutedMask);
11220
11221  // Match any of unary/binary or low/high.
11222  for (unsigned i = 0; i != 4; ++i) {
11223    SmallVector<int, 16> UnpackMask;
11224    createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11225    if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11226        isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11227      return true;
11228  }
11229  return false;
11230}
11231
11232/// Return true if a shuffle mask chooses elements identically in its top and
11233/// bottom halves. For example, any splat mask has the same top and bottom
11234/// halves. If an element is undefined in only one half of the mask, the halves
11235/// are not considered identical.
11236static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11237  assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11238  unsigned HalfSize = Mask.size() / 2;
11239  for (unsigned i = 0; i != HalfSize; ++i) {
11240    if (Mask[i] != Mask[i + HalfSize])
11241      return false;
11242  }
11243  return true;
11244}
11245
11246/// Get a 4-lane 8-bit shuffle immediate for a mask.
11247///
11248/// This helper function produces an 8-bit shuffle immediate corresponding to
11249/// the ubiquitous shuffle encoding scheme used in x86 instructions for
11250/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11251/// example.
11252///
11253/// NB: We rely heavily on "undef" masks preserving the input lane.
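     /// For example, the mask <3, 1, 2, 0> encodes as 0b00100111 (0x27): two
     /// bits per lane, with lane 0's source index in bits[1:0] and lane 3's in
     /// bits[7:6].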
11254static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11255  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11256  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11257  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11258  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11259  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11260
11261  // If the mask only uses one non-undef element, then fully 'splat' it to
11262  // improve later broadcast matching.
11263  int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11264  assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11265
11266  int FirstElt = Mask[FirstIndex];
11267  if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11268    return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11269
11270  unsigned Imm = 0;
11271  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11272  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11273  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11274  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11275  return Imm;
11276}
11277
11278static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11279                                          SelectionDAG &DAG) {
11280  return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11281}
11282
11283// The shuffle result is of the form:
11284//   0*, a[0], 0*, a[1], ..., 0*, a[n]   (n >= 0)
11285// where the a[] elements appear in ascending order and 0* denotes a possibly
11286// empty run of zero elements. Each Zeroable element corresponds to a Mask
11287// element, as described by the computeZeroableShuffleElements function.
11288//
11289// Returns true if the non-zero mask elements are in consecutive increasing order.
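     // e.g. for v8i32, a mask <8, Z, 9, Z, 10, Z, 11, Z> (Z = zeroable) reads
     // V2's elements 0..3 in order, so lowerShuffleToEXPAND can turn it into a
     // VEXPAND of V2 against a zero vector.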
11290static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11291                                     ArrayRef<int> Mask, const EVT &VectorType,
11292                                     bool &IsZeroSideLeft) {
11293  int NextElement = -1;
11294  // Check if the Mask's nonzero elements are in increasing order.
11295  for (int i = 0, e = Mask.size(); i < e; i++) {
11296    // Undef mask elements are not allowed; zeroable elements must be actual zeros.
11297    assert(Mask[i] >= -1 && "Out of bound mask element!");
11298    if (Mask[i] < 0)
11299      return false;
11300    if (Zeroable[i])
11301      continue;
11302    // Find the lowest non-zero element.
11303    if (NextElement < 0) {
11304      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11305      IsZeroSideLeft = NextElement != 0;
11306    }
11307    // Exit if the mask's non-zero elements are not in increasing order.
11308    if (NextElement != Mask[i])
11309      return false;
11310    NextElement++;
11311  }
11312  return true;
11313}
11314
11315/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
11316static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11317                                      ArrayRef<int> Mask, SDValue V1,
11318                                      SDValue V2, const APInt &Zeroable,
11319                                      const X86Subtarget &Subtarget,
11320                                      SelectionDAG &DAG) {
11321  int Size = Mask.size();
11322  int LaneSize = 128 / VT.getScalarSizeInBits();
11323  const int NumBytes = VT.getSizeInBits() / 8;
11324  const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11325
11326  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11327         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11328         (Subtarget.hasBWI() && VT.is512BitVector()));
11329
11330  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11331  // Sign bit set in i8 mask means zero element.
11332  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11333
11334  SDValue V;
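       // Build the per-byte PSHUFB control vector: undef mask elements stay
       // undef, zeroable elements get the sign-bit-set byte, and all other
       // elements become an in-lane byte index into the single source input.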
11335  for (int i = 0; i < NumBytes; ++i) {
11336    int M = Mask[i / NumEltBytes];
11337    if (M < 0) {
11338      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11339      continue;
11340    }
11341    if (Zeroable[i / NumEltBytes]) {
11342      PSHUFBMask[i] = ZeroMask;
11343      continue;
11344    }
11345
11346    // We can only use a single input of V1 or V2.
11347    SDValue SrcV = (M >= Size ? V2 : V1);
11348    if (V && V != SrcV)
11349      return SDValue();
11350    V = SrcV;
11351    M %= Size;
11352
11353    // PSHUFB can't cross lanes, ensure this doesn't happen.
11354    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11355      return SDValue();
11356
11357    M = M % LaneSize;
11358    M = M * NumEltBytes + (i % NumEltBytes);
11359    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11360  }
11361  assert(V && "Failed to find a source input");
11362
11363  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11364  return DAG.getBitcast(
11365      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11366                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11367}
11368
11369static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11370                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
11371                           const SDLoc &dl);
11372
11373// X86 has a dedicated shuffle that can be lowered to VEXPAND.
11374static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11375                                    const APInt &Zeroable,
11376                                    ArrayRef<int> Mask, SDValue &V1,
11377                                    SDValue &V2, SelectionDAG &DAG,
11378                                    const X86Subtarget &Subtarget) {
11379  bool IsLeftZeroSide = true;
11380  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11381                                IsLeftZeroSide))
11382    return SDValue();
11383  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11384  MVT IntegerType =
11385      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11386  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11387  unsigned NumElts = VT.getVectorNumElements();
11388  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11389         "Unexpected number of vector elements");
11390  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11391                              Subtarget, DAG, DL);
11392  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11393  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11394  return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11395}
11396
11397static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11398                                  unsigned &UnpackOpcode, bool IsUnary,
11399                                  ArrayRef<int> TargetMask, const SDLoc &DL,
11400                                  SelectionDAG &DAG,
11401                                  const X86Subtarget &Subtarget) {
11402  int NumElts = VT.getVectorNumElements();
11403
11404  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11405  for (int i = 0; i != NumElts; i += 2) {
11406    int M1 = TargetMask[i + 0];
11407    int M2 = TargetMask[i + 1];
11408    Undef1 &= (SM_SentinelUndef == M1);
11409    Undef2 &= (SM_SentinelUndef == M2);
11410    Zero1 &= isUndefOrZero(M1);
11411    Zero2 &= isUndefOrZero(M2);
11412  }
11413  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11414         "Zeroable shuffle detected");
11415
11416  // Attempt to match the target mask against the unpack lo/hi mask patterns.
11417  SmallVector<int, 64> Unpckl, Unpckh;
11418  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11419  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11420                                (IsUnary ? V1 : V2))) {
11421    UnpackOpcode = X86ISD::UNPCKL;
11422    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11423    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11424    return true;
11425  }
11426
11427  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11428  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11429                                (IsUnary ? V1 : V2))) {
11430    UnpackOpcode = X86ISD::UNPCKH;
11431    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11432    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11433    return true;
11434  }
11435
11436  // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
11437  if (IsUnary && (Zero1 || Zero2)) {
11438    // Don't bother if we can blend instead.
11439    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11440        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11441      return false;
11442
11443    bool MatchLo = true, MatchHi = true;
11444    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11445      int M = TargetMask[i];
11446
11447      // Ignore if the input is known to be zero or the index is undef.
11448      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11449          (M == SM_SentinelUndef))
11450        continue;
11451
11452      MatchLo &= (M == Unpckl[i]);
11453      MatchHi &= (M == Unpckh[i]);
11454    }
11455
11456    if (MatchLo || MatchHi) {
11457      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11458      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11459      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11460      return true;
11461    }
11462  }
11463
11464  // If a binary shuffle, commute and try again.
11465  if (!IsUnary) {
11466    ShuffleVectorSDNode::commuteMask(Unpckl);
11467    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11468      UnpackOpcode = X86ISD::UNPCKL;
11469      std::swap(V1, V2);
11470      return true;
11471    }
11472
11473    ShuffleVectorSDNode::commuteMask(Unpckh);
11474    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
11475      UnpackOpcode = X86ISD::UNPCKH;
11476      std::swap(V1, V2);
11477      return true;
11478    }
11479  }
11480
11481  return false;
11482}
11483
11484// X86 has dedicated unpack instructions that can handle specific blend
11485// operations: UNPCKH and UNPCKL.
11486static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
11487                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
11488                                     SelectionDAG &DAG) {
11489  SmallVector<int, 8> Unpckl;
11490  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11491  if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11492    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11493
11494  SmallVector<int, 8> Unpckh;
11495  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11496  if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11497    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11498
11499  // Commute and try again.
11500  ShuffleVectorSDNode::commuteMask(Unpckl);
11501  if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11502    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11503
11504  ShuffleVectorSDNode::commuteMask(Unpckh);
11505  if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11506    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11507
11508  return SDValue();
11509}
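
// Illustrative sketch only (a hypothetical helper): the reference masks a
// binary unpack is compared against for a single 128-bit lane. For 8
// elements, UNPCKL interleaves the low halves as {0,8,1,9,2,10,3,11} and
// UNPCKH the high halves as {4,12,5,13,6,14,7,15}; createUnpackShuffleMask
// additionally repeats this pattern per 128-bit lane for wider types.
LLVM_ATTRIBUTE_UNUSED static void
sketchUnpackMask128(int NumElts, bool Lo, SmallVectorImpl<int> &Mask) {
  Mask.clear();
  int Base = Lo ? 0 : NumElts / 2;
  for (int i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(Base + i);           // Element from the first source.
    Mask.push_back(Base + i + NumElts); // Matching element from the second.
  }
}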
11510
11511/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
11512/// followed by unpack 256-bit.
11513static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
11514                                        ArrayRef<int> Mask, SDValue V1,
11515                                        SDValue V2, SelectionDAG &DAG) {
11516  SmallVector<int, 32> Unpckl, Unpckh;
11517  createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11518  createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11519
11520  unsigned UnpackOpcode;
11521  if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11522    UnpackOpcode = X86ISD::UNPCKL;
11523  else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11524    UnpackOpcode = X86ISD::UNPCKH;
11525  else
11526    return SDValue();
11527
11528  // This is a "natural" unpack operation (rather than the 128-bit sectored
11529  // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11530  // input in order to use the x86 instruction.
11531  V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11532                            DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11533  V1 = DAG.getBitcast(VT, V1);
11534  return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11535}
11536
11537// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11538// source into the lower elements and zeroing the upper elements.
11539static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11540                                 ArrayRef<int> Mask, const APInt &Zeroable,
11541                                 const X86Subtarget &Subtarget) {
11542  if (!VT.is512BitVector() && !Subtarget.hasVLX())
11543    return false;
11544
11545  unsigned NumElts = Mask.size();
11546  unsigned EltSizeInBits = VT.getScalarSizeInBits();
11547  unsigned MaxScale = 64 / EltSizeInBits;
11548
11549  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11550    unsigned SrcEltBits = EltSizeInBits * Scale;
11551    if (SrcEltBits < 32 && !Subtarget.hasBWI())
11552      continue;
11553    unsigned NumSrcElts = NumElts / Scale;
11554    if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11555      continue;
11556    unsigned UpperElts = NumElts - NumSrcElts;
11557    if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11558      continue;
11559    SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11560    SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11561    DstVT = MVT::getIntegerVT(EltSizeInBits);
11562    if ((NumSrcElts * EltSizeInBits) >= 128) {
11563      // ISD::TRUNCATE
11564      DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11565    } else {
11566      // X86ISD::VTRUNC
11567      DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11568    }
11569    return true;
11570  }
11571
11572  return false;
11573}
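
// Illustrative sketch only (hypothetical helper, bool array instead of the
// Zeroable APInt): the scale search on a concrete case. A v8i16 mask
// {0,2,4,6,Z,Z,Z,Z} (Z = zeroable) matches at Scale = 2: the low four lanes
// are a 2:1 truncation of a v4i32 source and the upper lanes are zero, so
// SrcVT = v4i32 and, because 4 x 16 bits is less than 128, the destination
// takes the X86ISD::VTRUNC form v8i16.
LLVM_ATTRIBUTE_UNUSED static bool
sketchFindVTRUNCScale(ArrayRef<int> Mask, ArrayRef<bool> Zeroable,
                      unsigned EltBits, unsigned &FoundScale) {
  unsigned NumElts = Mask.size();
  for (unsigned Scale = 2; Scale <= 64 / EltBits; Scale += Scale) {
    unsigned NumSrcElts = NumElts / Scale;
    bool Matches = true;
    for (unsigned i = 0; i != NumElts && Matches; ++i) {
      if (i < NumSrcElts)
        Matches = Mask[i] < 0 || Mask[i] == int(i * Scale); // Truncation step.
      else
        Matches = Zeroable[i]; // Upper elements must be zeroable.
    }
    if (Matches) {
      FoundScale = Scale;
      return true;
    }
  }
  return false;
}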
11574
11575// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11576// element padding to the final DstVT.
11577static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11578                                  const X86Subtarget &Subtarget,
11579                                  SelectionDAG &DAG, bool ZeroUppers) {
11580  MVT SrcVT = Src.getSimpleValueType();
11581  MVT DstSVT = DstVT.getScalarType();
11582  unsigned NumDstElts = DstVT.getVectorNumElements();
11583  unsigned NumSrcElts = SrcVT.getVectorNumElements();
11584  unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11585
11586  if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11587    return SDValue();
11588
11589  // Perform a direct ISD::TRUNCATE if possible.
11590  if (NumSrcElts == NumDstElts)
11591    return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11592
11593  if (NumSrcElts > NumDstElts) {
11594    MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11595    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11596    return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11597  }
11598
11599  if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11600    MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11601    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11602    return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11603                          DstVT.getSizeInBits());
11604  }
11605
11606  // Non-VLX targets must truncate from a 512-bit type, so we need to
11607  // widen, truncate and then possibly extract the original subvector.
11608  if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11609    SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11610    return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11611  }
11612
11613  // Fallback to an X86ISD::VTRUNC, padding if necessary.
11614  MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11615  SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11616  if (DstVT != TruncVT)
11617    Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11618                           DstVT.getSizeInBits());
11619  return Trunc;
11620}
11621
11622// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11623//
11624// An example is the following:
11625//
11626// t0: ch = EntryToken
11627//           t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11628//         t25: v4i32 = truncate t2
11629//       t41: v8i16 = bitcast t25
11630//       t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11631//       Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11632//     t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11633//   t18: v2i64 = bitcast t51
11634//
11635// One can just use a single vpmovdw instruction; without avx512vl we need to
11636// use the zmm variant and extract the lower subvector, padding with zeroes.
11637// TODO: Merge with lowerShuffleAsVTRUNC.
11638static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
11639                                     SDValue V2, ArrayRef<int> Mask,
11640                                     const APInt &Zeroable,
11641                                     const X86Subtarget &Subtarget,
11642                                     SelectionDAG &DAG) {
11643  assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11644  if (!Subtarget.hasAVX512())
11645    return SDValue();
11646
11647  unsigned NumElts = VT.getVectorNumElements();
11648  unsigned EltSizeInBits = VT.getScalarSizeInBits();
11649  unsigned MaxScale = 64 / EltSizeInBits;
11650  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11651    unsigned NumSrcElts = NumElts / Scale;
11652    unsigned UpperElts = NumElts - NumSrcElts;
11653    if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11654        !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11655      continue;
11656
11657    SDValue Src = V1;
11658    if (!Src.hasOneUse())
11659      return SDValue();
11660
11661    Src = peekThroughOneUseBitcasts(Src);
11662    if (Src.getOpcode() != ISD::TRUNCATE ||
11663        Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
11664      return SDValue();
11665    Src = Src.getOperand(0);
11666
11667    // VPMOVWB is only available with avx512bw.
11668    MVT SrcVT = Src.getSimpleValueType();
11669    if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11670        !Subtarget.hasBWI())
11671      return SDValue();
11672
11673    bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11674    return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11675  }
11676
11677  return SDValue();
11678}
11679
11680// Attempt to match binary shuffle patterns as a truncate.
11681static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
11682                                    SDValue V2, ArrayRef<int> Mask,
11683                                    const APInt &Zeroable,
11684                                    const X86Subtarget &Subtarget,
11685                                    SelectionDAG &DAG) {
11686  assert((VT.is128BitVector() || VT.is256BitVector()) &&
11687         "Unexpected VTRUNC type");
11688  if (!Subtarget.hasAVX512())
11689    return SDValue();
11690
11691  unsigned NumElts = VT.getVectorNumElements();
11692  unsigned EltSizeInBits = VT.getScalarSizeInBits();
11693  unsigned MaxScale = 64 / EltSizeInBits;
11694  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11695    // TODO: Support non-BWI VPMOVWB truncations?
11696    unsigned SrcEltBits = EltSizeInBits * Scale;
11697    if (SrcEltBits < 32 && !Subtarget.hasBWI())
11698      continue;
11699
11700    // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
11701    // Bail if the V2 elements are undef.
11702    unsigned NumHalfSrcElts = NumElts / Scale;
11703    unsigned NumSrcElts = 2 * NumHalfSrcElts;
11704    if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11705        isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11706      continue;
11707
11708    // The elements beyond the truncation must be undef/zero.
11709    unsigned UpperElts = NumElts - NumSrcElts;
11710    if (UpperElts > 0 &&
11711        !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11712      continue;
11713    bool UndefUppers =
11714        UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11715
11716    // As we're using both sources, we need to concatenate them together
11717    // and truncate from the double-sized src.
11718    MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
11719    SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11720
11721    MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11722    MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11723    Src = DAG.getBitcast(SrcVT, Src);
11724    return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11725  }
11726
11727  return SDValue();
11728}
11729
11730/// Check whether a compaction lowering can be done by dropping even
11731/// elements and compute how many times even elements must be dropped.
11732///
11733/// This handles shuffles which take every Nth element where N is a power of
11734/// two. Example shuffle masks:
11735///
11736///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
11737///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11738///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
11739///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
11740///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
11741///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
11742///
11743/// Any of these lanes can of course be undef.
11744///
11745/// This routine only supports N <= 3.
11746/// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
11747/// for larger N.
11748///
11749/// \returns N above, or the number of times even elements must be dropped if
11750/// there is such a number. Otherwise returns zero.
11751static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11752                                          bool IsSingleInput) {
11753  // The modulus for the shuffle vector entries is based on whether this is
11754  // a single input or not.
11755  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11756  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11757         "We should only be called with masks with a power-of-2 size!");
11758
11759  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11760
11761  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11762  // and 2^3 simultaneously. This is because we may have ambiguity with
11763  // partially undef inputs.
11764  bool ViableForN[3] = {true, true, true};
11765
11766  for (int i = 0, e = Mask.size(); i < e; ++i) {
11767    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11768    // want.
11769    if (Mask[i] < 0)
11770      continue;
11771
11772    bool IsAnyViable = false;
11773    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11774      if (ViableForN[j]) {
11775        uint64_t N = j + 1;
11776
11777        // The shuffle mask must be equal to (i * 2^N) % M.
11778        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11779          IsAnyViable = true;
11780        else
11781          ViableForN[j] = false;
11782      }
11783    // Early exit if we exhaust the possible powers of two.
11784    if (!IsAnyViable)
11785      break;
11786  }
11787
11788  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11789    if (ViableForN[j])
11790      return j + 1;
11791
11792  // Return 0 as there is no viable power of two.
11793  return 0;
11794}
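
// Illustrative sketch only (hypothetical helper): checking a mask against one
// fixed stride N rather than all of N = 1..3 at once. For the 16-element
// single-input mask {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14} from the comment
// above this returns true for N = 1: every defined lane equals (i << 1)
// reduced by the 16-element modulus.
LLVM_ATTRIBUTE_UNUSED static bool
sketchViableForDroppingN(ArrayRef<int> Mask, bool IsSingleInput, unsigned N) {
  uint64_t ModMask = (uint64_t)(Mask.size() * (IsSingleInput ? 1 : 2)) - 1;
  for (int i = 0, e = Mask.size(); i != e; ++i) {
    if (Mask[i] < 0)
      continue; // Undef lanes may take any value.
    if ((uint64_t)Mask[i] != (((uint64_t)i << N) & ModMask))
      return false;
  }
  return true;
}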
11795
11796// X86 has dedicated pack instructions that can handle specific truncation
11797// operations: PACKSS and PACKUS.
11798// Checks for compaction shuffle masks if MaxStages > 1.
11799// TODO: Add support for matching multiple PACKSS/PACKUS stages.
11800static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11801                                 unsigned &PackOpcode, ArrayRef<int> TargetMask,
11802                                 SelectionDAG &DAG,
11803                                 const X86Subtarget &Subtarget,
11804                                 unsigned MaxStages = 1) {
11805  unsigned NumElts = VT.getVectorNumElements();
11806  unsigned BitSize = VT.getScalarSizeInBits();
11807  assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
11808         "Illegal maximum compaction");
11809
11810  auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
11811    unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11812    unsigned NumPackedBits = NumSrcBits - BitSize;
11813    SDValue VV1 = DAG.getBitcast(PackVT, N1);
11814    SDValue VV2 = DAG.getBitcast(PackVT, N2);
11815    if (Subtarget.hasSSE41() || BitSize == 8) {
11816      APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
11817      if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
11818          (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
11819        V1 = VV1;
11820        V2 = VV2;
11821        SrcVT = PackVT;
11822        PackOpcode = X86ISD::PACKUS;
11823        return true;
11824      }
11825    }
11826    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
11827        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
11828      V1 = VV1;
11829      V2 = VV2;
11830      SrcVT = PackVT;
11831      PackOpcode = X86ISD::PACKSS;
11832      return true;
11833    }
11834    return false;
11835  };
11836
11837  // Attempt to match against wider and wider compaction patterns.
11838  for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11839    MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
11840    MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
11841
11842    // Try binary shuffle.
11843    SmallVector<int, 32> BinaryMask;
11844    createPackShuffleMask(VT, BinaryMask, false, NumStages);
11845    if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
11846      if (MatchPACK(V1, V2, PackVT))
11847        return true;
11848
11849    // Try unary shuffle.
11850    SmallVector<int, 32> UnaryMask;
11851    createPackShuffleMask(VT, UnaryMask, true, NumStages);
11852    if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
11853      if (MatchPACK(V1, V1, PackVT))
11854        return true;
11855  }
11856
11857  return false;
11858}
11859
11860static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
11861                                    SDValue V1, SDValue V2, SelectionDAG &DAG,
11862                                    const X86Subtarget &Subtarget) {
11863  MVT PackVT;
11864  unsigned PackOpcode;
11865  unsigned SizeBits = VT.getSizeInBits();
11866  unsigned EltBits = VT.getScalarSizeInBits();
11867  unsigned MaxStages = Log2_32(64 / EltBits);
11868  if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11869                            Subtarget, MaxStages))
11870    return SDValue();
11871
11872  unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11873  unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11874
11875  // Don't lower multi-stage packs on AVX512, truncation is better.
11876  if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11877    return SDValue();
11878
11879  // Pack to the largest type possible:
11880  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11881  unsigned MaxPackBits = 16;
11882  if (CurrentEltBits > 16 &&
11883      (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11884    MaxPackBits = 32;
11885
11886  // Repeatedly pack down to the target size.
11887  SDValue Res;
11888  for (unsigned i = 0; i != NumStages; ++i) {
11889    unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11890    unsigned NumSrcElts = SizeBits / SrcEltBits;
11891    MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11892    MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11893    MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11894    MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11895    Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11896                      DAG.getBitcast(SrcVT, V2));
11897    V1 = V2 = Res;
11898    CurrentEltBits /= 2;
11899  }
11900  assert(Res && Res.getValueType() == VT &&
11901         "Failed to lower compaction shuffle");
11902  return Res;
11903}
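
// Illustrative sketch only (hypothetical helper): the cascade of element
// widths used when repeatedly packing down. Starting from 32-bit elements and
// packing to 8-bit elements with MaxPackBits = 32 it yields {32, 16}, i.e.
// PACK*DW followed by PACK*WB, matching the loop above.
LLVM_ATTRIBUTE_UNUSED static void
sketchPackStages(unsigned SrcEltBits, unsigned DstEltBits, unsigned MaxPackBits,
                 SmallVectorImpl<unsigned> &StageEltBits) {
  StageEltBits.clear();
  unsigned Cur = SrcEltBits;
  while (Cur > DstEltBits) {
    // Each PACKSS/PACKUS halves the element width, but never starts wider
    // than the widest pack instruction available (16 or 32 bits).
    StageEltBits.push_back(std::min(MaxPackBits, Cur));
    Cur /= 2;
  }
}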
11904
11905/// Try to emit a bitmask instruction for a shuffle.
11906///
11907/// This handles cases where we can model a blend exactly as a bitmask due to
11908/// one of the inputs being zeroable.
11909static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11910                                     SDValue V2, ArrayRef<int> Mask,
11911                                     const APInt &Zeroable,
11912                                     const X86Subtarget &Subtarget,
11913                                     SelectionDAG &DAG) {
11914  MVT MaskVT = VT;
11915  MVT EltVT = VT.getVectorElementType();
11916  SDValue Zero, AllOnes;
11917  // Use f64 if i64 isn't legal.
11918  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11919    EltVT = MVT::f64;
11920    MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11921  }
11922
11923  MVT LogicVT = VT;
11924  if (EltVT == MVT::f32 || EltVT == MVT::f64) {
11925    Zero = DAG.getConstantFP(0.0, DL, EltVT);
11926    APFloat AllOnesValue = APFloat::getAllOnesValue(
11927        SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
11928    AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11929    LogicVT =
11930        MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
11931  } else {
11932    Zero = DAG.getConstant(0, DL, EltVT);
11933    AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11934  }
11935
11936  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11937  SDValue V;
11938  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11939    if (Zeroable[i])
11940      continue;
11941    if (Mask[i] % Size != i)
11942      return SDValue(); // Not a blend.
11943    if (!V)
11944      V = Mask[i] < Size ? V1 : V2;
11945    else if (V != (Mask[i] < Size ? V1 : V2))
11946      return SDValue(); // Can only let one input through the mask.
11947
11948    VMaskOps[i] = AllOnes;
11949  }
11950  if (!V)
11951    return SDValue(); // No non-zeroable elements!
11952
11953  SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11954  VMask = DAG.getBitcast(LogicVT, VMask);
11955  V = DAG.getBitcast(LogicVT, V);
11956  SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11957  return DAG.getBitcast(VT, And);
11958}
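
// Illustrative sketch only (hypothetical, element-level scalar model): a
// blend where one side is known zero is just an AND with an all-ones /
// all-zeros mask. Zeroable lanes get an all-zero mask word, every other lane
// gets all ones, and the result is Input[i] & MaskBits[i].
LLVM_ATTRIBUTE_UNUSED static void
sketchBlendAsBitMask(ArrayRef<uint32_t> Input, ArrayRef<bool> Zeroable,
                     SmallVectorImpl<uint32_t> &Result) {
  Result.clear();
  for (int i = 0, e = Input.size(); i != e; ++i) {
    uint32_t MaskBits = Zeroable[i] ? 0u : ~0u; // All ones keeps the element.
    Result.push_back(Input[i] & MaskBits);
  }
}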
11959
11960/// Try to emit a blend instruction for a shuffle using bit math.
11961///
11962/// This is used as a fallback approach when first class blend instructions are
11963/// unavailable. Currently it is only suitable for integer vectors, but could
11964/// be generalized for floating point vectors if desirable.
11965static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11966                                      SDValue V2, ArrayRef<int> Mask,
11967                                      SelectionDAG &DAG) {
11968  assert(VT.isInteger() && "Only supports integer vector types!");
11969  MVT EltVT = VT.getVectorElementType();
11970  SDValue Zero = DAG.getConstant(0, DL, EltVT);
11971  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11972  SmallVector<SDValue, 16> MaskOps;
11973  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11974    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11975      return SDValue(); // Shuffled input!
11976    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11977  }
11978
11979  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11980  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
11981  V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
11982  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
11983}
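
// Illustrative sketch only (hypothetical, element-level scalar model): the
// bit-math blend computes (V1 & M) | (~M & V2) per element, with M = all-ones
// where the element comes from V1 and all-zeros where it comes from V2,
// mirroring the AND/ANDNP/OR sequence above.
LLVM_ATTRIBUTE_UNUSED static uint32_t
sketchBitBlendElt(uint32_t V1Elt, uint32_t V2Elt, bool TakeV1) {
  uint32_t M = TakeV1 ? ~0u : 0u;
  return (V1Elt & M) | (~M & V2Elt);
}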
11984
11985static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11986                                    SDValue PreservedSrc,
11987                                    const X86Subtarget &Subtarget,
11988                                    SelectionDAG &DAG);
11989
11990static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
11991                                MutableArrayRef<int> Mask,
11992                                const APInt &Zeroable, bool &ForceV1Zero,
11993                                bool &ForceV2Zero, uint64_t &BlendMask) {
11994  bool V1IsZeroOrUndef =
11995      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11996  bool V2IsZeroOrUndef =
11997      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11998
11999  BlendMask = 0;
12000  ForceV1Zero = false, ForceV2Zero = false;
12001  assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12002
12003  // Attempt to generate the binary blend mask. If an input is zero then
12004  // we can use any lane.
12005  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12006    int M = Mask[i];
12007    if (M == SM_SentinelUndef)
12008      continue;
12009    if (M == i)
12010      continue;
12011    if (M == i + Size) {
12012      BlendMask |= 1ull << i;
12013      continue;
12014    }
12015    if (Zeroable[i]) {
12016      if (V1IsZeroOrUndef) {
12017        ForceV1Zero = true;
12018        Mask[i] = i;
12019        continue;
12020      }
12021      if (V2IsZeroOrUndef) {
12022        ForceV2Zero = true;
12023        BlendMask |= 1ull << i;
12024        Mask[i] = i + Size;
12025        continue;
12026      }
12027    }
12028    return false;
12029  }
12030  return true;
12031}
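
// Illustrative sketch only (hypothetical helper, no zeroable handling): how
// the immediate blend mask above is built. For an 8-element mask
// {0, 9, 2, 11, 4, 13, 6, 15}, lanes 1, 3, 5 and 7 take the second source,
// so the blend immediate is 0b10101010.
LLVM_ATTRIBUTE_UNUSED static bool
sketchBlendImmediate(ArrayRef<int> Mask, uint64_t &BlendMask) {
  BlendMask = 0;
  int Size = Mask.size();
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0 || Mask[i] == i)
      continue; // Undef, or the element is already in place from V1.
    if (Mask[i] != i + Size)
      return false; // Not a per-lane choice between the two sources.
    BlendMask |= 1ull << i; // Lane i comes from the second source.
  }
  return true;
}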
12032
12033static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12034                                            int Scale) {
12035  uint64_t ScaledMask = 0;
12036  for (int i = 0; i != Size; ++i)
12037    if (BlendMask & (1ull << i))
12038      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12039  return ScaledMask;
12040}
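
// Illustrative usage only (hypothetical values): widening a v8i16 blend
// immediate to a v16i8 byte blend. Scaling 0b01010101 by 2 replicates each
// mask bit across the two bytes of its element, giving 0b0011001100110011.
LLVM_ATTRIBUTE_UNUSED static uint64_t sketchScaleBlendMaskExample() {
  return scaleVectorShuffleBlendMask(/*BlendMask=*/0x55, /*Size=*/8,
                                     /*Scale=*/2); // == 0x3333
}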
12041
12042/// Try to emit a blend instruction for a shuffle.
12043///
12044/// This doesn't do any checks for the availability of instructions for blending
12045/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12046/// be matched in the backend with the type given. What it does check for is
12047/// that the shuffle mask is a blend, or convertible into a blend with zero.
12048static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12049                                   SDValue V2, ArrayRef<int> Original,
12050                                   const APInt &Zeroable,
12051                                   const X86Subtarget &Subtarget,
12052                                   SelectionDAG &DAG) {
12053  uint64_t BlendMask = 0;
12054  bool ForceV1Zero = false, ForceV2Zero = false;
12055  SmallVector<int, 64> Mask(Original.begin(), Original.end());
12056  if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12057                           BlendMask))
12058    return SDValue();
12059
12060  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12061  if (ForceV1Zero)
12062    V1 = getZeroVector(VT, Subtarget, DAG, DL);
12063  if (ForceV2Zero)
12064    V2 = getZeroVector(VT, Subtarget, DAG, DL);
12065
12066  switch (VT.SimpleTy) {
12067  case MVT::v4i64:
12068  case MVT::v8i32:
12069    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12070    LLVM_FALLTHROUGH;
12071  case MVT::v4f64:
12072  case MVT::v8f32:
12073    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12074    LLVM_FALLTHROUGH;
12075  case MVT::v2f64:
12076  case MVT::v2i64:
12077  case MVT::v4f32:
12078  case MVT::v4i32:
12079  case MVT::v8i16:
12080    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12081    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12082                       DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12083  case MVT::v16i16: {
12084    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12085    SmallVector<int, 8> RepeatedMask;
12086    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12087      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12088      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12089      BlendMask = 0;
12090      for (int i = 0; i < 8; ++i)
12091        if (RepeatedMask[i] >= 8)
12092          BlendMask |= 1ull << i;
12093      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12094                         DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12095    }
12096    // Use PBLENDW for lower/upper lanes and then blend lanes.
12097    // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12098    // merge to VSELECT where useful.
12099    uint64_t LoMask = BlendMask & 0xFF;
12100    uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12101    if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12102      SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12103                               DAG.getTargetConstant(LoMask, DL, MVT::i8));
12104      SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12105                               DAG.getTargetConstant(HiMask, DL, MVT::i8));
12106      return DAG.getVectorShuffle(
12107          MVT::v16i16, DL, Lo, Hi,
12108          {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12109    }
12110    LLVM_FALLTHROUGH;
12111  }
12112  case MVT::v32i8:
12113    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12114    LLVM_FALLTHROUGH;
12115  case MVT::v16i8: {
12116    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12117
12118    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12119    if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12120                                               Subtarget, DAG))
12121      return Masked;
12122
12123    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12124      MVT IntegerType =
12125          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12126      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12127      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12128    }
12129
12130    // If we have VPTERNLOG, we can use that as a bit blend.
12131    if (Subtarget.hasVLX())
12132      if (SDValue BitBlend =
12133              lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12134        return BitBlend;
12135
12136    // Scale the blend by the number of bytes per element.
12137    int Scale = VT.getScalarSizeInBits() / 8;
12138
12139    // This form of blend is always done on bytes. Compute the byte vector
12140    // type.
12141    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12142
12143    // x86 allows load folding with blendvb from the 2nd source operand. But
12144    // we are still using LLVM select here (see comment below), so that's V1.
12145    // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12146    // allow that load-folding possibility.
12147    if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12148      ShuffleVectorSDNode::commuteMask(Mask);
12149      std::swap(V1, V2);
12150    }
12151
12152    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12153    // mix of LLVM's code generator and the x86 backend. We tell the code
12154    // generator that boolean values in the elements of an x86 vector register
12155    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12156    // mapping a select to operand #1, and 'false' mapping to operand #2. The
12157    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12158    // of the element (the remaining are ignored) and 0 in that high bit would
12159    // mean operand #1 while 1 in the high bit would mean operand #2. So while
12160    // the LLVM model for boolean values in vector elements gets the relevant
12161    // bit set, it is set backwards and over constrained relative to x86's
12162    // actual model.
12163    SmallVector<SDValue, 32> VSELECTMask;
12164    for (int i = 0, Size = Mask.size(); i < Size; ++i)
12165      for (int j = 0; j < Scale; ++j)
12166        VSELECTMask.push_back(
12167            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12168                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12169                                          MVT::i8));
12170
12171    V1 = DAG.getBitcast(BlendVT, V1);
12172    V2 = DAG.getBitcast(BlendVT, V2);
12173    return DAG.getBitcast(
12174        VT,
12175        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12176                      V1, V2));
12177  }
12178  case MVT::v16f32:
12179  case MVT::v8f64:
12180  case MVT::v8i64:
12181  case MVT::v16i32:
12182  case MVT::v32i16:
12183  case MVT::v64i8: {
12184    // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12185    bool OptForSize = DAG.shouldOptForSize();
12186    if (!OptForSize) {
12187      if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12188                                                 Subtarget, DAG))
12189        return Masked;
12190    }
12191
12192    // Otherwise load an immediate into a GPR, cast to k-register, and use a
12193    // masked move.
12194    MVT IntegerType =
12195        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12196    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12197    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12198  }
12199  default:
12200    llvm_unreachable("Not a supported integer vector type!");
12201  }
12202}
12203
12204/// Try to lower as a blend of elements from two inputs followed by
12205/// a single-input permutation.
12206///
12207/// This matches the pattern where we can blend elements from two inputs and
12208/// then reduce the shuffle to a single-input permutation.
12209static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12210                                             SDValue V1, SDValue V2,
12211                                             ArrayRef<int> Mask,
12212                                             SelectionDAG &DAG,
12213                                             bool ImmBlends = false) {
12214  // We build up the blend mask while checking whether a blend is a viable way
12215  // to reduce the shuffle.
12216  SmallVector<int, 32> BlendMask(Mask.size(), -1);
12217  SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12218
12219  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12220    if (Mask[i] < 0)
12221      continue;
12222
12223    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12224
12225    if (BlendMask[Mask[i] % Size] < 0)
12226      BlendMask[Mask[i] % Size] = Mask[i];
12227    else if (BlendMask[Mask[i] % Size] != Mask[i])
12228      return SDValue(); // Can't blend in the needed input!
12229
12230    PermuteMask[i] = Mask[i] % Size;
12231  }
12232
12233  // If only immediate blends, then bail if the blend mask can't be widened to
12234  // i16.
12235  unsigned EltSize = VT.getScalarSizeInBits();
12236  if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12237    return SDValue();
12238
12239  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12240  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12241}
12242
12243/// Try to lower as an unpack of elements from two inputs followed by
12244/// a single-input permutation.
12245///
12246/// This matches the pattern where we can unpack elements from two inputs and
12247/// then reduce the shuffle to a single-input (wider) permutation.
12248static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12249                                             SDValue V1, SDValue V2,
12250                                             ArrayRef<int> Mask,
12251                                             SelectionDAG &DAG) {
12252  int NumElts = Mask.size();
12253  int NumLanes = VT.getSizeInBits() / 128;
12254  int NumLaneElts = NumElts / NumLanes;
12255  int NumHalfLaneElts = NumLaneElts / 2;
12256
12257  bool MatchLo = true, MatchHi = true;
12258  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12259
12260  // Determine UNPCKL/UNPCKH type and operand order.
12261  for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12262    for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12263      int M = Mask[Lane + Elt];
12264      if (M < 0)
12265        continue;
12266
12267      SDValue &Op = Ops[Elt & 1];
12268      if (M < NumElts && (Op.isUndef() || Op == V1))
12269        Op = V1;
12270      else if (NumElts <= M && (Op.isUndef() || Op == V2))
12271        Op = V2;
12272      else
12273        return SDValue();
12274
12275      int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12276      MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12277                 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12278      MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12279                 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12280      if (!MatchLo && !MatchHi)
12281        return SDValue();
12282    }
12283  }
12284  assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12285
12286  // Now check that each pair of elts come from the same unpack pair
12287  // and set the permute mask based on each pair.
12288  // TODO - Investigate cases where we permute individual elements.
12289  SmallVector<int, 32> PermuteMask(NumElts, -1);
12290  for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12291    for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12292      int M0 = Mask[Lane + Elt + 0];
12293      int M1 = Mask[Lane + Elt + 1];
12294      if (0 <= M0 && 0 <= M1 &&
12295          (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12296        return SDValue();
12297      if (0 <= M0)
12298        PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12299      if (0 <= M1)
12300        PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12301    }
12302  }
12303
12304  unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12305  SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12306  return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12307}
12308
12309/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12310/// permuting the elements of the result in place.
12311static SDValue lowerShuffleAsByteRotateAndPermute(
12312    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12313    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12314  if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12315      (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12316      (VT.is512BitVector() && !Subtarget.hasBWI()))
12317    return SDValue();
12318
12319  // We don't currently support lane crossing permutes.
12320  if (is128BitLaneCrossingShuffleMask(VT, Mask))
12321    return SDValue();
12322
12323  int Scale = VT.getScalarSizeInBits() / 8;
12324  int NumLanes = VT.getSizeInBits() / 128;
12325  int NumElts = VT.getVectorNumElements();
12326  int NumEltsPerLane = NumElts / NumLanes;
12327
12328  // Determine range of mask elts.
12329  bool Blend1 = true;
12330  bool Blend2 = true;
12331  std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12332  std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12333  for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12334    for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12335      int M = Mask[Lane + Elt];
12336      if (M < 0)
12337        continue;
12338      if (M < NumElts) {
12339        Blend1 &= (M == (Lane + Elt));
12340        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12341        M = M % NumEltsPerLane;
12342        Range1.first = std::min(Range1.first, M);
12343        Range1.second = std::max(Range1.second, M);
12344      } else {
12345        M -= NumElts;
12346        Blend2 &= (M == (Lane + Elt));
12347        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12348        M = M % NumEltsPerLane;
12349        Range2.first = std::min(Range2.first, M);
12350        Range2.second = std::max(Range2.second, M);
12351      }
12352    }
12353  }
12354
12355  // Bail if we don't need both elements.
12356  // TODO - it might be worth doing this for unary shuffles if the permute
12357  // can be widened.
12358  if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12359      !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12360    return SDValue();
12361
12362  if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12363    return SDValue();
12364
12365  // Rotate the 2 ops so we can access both ranges, then permute the result.
12366  auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12367    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12368    SDValue Rotate = DAG.getBitcast(
12369        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12370                        DAG.getBitcast(ByteVT, Lo),
12371                        DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12372    SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12373    for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12374      for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12375        int M = Mask[Lane + Elt];
12376        if (M < 0)
12377          continue;
12378        if (M < NumElts)
12379          PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12380        else
12381          PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12382      }
12383    }
12384    return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12385  };
12386
12387  // Check if the ranges are small enough to rotate from either direction.
12388  if (Range2.second < Range1.first)
12389    return RotateAndPermute(V1, V2, Range1.first, 0);
12390  if (Range1.second < Range2.first)
12391    return RotateAndPermute(V2, V1, Range2.first, NumElts);
12392  return SDValue();
12393}
12394
12395/// Generic routine to decompose a shuffle and blend into independent
12396/// blends and permutes.
12397///
12398/// This matches the extremely common pattern for handling combined
12399/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12400/// operations. It will try to pick the best arrangement of shuffles and
12401/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12402static SDValue lowerShuffleAsDecomposedShuffleMerge(
12403    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12404    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12405  int NumElts = Mask.size();
12406  int NumLanes = VT.getSizeInBits() / 128;
12407  int NumEltsPerLane = NumElts / NumLanes;
12408
12409  // Shuffle the input elements into the desired positions in V1 and V2 and
12410  // unpack/blend them together.
12411  bool IsAlternating = true;
12412  SmallVector<int, 32> V1Mask(NumElts, -1);
12413  SmallVector<int, 32> V2Mask(NumElts, -1);
12414  SmallVector<int, 32> FinalMask(NumElts, -1);
12415  for (int i = 0; i < NumElts; ++i) {
12416    int M = Mask[i];
12417    if (M >= 0 && M < NumElts) {
12418      V1Mask[i] = M;
12419      FinalMask[i] = i;
12420      IsAlternating &= (i & 1) == 0;
12421    } else if (M >= NumElts) {
12422      V2Mask[i] = M - NumElts;
12423      FinalMask[i] = i + NumElts;
12424      IsAlternating &= (i & 1) == 1;
12425    }
12426  }
12427
12428  // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12429  // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12430  // the shuffle may be able to fold with a load or other benefit. However, when
12431  // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
12432  // pre-shuffle first is a better strategy.
12433  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12434    // Only prefer immediate blends to unpack/rotate.
12435    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12436                                                          DAG, true))
12437      return BlendPerm;
12438    if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
12439                                                           DAG))
12440      return UnpackPerm;
12441    if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
12442            DL, VT, V1, V2, Mask, Subtarget, DAG))
12443      return RotatePerm;
12444    // Unpack/rotate failed - try again with variable blends.
12445    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12446                                                          DAG))
12447      return BlendPerm;
12448  }
12449
12450  // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12451  // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12452  // TODO: It doesn't have to be alternating - but each lane mustn't have more
12453  // than half the elements coming from each source.
12454  if (IsAlternating && VT.getScalarSizeInBits() < 32) {
12455    V1Mask.assign(NumElts, -1);
12456    V2Mask.assign(NumElts, -1);
12457    FinalMask.assign(NumElts, -1);
12458    for (int i = 0; i != NumElts; i += NumEltsPerLane)
12459      for (int j = 0; j != NumEltsPerLane; ++j) {
12460        int M = Mask[i + j];
12461        if (M >= 0 && M < NumElts) {
12462          V1Mask[i + (j / 2)] = M;
12463          FinalMask[i + j] = i + (j / 2);
12464        } else if (M >= NumElts) {
12465          V2Mask[i + (j / 2)] = M - NumElts;
12466          FinalMask[i + j] = i + (j / 2) + NumElts;
12467        }
12468      }
12469  }
12470
12471  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12472  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12473  return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12474}
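
// Illustrative sketch only (hypothetical, element-level, without the
// alternating unpack special case): the basic decomposition used above. A
// v4i32 mask {0, 7, 2, 5} splits into a V1 permute {0, -1, 2, -1}, a V2
// permute {-1, 3, -1, 1} and a final blend {0, 5, 2, 7} that takes even lanes
// from the shuffled V1 and odd lanes from the shuffled V2.
LLVM_ATTRIBUTE_UNUSED static void
sketchDecomposeShuffle(ArrayRef<int> Mask, SmallVectorImpl<int> &V1Mask,
                       SmallVectorImpl<int> &V2Mask,
                       SmallVectorImpl<int> &FinalMask) {
  int NumElts = Mask.size();
  V1Mask.assign(NumElts, -1);
  V2Mask.assign(NumElts, -1);
  FinalMask.assign(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      V1Mask[i] = M;           // Move V1's element M into lane i.
      FinalMask[i] = i;        // Then take lane i of the shuffled V1.
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts; // Move V2's element into lane i.
      FinalMask[i] = i + NumElts;
    }
  }
}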
12475
12476/// Try to lower a vector shuffle as a bit rotation.
12477///
12478/// Look for a repeated rotation pattern in each sub group.
12479/// Returns an ISD::ROTL element rotation amount or -1 if failed.
12480static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
12481  int NumElts = Mask.size();
12482  assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
12483
12484  int RotateAmt = -1;
12485  for (int i = 0; i != NumElts; i += NumSubElts) {
12486    for (int j = 0; j != NumSubElts; ++j) {
12487      int M = Mask[i + j];
12488      if (M < 0)
12489        continue;
12490      if (!isInRange(M, i, i + NumSubElts))
12491        return -1;
12492      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
12493      if (0 <= RotateAmt && Offset != RotateAmt)
12494        return -1;
12495      RotateAmt = Offset;
12496    }
12497  }
12498  return RotateAmt;
12499}
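
// Illustrative usage only (hypothetical values): the repeated sub-group
// pattern {3,0,1,2, 7,4,5,6} corresponds to an ISD::ROTL of each 4-element
// group by one element, so the helper above reports a rotation amount of 1
// (which the MVT overload below then scales by the element width in bits).
LLVM_ATTRIBUTE_UNUSED static int sketchBitRotateExample() {
  const int Mask[] = {3, 0, 1, 2, 7, 4, 5, 6};
  return matchShuffleAsBitRotate(Mask, /*NumSubElts=*/4); // == 1
}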
12500
12501static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12502                                   const X86Subtarget &Subtarget,
12503                                   ArrayRef<int> Mask) {
12504  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12505  assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12506
12507  // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12508  int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12509  int MaxSubElts = 64 / EltSizeInBits;
12510  for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
12511    int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
12512    if (RotateAmt < 0)
12513      continue;
12514
12515    int NumElts = Mask.size();
12516    MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12517    RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12518    return RotateAmt * EltSizeInBits;
12519  }
12520
12521  return -1;
12522}
12523
12524/// Lower shuffle using X86ISD::VROTLI rotations.
12525static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
12526                                       ArrayRef<int> Mask,
12527                                       const X86Subtarget &Subtarget,
12528                                       SelectionDAG &DAG) {
12529  // Only XOP + AVX512 targets have bit rotation instructions.
12530  // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12531  bool IsLegal =
12532      (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12533  if (!IsLegal && Subtarget.hasSSE3())
12534    return SDValue();
12535
12536  MVT RotateVT;
12537  int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12538                                          Subtarget, Mask);
12539  if (RotateAmt < 0)
12540    return SDValue();
12541
12542  // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12543  // expanded to OR(SRL,SHL), will be more efficient, but if they can
12544  // widen to vXi16 or more then the existing lowering will be better.
12545  if (!IsLegal) {
12546    if ((RotateAmt % 16) == 0)
12547      return SDValue();
12548    // TODO: Use getTargetVShiftByConstNode.
12549    unsigned ShlAmt = RotateAmt;
12550    unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12551    V1 = DAG.getBitcast(RotateVT, V1);
12552    SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12553                              DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12554    SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12555                              DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12556    SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12557    return DAG.getBitcast(VT, Rot);
12558  }
12559
12560  SDValue Rot =
12561      DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12562                  DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12563  return DAG.getBitcast(VT, Rot);
12564}
12565
12566/// Try to match a vector shuffle as an element rotation.
12567///
12568/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
12569static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
12570                                       ArrayRef<int> Mask) {
12571  int NumElts = Mask.size();
12572
12573  // We need to detect various ways of spelling a rotation:
12574  //   [11, 12, 13, 14, 15,  0,  1,  2]
12575  //   [-1, 12, 13, 14, -1, -1,  1, -1]
12576  //   [-1, -1, -1, -1, -1, -1,  1,  2]
12577  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
12578  //   [-1,  4,  5,  6, -1, -1,  9, -1]
12579  //   [-1,  4,  5,  6, -1, -1, -1, -1]
12580  int Rotation = 0;
12581  SDValue Lo, Hi;
12582  for (int i = 0; i < NumElts; ++i) {
12583    int M = Mask[i];
12584    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12585           "Unexpected mask index.");
12586    if (M < 0)
12587      continue;
12588
12589    // Determine where a rotated vector would have started.
12590    int StartIdx = i - (M % NumElts);
12591    if (StartIdx == 0)
12592      // The identity rotation isn't interesting, stop.
12593      return -1;
12594
12595    // If we found the tail of a vector the rotation must be the missing
12596    // front. If we found the head of a vector, it must be how much of the
12597    // head.
12598    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12599
12600    if (Rotation == 0)
12601      Rotation = CandidateRotation;
12602    else if (Rotation != CandidateRotation)
12603      // The rotations don't match, so we can't match this mask.
12604      return -1;
12605
12606    // Compute which value this mask is pointing at.
12607    SDValue MaskV = M < NumElts ? V1 : V2;
12608
12609    // Compute which of the two target values this index should be assigned
12610    // to. This reflects whether the high elements are remaining or the low
12611    // elements are remaining.
12612    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12613
12614    // Either set up this value if we've not encountered it before, or check
12615    // that it remains consistent.
12616    if (!TargetV)
12617      TargetV = MaskV;
12618    else if (TargetV != MaskV)
12619      // This may be a rotation, but it pulls from the inputs in some
12620      // unsupported interleaving.
12621      return -1;
12622  }
12623
12624  // Check that we successfully analyzed the mask, and normalize the results.
12625  assert(Rotation != 0 && "Failed to locate a viable rotation!");
12626  assert((Lo || Hi) && "Failed to find a rotated input vector!");
12627  if (!Lo)
12628    Lo = Hi;
12629  else if (!Hi)
12630    Hi = Lo;
12631
12632  V1 = Lo;
12633  V2 = Hi;
12634
12635  return Rotation;
12636}
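
// Illustrative usage only (hypothetical values): one of the "spellings" from
// the comment above. For 8 elements, {-1, 12, 13, 14, -1, -1, 1, -1} implies
// a rotated start index of -3 (from lanes 1-3) and of 5 (from lane 6), and
// both agree on a rotation of 3.
LLVM_ATTRIBUTE_UNUSED static int sketchElementRotateExample(SDValue V1,
                                                            SDValue V2) {
  const int Mask[] = {-1, 12, 13, 14, -1, -1, 1, -1};
  return matchShuffleAsElementRotate(V1, V2, Mask); // == 3
}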
12637
12638/// Try to lower a vector shuffle as a byte rotation.
12639///
12640/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12641/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12642/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12643/// try to generically lower a vector shuffle through such a pattern. It
12644/// does not check for the profitability of lowering either as PALIGNR or
12645/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12646/// This matches shuffle vectors that look like:
12647///
12648///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12649///
12650/// Essentially it concatenates V1 and V2, shifts right by some number of
12651/// elements, and takes the low elements as the result. Note that while this is
12652/// specified as a *right shift* because x86 is little-endian, it is a *left
12653/// rotate* of the vector lanes.
12654static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
12655                                    ArrayRef<int> Mask) {
12656  // Don't accept any shuffles with zero elements.
12657  if (isAnyZero(Mask))
12658    return -1;
12659
12660  // PALIGNR works on 128-bit lanes.
12661  SmallVector<int, 16> RepeatedMask;
12662  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12663    return -1;
12664
12665  int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12666  if (Rotation <= 0)
12667    return -1;
12668
12669  // PALIGNR rotates bytes, so we need to scale the
12670  // rotation based on how many bytes are in the vector lane.
12671  int NumElts = RepeatedMask.size();
12672  int Scale = 16 / NumElts;
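  // e.g. a 3-element rotation of a repeated v8i16 mask becomes a 6-byte
  // PALIGNR rotation.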
12673  return Rotation * Scale;
12674}
12675
12676static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
12677                                        SDValue V2, ArrayRef<int> Mask,
12678                                        const X86Subtarget &Subtarget,
12679                                        SelectionDAG &DAG) {
12680  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12681
12682  SDValue Lo = V1, Hi = V2;
12683  int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12684  if (ByteRotation <= 0)
12685    return SDValue();
12686
12687  // Cast the inputs to an i8 vector of the correct length to match PALIGNR
12688  // or PSLLDQ/PSRLDQ.
12689  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12690  Lo = DAG.getBitcast(ByteVT, Lo);
12691  Hi = DAG.getBitcast(ByteVT, Hi);
12692
12693  // SSSE3 targets can use the palignr instruction.
12694  if (Subtarget.hasSSSE3()) {
12695    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12696           "512-bit PALIGNR requires BWI instructions");
12697    return DAG.getBitcast(
12698        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12699                        DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12700  }
12701
12702  assert(VT.is128BitVector() &&
12703         "Rotate-based lowering only supports 128-bit lowering!");
12704  assert(Mask.size() <= 16 &&
12705         "Can shuffle at most 16 bytes in a 128-bit vector!");
12706  assert(ByteVT == MVT::v16i8 &&
12707         "SSE2 rotate lowering only needed for v16i8!");
12708
12709  // Default SSE2 implementation
12710  int LoByteShift = 16 - ByteRotation;
12711  int HiByteShift = ByteRotation;
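  // e.g. for ByteRotation == 6: Lo << 10 bytes places Lo[0..5] in the upper
  // bytes, Hi >> 6 bytes places Hi[6..15] in the lower bytes, and the OR
  // recombines them into the rotated result.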
12712
12713  SDValue LoShift =
12714      DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12715                  DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12716  SDValue HiShift =
12717      DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12718                  DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12719  return DAG.getBitcast(VT,
12720                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12721}
12722
12723/// Try to lower a vector shuffle as a dword/qword rotation.
12724///
12725/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12726/// rotation of the concatenation of two vectors; this routine will
12727/// try to generically lower a vector shuffle through such a pattern.
12728///
12729/// Essentially it concatenates V1 and V2, shifts right by some number of
12730/// elements, and takes the low elements as the result. Note that while this is
12731/// specified as a *right shift* because x86 is little-endian, it is a *left
12732/// rotate* of the vector lanes.
12733static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12734                                    SDValue V2, ArrayRef<int> Mask,
12735                                    const X86Subtarget &Subtarget,
12736                                    SelectionDAG &DAG) {
12737  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12738         "Only 32-bit and 64-bit elements are supported!");
12739
12740  // 128/256-bit vectors are only supported with VLX.
12741  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12742         && "VLX required for 128/256-bit vectors");
12743
12744  SDValue Lo = V1, Hi = V2;
12745  int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12746  if (Rotation <= 0)
12747    return SDValue();
12748
12749  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12750                     DAG.getTargetConstant(Rotation, DL, MVT::i8));
12751}
12752
12753/// Try to lower a vector shuffle as a byte shift sequence.
12754static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12755                                           SDValue V2, ArrayRef<int> Mask,
12756                                           const APInt &Zeroable,
12757                                           const X86Subtarget &Subtarget,
12758                                           SelectionDAG &DAG) {
12759  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12760  assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12761
12762  // We need a shuffle that has zeros at one/both ends and a sequential
12763  // shuffle from one source within.
12764  unsigned ZeroLo = Zeroable.countTrailingOnes();
12765  unsigned ZeroHi = Zeroable.countLeadingOnes();
12766  if (!ZeroLo && !ZeroHi)
12767    return SDValue();
12768
12769  unsigned NumElts = Mask.size();
12770  unsigned Len = NumElts - (ZeroLo + ZeroHi);
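  // e.g. for a v8i16 mask [zz, zz, 2, 3, 4, zz, zz, zz]: ZeroLo == 2,
  // ZeroHi == 3 and Len == 3.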
12771  if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12772    return SDValue();
12773
12774  unsigned Scale = VT.getScalarSizeInBits() / 8;
12775  ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12776  if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12777      !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12778    return SDValue();
12779
12780  SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12781  Res = DAG.getBitcast(MVT::v16i8, Res);
12782
12783  // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12784  // inner sequential set of elements, possibly offset:
12785  // 01234567 --> zzzzzz01 --> 1zzzzzzz
12786  // 01234567 --> 4567zzzz --> zzzzz456
12787  // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12788  if (ZeroLo == 0) {
12789    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12790    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12791                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12792    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12793                      DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12794  } else if (ZeroHi == 0) {
12795    unsigned Shift = Mask[ZeroLo] % NumElts;
12796    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12797                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12798    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12799                      DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12800  } else if (!Subtarget.hasSSSE3()) {
12801    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12802    // by performing 3 byte shifts. Shuffle combining can kick in above that.
12803    // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12804    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12805    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12806                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12807    Shift += Mask[ZeroLo] % NumElts;
12808    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12809                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12810    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12811                      DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12812  } else
12813    return SDValue();
12814
12815  return DAG.getBitcast(VT, Res);
12816}
12817
12818/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12819///
12820/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12821/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12822/// matches elements from one of the input vectors shuffled to the left or
12823/// right with zeroable elements 'shifted in'. It handles both the strictly
12824/// bit-wise element shifts and the byte shift across an entire 128-bit double
12825/// quad word lane.
12826///
12827/// PSLL : (little-endian) left bit shift.
12828/// [ zz, 0, zz,  2 ]
12829/// [ -1, 4, zz, -1 ]
12830/// PSRL : (little-endian) right bit shift.
12831/// [  1, zz,  3, zz]
12832/// [ -1, -1,  7, zz]
12833/// PSLLDQ : (little-endian) left byte shift
12834/// [ zz,  0,  1,  2,  3,  4,  5,  6]
12835/// [ zz, zz, -1, -1,  2,  3,  4, -1]
12836/// [ zz, zz, zz, zz, zz, zz, -1,  1]
12837/// PSRLDQ : (little-endian) right byte shift
12838/// [  5, 6,  7, zz, zz, zz, zz, zz]
12839/// [ -1, 5,  6,  7, zz, zz, zz, zz]
12840/// [  1, 2, -1, -1, -1, -1, zz, zz]
12841static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12842                               unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12843                               int MaskOffset, const APInt &Zeroable,
12844                               const X86Subtarget &Subtarget) {
12845  int Size = Mask.size();
12846  unsigned SizeInBits = Size * ScalarSizeInBits;
12847
12848  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12849    for (int i = 0; i < Size; i += Scale)
12850      for (int j = 0; j < Shift; ++j)
12851        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12852          return false;
12853
12854    return true;
12855  };
12856
12857  auto MatchShift = [&](int Shift, int Scale, bool Left) {
12858    for (int i = 0; i != Size; i += Scale) {
12859      unsigned Pos = Left ? i + Shift : i;
12860      unsigned Low = Left ? i : i + Shift;
12861      unsigned Len = Scale - Shift;
12862      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12863        return -1;
12864    }
12865
12866    int ShiftEltBits = ScalarSizeInBits * Scale;
12867    bool ByteShift = ShiftEltBits > 64;
12868    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12869                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12870    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
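    // e.g. a v8i16 shuffle matched within i32 groups (Scale == 2) yields a
    // bit count (a multiple of 16), while a whole-lane VSHLDQ/VSRLDQ match
    // (ShiftEltBits > 64) yields a byte count.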
12871
12872    // Normalize the scale for byte shifts to still produce an i64 element
12873    // type.
12874    Scale = ByteShift ? Scale / 2 : Scale;
12875
12876    // We need to round trip through the appropriate type for the shift.
12877    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12878    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12879                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
12880    return (int)ShiftAmt;
12881  };
12882
12883  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12884  // keep doubling the size of the integer elements up to that. We can
12885  // then shift the elements of the integer vector by whole multiples of
12886  // their width within the elements of the larger integer vector. Test each
12887  // multiple to see if we can find a match with the moved element indices
12888  // and that the shifted in elements are all zeroable.
12889  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12890  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12891    for (int Shift = 1; Shift != Scale; ++Shift)
12892      for (bool Left : {true, false})
12893        if (CheckZeros(Shift, Scale, Left)) {
12894          int ShiftAmt = MatchShift(Shift, Scale, Left);
12895          if (0 < ShiftAmt)
12896            return ShiftAmt;
12897        }
12898
12899  // no match
12900  return -1;
12901}
12902
12903static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12904                                   SDValue V2, ArrayRef<int> Mask,
12905                                   const APInt &Zeroable,
12906                                   const X86Subtarget &Subtarget,
12907                                   SelectionDAG &DAG) {
12908  int Size = Mask.size();
12909  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12910
12911  MVT ShiftVT;
12912  SDValue V = V1;
12913  unsigned Opcode;
12914
12915  // Try to match shuffle against V1 shift.
12916  int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12917                                     Mask, 0, Zeroable, Subtarget);
12918
12919  // If V1 failed, try to match shuffle against V2 shift.
12920  if (ShiftAmt < 0) {
12921    ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12922                                   Mask, Size, Zeroable, Subtarget);
12923    V = V2;
12924  }
12925
12926  if (ShiftAmt < 0)
12927    return SDValue();
12928
12929  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12930         "Illegal integer vector type");
12931  V = DAG.getBitcast(ShiftVT, V);
12932  V = DAG.getNode(Opcode, DL, ShiftVT, V,
12933                  DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12934  return DAG.getBitcast(VT, V);
12935}
12936
12937// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12938// Remainder of lower half result is zero and upper half is all undef.
12939static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12940                                ArrayRef<int> Mask, uint64_t &BitLen,
12941                                uint64_t &BitIdx, const APInt &Zeroable) {
12942  int Size = Mask.size();
12943  int HalfSize = Size / 2;
12944  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12945  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
12946
12947  // Upper half must be undefined.
12948  if (!isUndefUpperHalf(Mask))
12949    return false;
12950
12951  // Determine the extraction length from the part of the
12952  // lower half that isn't zeroable.
12953  int Len = HalfSize;
12954  for (; Len > 0; --Len)
12955    if (!Zeroable[Len - 1])
12956      break;
12957  assert(Len > 0 && "Zeroable shuffle mask");
12958
12959  // Attempt to match first Len sequential elements from the lower half.
12960  SDValue Src;
12961  int Idx = -1;
12962  for (int i = 0; i != Len; ++i) {
12963    int M = Mask[i];
12964    if (M == SM_SentinelUndef)
12965      continue;
12966    SDValue &V = (M < Size ? V1 : V2);
12967    M = M % Size;
12968
12969    // The extracted elements must start at a valid index and all mask
12970    // elements must be in the lower half.
12971    if (i > M || M >= HalfSize)
12972      return false;
12973
12974    if (Idx < 0 || (Src == V && Idx == (M - i))) {
12975      Src = V;
12976      Idx = M - i;
12977      continue;
12978    }
12979    return false;
12980  }
12981
12982  if (!Src || Idx < 0)
12983    return false;
12984
12985  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
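  // EXTRQ encodes the field length and start position as 6-bit values, so
  // express both in bits and mask them down to 6 bits.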
12986  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12987  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12988  V1 = Src;
12989  return true;
12990}
12991
12992// INSERTQ: Extract lowest Len elements from lower half of second source and
12993// insert over first source, starting at Idx.
12994// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
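// e.g. with Idx == 2 and Len == 3 the lower half is
// { A[0], A[1], B[0], B[1], B[2], A[5], .., A[HalfSize-1] } and the upper
// half is undef.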
12995static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12996                                  ArrayRef<int> Mask, uint64_t &BitLen,
12997                                  uint64_t &BitIdx) {
12998  int Size = Mask.size();
12999  int HalfSize = Size / 2;
13000  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13001
13002  // Upper half must be undefined.
13003  if (!isUndefUpperHalf(Mask))
13004    return false;
13005
13006  for (int Idx = 0; Idx != HalfSize; ++Idx) {
13007    SDValue Base;
13008
13009    // Attempt to match first source from mask before insertion point.
13010    if (isUndefInRange(Mask, 0, Idx)) {
13011      /* EMPTY */
13012    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13013      Base = V1;
13014    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13015      Base = V2;
13016    } else {
13017      continue;
13018    }
13019
13020    // Extend the extraction length looking to match both the insertion of
13021    // the second source and the remaining elements of the first.
13022    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13023      SDValue Insert;
13024      int Len = Hi - Idx;
13025
13026      // Match insertion.
13027      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13028        Insert = V1;
13029      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13030        Insert = V2;
13031      } else {
13032        continue;
13033      }
13034
13035      // Match the remaining elements of the lower half.
13036      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13037        /* EMPTY */
13038      } else if ((!Base || (Base == V1)) &&
13039                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13040        Base = V1;
13041      } else if ((!Base || (Base == V2)) &&
13042                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13043                                            Size + Hi)) {
13044        Base = V2;
13045      } else {
13046        continue;
13047      }
13048
13049      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13050      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13051      V1 = Base;
13052      V2 = Insert;
13053      return true;
13054    }
13055  }
13056
13057  return false;
13058}
13059
13060/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13061static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13062                                     SDValue V2, ArrayRef<int> Mask,
13063                                     const APInt &Zeroable, SelectionDAG &DAG) {
13064  uint64_t BitLen, BitIdx;
13065  if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13066    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13067                       DAG.getTargetConstant(BitLen, DL, MVT::i8),
13068                       DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13069
13070  if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13071    return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13072                       V2 ? V2 : DAG.getUNDEF(VT),
13073                       DAG.getTargetConstant(BitLen, DL, MVT::i8),
13074                       DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13075
13076  return SDValue();
13077}
13078
13079/// Lower a vector shuffle as a zero or any extension.
13080///
13081/// Given a specific number of elements, element bit width, and extension
13082/// stride, produce either a zero or any extension based on the available
13083/// features of the subtarget. The extended elements are consecutive and
13084/// may start at a non-zero element offset in the input; to avoid excess
13085/// shuffling, the offset must either be in the bottom lane or at the
13086/// start of a higher lane. All extended elements must be from
13087/// the same lane.
13088static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13089    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13090    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13091  assert(Scale > 1 && "Need a scale to extend.");
13092  int EltBits = VT.getScalarSizeInBits();
13093  int NumElements = VT.getVectorNumElements();
13094  int NumEltsPerLane = 128 / EltBits;
13095  int OffsetLane = Offset / NumEltsPerLane;
13096  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13097         "Only 8, 16, and 32 bit elements can be extended.");
13098  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
13099  assert(0 <= Offset && "Extension offset must be positive.");
13100  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13101         "Extension offset must be in the first lane or start an upper lane.");
13102
13103  // Check that an index is in the same lane as the base offset.
13104  auto SafeOffset = [&](int Idx) {
13105    return OffsetLane == (Idx / NumEltsPerLane);
13106  };
13107
13108  // Shift along an input so that the offset base moves to the first element.
13109  auto ShuffleOffset = [&](SDValue V) {
13110    if (!Offset)
13111      return V;
13112
13113    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13114    for (int i = 0; i * Scale < NumElements; ++i) {
13115      int SrcIdx = i + Offset;
13116      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13117    }
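    // e.g. with Offset == 4 and Scale == 2 on a v16i8 input this moves
    // elements 4..11 down to positions 0..7 and leaves the rest undef.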
13118    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13119  };
13120
13121  // Found a valid a/zext mask! Try various lowering strategies based on the
13122  // input type and available ISA extensions.
13123  if (Subtarget.hasSSE41()) {
13124    // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
13125    // PUNPCK will catch this in a later shuffle match.
13126    if (Offset && Scale == 2 && VT.is128BitVector())
13127      return SDValue();
13128    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13129                                 NumElements / Scale);
13130    InputV = ShuffleOffset(InputV);
13131    InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13132                                    DL, ExtVT, InputV, DAG);
13133    return DAG.getBitcast(VT, InputV);
13134  }
13135
13136  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13137
13138  // For any extends we can cheat for larger element sizes and use shuffle
13139  // instructions that can fold with a load and/or copy.
13140  if (AnyExt && EltBits == 32) {
13141    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13142                         -1};
13143    return DAG.getBitcast(
13144        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13145                        DAG.getBitcast(MVT::v4i32, InputV),
13146                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13147  }
13148  if (AnyExt && EltBits == 16 && Scale > 2) {
13149    int PSHUFDMask[4] = {Offset / 2, -1,
13150                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13151    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13152                         DAG.getBitcast(MVT::v4i32, InputV),
13153                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13154    int PSHUFWMask[4] = {1, -1, -1, -1};
13155    unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13156    return DAG.getBitcast(
13157        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13158                        DAG.getBitcast(MVT::v8i16, InputV),
13159                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13160  }
13161
13162  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13163  // to 64-bits.
13164  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13165    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13166    assert(VT.is128BitVector() && "Unexpected vector width!");
13167
13168    int LoIdx = Offset * EltBits;
13169    SDValue Lo = DAG.getBitcast(
13170        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13171                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
13172                                DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13173
13174    if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13175      return DAG.getBitcast(VT, Lo);
13176
13177    int HiIdx = (Offset + 1) * EltBits;
13178    SDValue Hi = DAG.getBitcast(
13179        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13180                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
13181                                DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13182    return DAG.getBitcast(VT,
13183                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13184  }
13185
13186  // If this would require more than 2 unpack instructions to expand, use
13187  // pshufb when available. We can only use more than 2 unpack instructions
13188  // when zero extending i8 elements which also makes it easier to use pshufb.
13189  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13190    assert(NumElements == 16 && "Unexpected byte vector width!");
13191    SDValue PSHUFBMask[16];
13192    for (int i = 0; i < 16; ++i) {
13193      int Idx = Offset + (i / Scale);
13194      if ((i % Scale == 0 && SafeOffset(Idx))) {
13195        PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13196        continue;
13197      }
13198      PSHUFBMask[i] =
13199          AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13200    }
13201    InputV = DAG.getBitcast(MVT::v16i8, InputV);
13202    return DAG.getBitcast(
13203        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13204                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13205  }
13206
13207  // If we are extending from an offset, ensure we start on a boundary that
13208  // we can unpack from.
13209  int AlignToUnpack = Offset % (NumElements / Scale);
13210  if (AlignToUnpack) {
13211    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13212    for (int i = AlignToUnpack; i < NumElements; ++i)
13213      ShMask[i - AlignToUnpack] = i;
13214    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13215    Offset -= AlignToUnpack;
13216  }
13217
13218  // Otherwise emit a sequence of unpacks.
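  // e.g. zero-extending the low 4 bytes of a v16i8 to v4i32 takes two rounds:
  // unpcklbw against zero gives v8i16, then unpcklwd against zero gives v4i32.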
13219  do {
13220    unsigned UnpackLoHi = X86ISD::UNPCKL;
13221    if (Offset >= (NumElements / 2)) {
13222      UnpackLoHi = X86ISD::UNPCKH;
13223      Offset -= (NumElements / 2);
13224    }
13225
13226    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13227    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13228                         : getZeroVector(InputVT, Subtarget, DAG, DL);
13229    InputV = DAG.getBitcast(InputVT, InputV);
13230    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13231    Scale /= 2;
13232    EltBits *= 2;
13233    NumElements /= 2;
13234  } while (Scale > 1);
13235  return DAG.getBitcast(VT, InputV);
13236}
13237
13238/// Try to lower a vector shuffle as a zero extension on any microarch.
13239///
13240/// This routine will try to do everything in its power to cleverly lower
13241/// a shuffle which happens to match the pattern of a zero extend. It doesn't
13242/// check for the profitability of this lowering; it tries to aggressively
13243/// match this pattern. It will use all of the micro-architectural details it
13244/// can to emit an efficient lowering. It handles both blends with all-zero
13245/// inputs (explicit zero extension) and undef lanes (sometimes undef due to
13246/// later masking).
13247///
13248/// The reason we have dedicated lowering for zext-style shuffles is that they
13249/// are both incredibly common and often quite performance sensitive.
13250static SDValue lowerShuffleAsZeroOrAnyExtend(
13251    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13252    const APInt &Zeroable, const X86Subtarget &Subtarget,
13253    SelectionDAG &DAG) {
13254  int Bits = VT.getSizeInBits();
13255  int NumLanes = Bits / 128;
13256  int NumElements = VT.getVectorNumElements();
13257  int NumEltsPerLane = NumElements / NumLanes;
13258  assert(VT.getScalarSizeInBits() <= 32 &&
13259         "Exceeds 32-bit integer zero extension limit");
13260  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13261
13262  // Define a helper function to check a particular ext-scale and lower to it if
13263  // valid.
13264  auto Lower = [&](int Scale) -> SDValue {
13265    SDValue InputV;
13266    bool AnyExt = true;
13267    int Offset = 0;
13268    int Matches = 0;
13269    for (int i = 0; i < NumElements; ++i) {
13270      int M = Mask[i];
13271      if (M < 0)
13272        continue; // Valid anywhere but doesn't tell us anything.
13273      if (i % Scale != 0) {
13274        // Each of the extended elements needs to be zeroable.
13275        if (!Zeroable[i])
13276          return SDValue();
13277
13278        // We no longer are in the anyext case.
13279        AnyExt = false;
13280        continue;
13281      }
13282
13283      // The base elements need to be consecutive indices into the
13284      // same input vector.
13285      SDValue V = M < NumElements ? V1 : V2;
13286      M = M % NumElements;
13287      if (!InputV) {
13288        InputV = V;
13289        Offset = M - (i / Scale);
13290      } else if (InputV != V)
13291        return SDValue(); // Flip-flopping inputs.
13292
13293      // Offset must start in the lowest 128-bit lane or at the start of an
13294      // upper lane.
13295      // FIXME: Is it ever worth allowing a negative base offset?
13296      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13297            (Offset % NumEltsPerLane) == 0))
13298        return SDValue();
13299
13300      // If we are offsetting, all referenced entries must come from the same
13301      // lane.
13302      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13303        return SDValue();
13304
13305      if ((M % NumElements) != (Offset + (i / Scale)))
13306        return SDValue(); // Non-consecutive strided elements.
13307      Matches++;
13308    }
13309
13310    // If we fail to find an input, we have a zero-shuffle which should always
13311    // have already been handled.
13312    // FIXME: Maybe handle this here in case during blending we end up with one?
13313    if (!InputV)
13314      return SDValue();
13315
13316    // If we are offsetting, don't extend if we only match a single input; we
13317    // can always do better by using a basic PSHUF or PUNPCK.
13318    if (Offset != 0 && Matches < 2)
13319      return SDValue();
13320
13321    return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13322                                                 InputV, Mask, Subtarget, DAG);
13323  };
13324
13325  // The widest scale possible for extending is to a 64-bit integer.
13326  assert(Bits % 64 == 0 &&
13327         "The number of bits in a vector must be divisible by 64 on x86!");
13328  int NumExtElements = Bits / 64;
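  // e.g. a v8i16 shuffle first tries Scale == 4 (zero/any-extend to 2 x i64)
  // and then Scale == 2 (4 x i32).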
13329
13330  // Each iteration, try extending the elements half as much, but into twice as
13331  // many elements.
13332  for (; NumExtElements < NumElements; NumExtElements *= 2) {
13333    assert(NumElements % NumExtElements == 0 &&
13334           "The input vector size must be divisible by the extended size.");
13335    if (SDValue V = Lower(NumElements / NumExtElements))
13336      return V;
13337  }
13338
13339  // General extends failed, but 128-bit vectors may be able to use MOVQ.
13340  if (Bits != 128)
13341    return SDValue();
13342
13343  // Returns one of the source operands if the shuffle can be reduced to a
13344  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13345  auto CanZExtLowHalf = [&]() {
13346    for (int i = NumElements / 2; i != NumElements; ++i)
13347      if (!Zeroable[i])
13348        return SDValue();
13349    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13350      return V1;
13351    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13352      return V2;
13353    return SDValue();
13354  };
13355
13356  if (SDValue V = CanZExtLowHalf()) {
13357    V = DAG.getBitcast(MVT::v2i64, V);
13358    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13359    return DAG.getBitcast(VT, V);
13360  }
13361
13362  // No viable ext lowering found.
13363  return SDValue();
13364}
13365
13366/// Try to get a scalar value for a specific element of a vector.
13367///
13368/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13369static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13370                                              SelectionDAG &DAG) {
13371  MVT VT = V.getSimpleValueType();
13372  MVT EltVT = VT.getVectorElementType();
13373  V = peekThroughBitcasts(V);
13374
13375  // If the bitcasts change the element size, we can't extract an equivalent
13376  // element from it.
13377  MVT NewVT = V.getSimpleValueType();
13378  if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13379    return SDValue();
13380
13381  if (V.getOpcode() == ISD::BUILD_VECTOR ||
13382      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13383    // Ensure the scalar operand is the same size as the destination.
13384    // FIXME: Add support for scalar truncation where possible.
13385    SDValue S = V.getOperand(Idx);
13386    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13387      return DAG.getBitcast(EltVT, S);
13388  }
13389
13390  return SDValue();
13391}
13392
13393/// Helper to test for a load that can be folded with x86 shuffles.
13394///
13395/// This is particularly important because the set of instructions varies
13396/// significantly based on whether the operand is a load or not.
13397static bool isShuffleFoldableLoad(SDValue V) {
13398  V = peekThroughBitcasts(V);
13399  return ISD::isNON_EXTLoad(V.getNode());
13400}
13401
13402/// Try to lower insertion of a single element into a zero vector.
13403///
13404/// This is a common pattern for which we have especially efficient lowering
13405/// patterns across all subtarget feature sets.
13406static SDValue lowerShuffleAsElementInsertion(
13407    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13408    const APInt &Zeroable, const X86Subtarget &Subtarget,
13409    SelectionDAG &DAG) {
13410  MVT ExtVT = VT;
13411  MVT EltVT = VT.getVectorElementType();
13412
13413  int V2Index =
13414      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13415      Mask.begin();
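  // V2Index is the first mask position that reads from V2, i.e. the lane we
  // will insert into.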
13416  bool IsV1Zeroable = true;
13417  for (int i = 0, Size = Mask.size(); i < Size; ++i)
13418    if (i != V2Index && !Zeroable[i]) {
13419      IsV1Zeroable = false;
13420      break;
13421    }
13422
13423  // Check for a single input from a SCALAR_TO_VECTOR node.
13424  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13425  // all the smarts here sunk into that routine. However, the current
13426  // lowering of BUILD_VECTOR makes that nearly impossible until the old
13427  // vector shuffle lowering is dead.
13428  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13429                                               DAG);
13430  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13431    // We need to zext the scalar if it is smaller than an i32.
13432    V2S = DAG.getBitcast(EltVT, V2S);
13433    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
13434      // Using zext to expand a narrow element won't work for non-zero
13435      // insertions.
13436      if (!IsV1Zeroable)
13437        return SDValue();
13438
13439      // Zero-extend directly to i32.
13440      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13441      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13442    }
13443    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13444  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13445             EltVT == MVT::i16) {
13446    // Either not inserting from the low element of the input or the input
13447    // element size is too small to use VZEXT_MOVL to clear the high bits.
13448    return SDValue();
13449  }
13450
13451  if (!IsV1Zeroable) {
13452    // If V1 can't be treated as a zero vector we have fewer options to lower
13453    // this. We can't support integer vectors or non-zero targets cheaply, and
13454    // the V1 elements can't be permuted in any way.
13455    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13456    if (!VT.isFloatingPoint() || V2Index != 0)
13457      return SDValue();
13458    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
13459    V1Mask[V2Index] = -1;
13460    if (!isNoopShuffleMask(V1Mask))
13461      return SDValue();
13462    if (!VT.is128BitVector())
13463      return SDValue();
13464
13465    // Otherwise, use MOVSD or MOVSS.
13466    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
13467           "Only two types of floating point element types to handle!");
13468    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
13469                       ExtVT, V1, V2);
13470  }
13471
13472  // This lowering only works for the low element with floating point vectors.
13473  if (VT.isFloatingPoint() && V2Index != 0)
13474    return SDValue();
13475
13476  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13477  if (ExtVT != VT)
13478    V2 = DAG.getBitcast(VT, V2);
13479
13480  if (V2Index != 0) {
13481    // If we have 4 or fewer lanes we can cheaply shuffle the element into
13482    // the desired position. Otherwise it is more efficient to do a vector
13483    // shift left. We know that we can do a vector shift left because all
13484    // the inputs are zero.
13485    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
13486      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13487      V2Shuffle[V2Index] = 0;
13488      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13489    } else {
13490      V2 = DAG.getBitcast(MVT::v16i8, V2);
13491      V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13492                       DAG.getTargetConstant(
13493                           V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
13494      V2 = DAG.getBitcast(VT, V2);
13495    }
13496  }
13497  return V2;
13498}
13499
13500/// Try to lower a broadcast of a single (truncated) integer element,
13501/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13502///
13503/// This assumes we have AVX2.
13504static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
13505                                            int BroadcastIdx,
13506                                            const X86Subtarget &Subtarget,
13507                                            SelectionDAG &DAG) {
13508  assert(Subtarget.hasAVX2() &&
13509         "We can only lower integer broadcasts with AVX2!");
13510
13511  MVT EltVT = VT.getVectorElementType();
13512  MVT V0VT = V0.getSimpleValueType();
13513
13514  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13515  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13516
13517  MVT V0EltVT = V0VT.getVectorElementType();
13518  if (!V0EltVT.isInteger())
13519    return SDValue();
13520
13521  const unsigned EltSize = EltVT.getSizeInBits();
13522  const unsigned V0EltSize = V0EltVT.getSizeInBits();
13523
13524  // This is only a truncation if the original element type is larger.
13525  if (V0EltSize <= EltSize)
13526    return SDValue();
13527
13528  assert(((V0EltSize % EltSize) == 0) &&
13529         "Scalar type sizes must all be powers of 2 on x86!");
13530
13531  const unsigned V0Opc = V0.getOpcode();
13532  const unsigned Scale = V0EltSize / EltSize;
13533  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
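  // e.g. broadcasting i8 element 5 from a v4i32 source gives Scale == 4 and
  // V0BroadcastIdx == 1; the later shift right by 8 bits selects the correct
  // byte before truncation.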
13534
13535  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13536      V0Opc != ISD::BUILD_VECTOR)
13537    return SDValue();
13538
13539  SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13540
13541  // If we're extracting non-least-significant bits, shift so we can truncate.
13542  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13543  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13544  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
13545  if (const int OffsetIdx = BroadcastIdx % Scale)
13546    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13547                         DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13548
13549  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13550                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13551}
13552
13553/// Test whether this can be lowered with a single SHUFPS instruction.
13554///
13555/// This is used to disable more specialized lowerings when the shufps lowering
13556/// will happen to be efficient.
13557static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
13558  // This routine only handles 128-bit shufps.
13559  assert(Mask.size() == 4 && "Unsupported mask size!");
13560  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13561  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13562  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13563  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13564
13565  // To lower with a single SHUFPS we need to have the low half and high half
13566  // each requiring a single input.
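  // e.g. [0, 5, 2, 7] mixes inputs within the low half and is rejected,
  // while [0, 2, 5, 7] keeps each half on a single input.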
13567  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13568    return false;
13569  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13570    return false;
13571
13572  return true;
13573}
13574
13575/// If we are extracting two 128-bit halves of a vector and shuffling the
13576/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13577/// multi-shuffle lowering.
13578static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
13579                                             SDValue N1, ArrayRef<int> Mask,
13580                                             SelectionDAG &DAG) {
13581  MVT VT = N0.getSimpleValueType();
13582  assert((VT.is128BitVector() &&
13583          (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13584         "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13585
13586  // Check that both sources are extracts of the same source vector.
13587  if (!N0.hasOneUse() || !N1.hasOneUse() ||
13588      N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13589      N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13590      N0.getOperand(0) != N1.getOperand(0))
13591    return SDValue();
13592
13593  SDValue WideVec = N0.getOperand(0);
13594  MVT WideVT = WideVec.getSimpleValueType();
13595  if (!WideVT.is256BitVector())
13596    return SDValue();
13597
13598  // Match extracts of each half of the wide source vector. Commute the shuffle
13599  // if the extract of the low half is N1.
13600  unsigned NumElts = VT.getVectorNumElements();
13601  SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
13602  const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13603  const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13604  if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13605    ShuffleVectorSDNode::commuteMask(NewMask);
13606  else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13607    return SDValue();
13608
13609  // Final bailout: if the mask is simple, we are better off using an extract
13610  // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13611  // because that avoids a constant load from memory.
13612  if (NumElts == 4 &&
13613      (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
13614    return SDValue();
13615
13616  // Extend the shuffle mask with undef elements.
13617  NewMask.append(NumElts, -1);
13618
13619  // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13620  SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13621                                      NewMask);
13622  // This is free: ymm -> xmm.
13623  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13624                     DAG.getIntPtrConstant(0, DL));
13625}
13626
13627/// Try to lower broadcast of a single element.
13628///
13629/// For convenience, this code also bundles all of the subtarget feature set
13630/// filtering. While a little annoying to re-dispatch on type here, there isn't
13631/// a convenient way to factor it out.
13632static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13633                                       SDValue V2, ArrayRef<int> Mask,
13634                                       const X86Subtarget &Subtarget,
13635                                       SelectionDAG &DAG) {
13636  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13637        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
13638        (Subtarget.hasAVX2() && VT.isInteger())))
13639    return SDValue();
13640
13641  // With MOVDDUP (v2f64) we can broadcast from a register or a load; otherwise
13642  // broadcasting from a register is only possible with AVX2.
13643  unsigned NumEltBits = VT.getScalarSizeInBits();
13644  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13645                        ? X86ISD::MOVDDUP
13646                        : X86ISD::VBROADCAST;
13647  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13648
13649  // Check that the mask is a broadcast.
13650  int BroadcastIdx = getSplatIndex(Mask);
13651  if (BroadcastIdx < 0)
13652    return SDValue();
13653  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13654                                            "a sorted mask where the broadcast "
13655                                            "comes from V1.");
13656
13657  // Go up the chain of (vector) values to find a scalar load that we can
13658  // combine with the broadcast.
13659  // TODO: Combine this logic with findEltLoadSrc() used by
13660  //       EltsFromConsecutiveLoads().
13661  int BitOffset = BroadcastIdx * NumEltBits;
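  // e.g. broadcasting element 5 of a v8i32 starts the walk at bit offset 160.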
13662  SDValue V = V1;
13663  for (;;) {
13664    switch (V.getOpcode()) {
13665    case ISD::BITCAST: {
13666      V = V.getOperand(0);
13667      continue;
13668    }
13669    case ISD::CONCAT_VECTORS: {
13670      int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13671      int OpIdx = BitOffset / OpBitWidth;
13672      V = V.getOperand(OpIdx);
13673      BitOffset %= OpBitWidth;
13674      continue;
13675    }
13676    case ISD::EXTRACT_SUBVECTOR: {
13677      // The extraction index adds to the existing offset.
13678      unsigned EltBitWidth = V.getScalarValueSizeInBits();
13679      unsigned Idx = V.getConstantOperandVal(1);
13680      unsigned BeginOffset = Idx * EltBitWidth;
13681      BitOffset += BeginOffset;
13682      V = V.getOperand(0);
13683      continue;
13684    }
13685    case ISD::INSERT_SUBVECTOR: {
13686      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13687      int EltBitWidth = VOuter.getScalarValueSizeInBits();
13688      int Idx = (int)V.getConstantOperandVal(2);
13689      int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13690      int BeginOffset = Idx * EltBitWidth;
13691      int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13692      if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13693        BitOffset -= BeginOffset;
13694        V = VInner;
13695      } else {
13696        V = VOuter;
13697      }
13698      continue;
13699    }
13700    }
13701    break;
13702  }
13703  assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13704  BroadcastIdx = BitOffset / NumEltBits;
13705
13706  // Do we need to bitcast the source to retrieve the original broadcast index?
13707  bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13708
13709  // Check if this is a broadcast of a scalar. We special case lowering
13710  // for scalars so that we can more effectively fold with loads.
13711  // If the original value has a larger element type than the shuffle, the
13712  // broadcast element is in essence truncated. Make that explicit to ease
13713  // folding.
13714  if (BitCastSrc && VT.isInteger())
13715    if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13716            DL, VT, V, BroadcastIdx, Subtarget, DAG))
13717      return TruncBroadcast;
13718
13719  // Also check the simpler case, where we can directly reuse the scalar.
13720  if (!BitCastSrc &&
13721      ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13722       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13723    V = V.getOperand(BroadcastIdx);
13724
13725    // If we can't broadcast from a register, check that the input is a load.
13726    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13727      return SDValue();
13728  } else if (ISD::isNormalLoad(V.getNode()) &&
13729             cast<LoadSDNode>(V)->isSimple()) {
13730    // We do not check for one-use of the vector load because a broadcast load
13731    // is expected to be a win for code size, register pressure, and possibly
13732    // uops even if the original vector load is not eliminated.
13733
13734    // Reduce the vector load and shuffle to a broadcasted scalar load.
13735    LoadSDNode *Ld = cast<LoadSDNode>(V);
13736    SDValue BaseAddr = Ld->getOperand(1);
13737    MVT SVT = VT.getScalarType();
13738    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13739    assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13740    SDValue NewAddr =
13741        DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
13742
13743    // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13744    // than MOVDDUP.
13745    // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13746    if (Opcode == X86ISD::VBROADCAST) {
13747      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13748      SDValue Ops[] = {Ld->getChain(), NewAddr};
13749      V = DAG.getMemIntrinsicNode(
13750          X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13751          DAG.getMachineFunction().getMachineMemOperand(
13752              Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13753      DAG.makeEquivalentMemoryOrdering(Ld, V);
13754      return DAG.getBitcast(VT, V);
13755    }
13756    assert(SVT == MVT::f64 && "Unexpected VT!");
13757    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13758                    DAG.getMachineFunction().getMachineMemOperand(
13759                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13760    DAG.makeEquivalentMemoryOrdering(Ld, V);
13761  } else if (!BroadcastFromReg) {
13762    // We can't broadcast from a vector register.
13763    return SDValue();
13764  } else if (BitOffset != 0) {
13765    // We can only broadcast from the zero-element of a vector register,
13766    // but it can be advantageous to broadcast from the zero-element of a
13767    // subvector.
13768    if (!VT.is256BitVector() && !VT.is512BitVector())
13769      return SDValue();
13770
13771    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13772    if (VT == MVT::v4f64 || VT == MVT::v4i64)
13773      return SDValue();
13774
13775    // Only broadcast the zero-element of a 128-bit subvector.
13776    if ((BitOffset % 128) != 0)
13777      return SDValue();
13778
13779    assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13780           "Unexpected bit-offset");
13781    assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13782           "Unexpected vector size");
13783    unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13784    V = extract128BitVector(V, ExtractIdx, DAG, DL);
13785  }
13786
13787  // On AVX we can use VBROADCAST directly for scalar sources.
13788  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13789    V = DAG.getBitcast(MVT::f64, V);
13790    if (Subtarget.hasAVX()) {
13791      V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13792      return DAG.getBitcast(VT, V);
13793    }
13794    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13795  }
13796
13797  // If this is a scalar, do the broadcast on this type and bitcast.
13798  if (!V.getValueType().isVector()) {
13799    assert(V.getScalarValueSizeInBits() == NumEltBits &&
13800           "Unexpected scalar size");
13801    MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13802                                       VT.getVectorNumElements());
13803    return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13804  }
13805
13806  // We only support broadcasting from 128-bit vectors to minimize the
13807  // number of patterns we need to deal with in isel. So extract down to
13808  // 128-bits, removing as many bitcasts as possible.
13809  if (V.getValueSizeInBits() > 128)
13810    V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13811
13812  // Otherwise cast V to a vector with the same element type as VT, but
13813  // possibly narrower than VT. Then perform the broadcast.
13814  unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13815  MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13816  return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13817}
13818
13819// Check whether we can use INSERTPS to perform the shuffle. We only use
13820// INSERTPS when the V1 elements are already in the correct locations
13821// because otherwise we can just always use two SHUFPS instructions which
13822// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13823// perform INSERTPS if a single V1 element is out of place and all V2
13824// elements are zeroable.
13825static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13826                                   unsigned &InsertPSMask,
13827                                   const APInt &Zeroable,
13828                                   ArrayRef<int> Mask, SelectionDAG &DAG) {
13829  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13830  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13831  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13832
13833  // Attempt to match INSERTPS with one element from VA or VB being
13834  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13835  // are updated.
13836  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13837                             ArrayRef<int> CandidateMask) {
13838    unsigned ZMask = 0;
13839    int VADstIndex = -1;
13840    int VBDstIndex = -1;
13841    bool VAUsedInPlace = false;
13842
13843    for (int i = 0; i < 4; ++i) {
13844      // Synthesize a zero mask from the zeroable elements (includes undefs).
13845      if (Zeroable[i]) {
13846        ZMask |= 1 << i;
13847        continue;
13848      }
13849
13850      // Flag if we use any VA inputs in place.
13851      if (i == CandidateMask[i]) {
13852        VAUsedInPlace = true;
13853        continue;
13854      }
13855
13856      // We can only insert a single non-zeroable element.
13857      if (VADstIndex >= 0 || VBDstIndex >= 0)
13858        return false;
13859
13860      if (CandidateMask[i] < 4) {
13861        // VA input out of place for insertion.
13862        VADstIndex = i;
13863      } else {
13864        // VB input for insertion.
13865        VBDstIndex = i;
13866      }
13867    }
13868
13869    // Don't bother if we have no (non-zeroable) element for insertion.
13870    if (VADstIndex < 0 && VBDstIndex < 0)
13871      return false;
13872
13873    // Determine element insertion src/dst indices. The src index is from the
13874    // start of the inserted vector, not the start of the concatenated vector.
13875    unsigned VBSrcIndex = 0;
13876    if (VADstIndex >= 0) {
13877      // If we have a VA input out of place, we use VA as the V2 element
13878      // insertion and don't use the original V2 at all.
13879      VBSrcIndex = CandidateMask[VADstIndex];
13880      VBDstIndex = VADstIndex;
13881      VB = VA;
13882    } else {
13883      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13884    }
13885
13886    // If no V1 inputs are used in place, then the result is created only from
13887    // the zero mask and the V2 insertion - so remove V1 dependency.
13888    if (!VAUsedInPlace)
13889      VA = DAG.getUNDEF(MVT::v4f32);
13890
13891    // Update V1, V2 and InsertPSMask accordingly.
13892    V1 = VA;
13893    V2 = VB;
13894
13895    // Insert the V2 element into the desired position.
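    // (INSERTPS immediate: bits [7:6] select the source element, [5:4] the
    // destination element, and [3:0] the zero mask.)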
13896    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13897    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13898    return true;
13899  };
13900
13901  if (matchAsInsertPS(V1, V2, Mask))
13902    return true;
13903
13904  // Commute and try again.
13905  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
13906  ShuffleVectorSDNode::commuteMask(CommutedMask);
13907  if (matchAsInsertPS(V2, V1, CommutedMask))
13908    return true;
13909
13910  return false;
13911}
13912
13913static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13914                                      ArrayRef<int> Mask, const APInt &Zeroable,
13915                                      SelectionDAG &DAG) {
13916  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13917  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13918
13919  // Attempt to match the insertps pattern.
13920  unsigned InsertPSMask = 0;
13921  if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13922    return SDValue();
13923
13924  // Insert the V2 element into the desired position.
13925  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13926                     DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13927}
13928
13929/// Try to lower a shuffle as a permute of the inputs followed by an
13930/// UNPCK instruction.
13931///
13932/// This specifically targets cases where we end up with alternating between
13933/// the two inputs, and so can permute them into something that feeds a single
13934/// UNPCK instruction. Note that this routine only targets integer vectors
13935/// because for floating point vectors we have a generalized SHUFPS lowering
13936/// strategy that handles everything that doesn't *exactly* match an unpack,
13937/// making this clever lowering unnecessary.
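/// For example, the v4i32 mask {1, 0, 6, 7} can be handled by permuting V1 to
/// {1, 0, u, u} and V2 to {2, 3, u, u} and then unpacking them as v2i64 with
/// UNPCKL.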
13938static SDValue lowerShuffleAsPermuteAndUnpack(
13939    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13940    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13941  assert(!VT.isFloatingPoint() &&
13942         "This routine only supports integer vectors.");
13943  assert(VT.is128BitVector() &&
13944         "This routine only works on 128-bit vectors.");
13945  assert(!V2.isUndef() &&
13946         "This routine should only be used when blending two inputs.");
13947  assert(Mask.size() >= 2 && "Single element masks are invalid.");
13948
13949  int Size = Mask.size();
13950
13951  int NumLoInputs =
13952      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13953  int NumHiInputs =
13954      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13955
13956  bool UnpackLo = NumLoInputs >= NumHiInputs;
13957
13958  auto TryUnpack = [&](int ScalarSize, int Scale) {
13959    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13960    SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13961
13962    for (int i = 0; i < Size; ++i) {
13963      if (Mask[i] < 0)
13964        continue;
13965
13966      // Each element of the unpack contains Scale elements from this mask.
13967      int UnpackIdx = i / Scale;
13968
13969      // We only handle the case where V1 feeds the first slots of the unpack.
13970      // We rely on canonicalization to ensure this is the case.
13971      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13972        return SDValue();
13973
13974      // Setup the mask for this input. The indexing is tricky as we have to
13975      // handle the unpack stride.
13976      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13977      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13978          Mask[i] % Size;
13979    }
13980
13981    // If we will have to shuffle both inputs to use the unpack, check whether
13982    // we can just unpack first and shuffle the result. If so, skip this unpack.
13983    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13984        !isNoopShuffleMask(V2Mask))
13985      return SDValue();
13986
13987    // Shuffle the inputs into place.
13988    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13989    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13990
13991    // Cast the inputs to the type we will use to unpack them.
13992    MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13993    V1 = DAG.getBitcast(UnpackVT, V1);
13994    V2 = DAG.getBitcast(UnpackVT, V2);
13995
13996    // Unpack the inputs and cast the result back to the desired type.
13997    return DAG.getBitcast(
13998        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13999                        UnpackVT, V1, V2));
14000  };
14001
14002  // We try each unpack scale from the largest to the smallest to find one
14003  // that fits this mask.
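  // For a v4i32 shuffle, for instance, this tries 64-bit and then 32-bit
  // unpack elements (Scale of 2 and then 1).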
14004  int OrigScalarSize = VT.getScalarSizeInBits();
14005  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14006    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14007      return Unpack;
14008
14009  // If we're shuffling with a zero vector then we're better off not doing
14010  // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14011  if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14012      ISD::isBuildVectorAllZeros(V2.getNode()))
14013    return SDValue();
14014
14015  // If none of the unpack-rooted lowerings worked (or were profitable) try an
14016  // initial unpack.
14017  if (NumLoInputs == 0 || NumHiInputs == 0) {
14018    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
14019           "We have to have *some* inputs!");
14020    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14021
14022    // FIXME: We could consider the total complexity of the permute of each
14023    // possible unpacking. Or at the least we should consider how many
14024    // half-crossings are created.
14025    // FIXME: We could consider commuting the unpacks.
14026
14027    SmallVector<int, 32> PermMask((unsigned)Size, -1);
14028    for (int i = 0; i < Size; ++i) {
14029      if (Mask[i] < 0)
14030        continue;
14031
14032      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
14033
14034      PermMask[i] =
14035          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14036    }
14037    return DAG.getVectorShuffle(
14038        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14039                            DL, VT, V1, V2),
14040        DAG.getUNDEF(VT), PermMask);
14041  }
14042
14043  return SDValue();
14044}
14045
14046/// Handle lowering of 2-lane 64-bit floating point shuffles.
14047///
14048/// This is the basis function for the 2-lane 64-bit shuffles as we have full
14049/// support for floating point shuffles but not integer shuffles. These
14050/// instructions will incur a domain crossing penalty on some chips, so it is
14051/// better to avoid lowering through this for integer vectors where
14052/// possible.
14053static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14054                                 const APInt &Zeroable, SDValue V1, SDValue V2,
14055                                 const X86Subtarget &Subtarget,
14056                                 SelectionDAG &DAG) {
14057  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14058  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14059  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14060
14061  if (V2.isUndef()) {
14062    // Check for being able to broadcast a single element.
14063    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14064                                                    Mask, Subtarget, DAG))
14065      return Broadcast;
14066
14067    // Straight shuffle of a single input vector. Simulate this by using the
14068    // single input as both of the "inputs" to this instruction.
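    // For example, Mask = {1, 0} gives an immediate of 0b01, selecting element
    // 1 of V1 for lane 0 and element 0 of V1 for lane 1.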
14069    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14070
14071    if (Subtarget.hasAVX()) {
14072      // If we have AVX, we can use VPERMILPD, which allows folding a load
14073      // into the shuffle.
14074      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14075                         DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14076    }
14077
14078    return DAG.getNode(
14079        X86ISD::SHUFP, DL, MVT::v2f64,
14080        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14081        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14082        DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14083  }
14084  assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14085  assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14086  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14087  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14088
14089  if (Subtarget.hasAVX2())
14090    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14091      return Extract;
14092
14093  // When loading a scalar and then shuffling it into a vector we can often do
14094  // the insertion cheaply.
14095  if (SDValue Insertion = lowerShuffleAsElementInsertion(
14096          DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14097    return Insertion;
14098  // Try inverting the insertion since for v2 masks it is easy to do and we
14099  // can't reliably sort the mask one way or the other.
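  // Flipping bit 1 of each non-negative index swaps which operand it refers
  // to, e.g. a mask of {3, 0} becomes {1, 2} once V1 and V2 are exchanged.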
14100  int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14101                        Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14102  if (SDValue Insertion = lowerShuffleAsElementInsertion(
14103          DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14104    return Insertion;
14105
14106  // Try to use one of the special instruction patterns to handle two common
14107  // blend patterns if a zero-blend above didn't work.
14108  if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14109      isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14110    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14111      // We can either use a special instruction to load over the low double or
14112      // to move just the low double.
14113      return DAG.getNode(
14114          X86ISD::MOVSD, DL, MVT::v2f64, V2,
14115          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14116
14117  if (Subtarget.hasSSE41())
14118    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14119                                            Zeroable, Subtarget, DAG))
14120      return Blend;
14121
14122  // Use dedicated unpack instructions for masks that match their pattern.
14123  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14124    return V;
14125
14126  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14127  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14128                     DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14129}
14130
14131/// Handle lowering of 2-lane 64-bit integer shuffles.
14132///
14133/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14134/// the integer unit to minimize domain crossing penalties. However, for blends
14135/// it falls back to the floating point shuffle operation with appropriate bit
14136/// casting.
14137static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14138                                 const APInt &Zeroable, SDValue V1, SDValue V2,
14139                                 const X86Subtarget &Subtarget,
14140                                 SelectionDAG &DAG) {
14141  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14142  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14143  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14144
14145  if (V2.isUndef()) {
14146    // Check for being able to broadcast a single element.
14147    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14148                                                    Mask, Subtarget, DAG))
14149      return Broadcast;
14150
14151    // Straight shuffle of a single input vector. For everything from SSE2
14152    // onward this has a single fast instruction with no scary immediates.
14153    // We have to map the mask as it is actually a v4i32 shuffle instruction.
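    // For example, a v2i64 mask of {1, 0} widens to the v4i32 mask
    // {2, 3, 0, 1}.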
14154    V1 = DAG.getBitcast(MVT::v4i32, V1);
14155    int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14156                          Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14157                          Mask[1] < 0 ? -1 : (Mask[1] * 2),
14158                          Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14159    return DAG.getBitcast(
14160        MVT::v2i64,
14161        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14162                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14163  }
14164  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14165  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14166  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14167  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14168
14169  if (Subtarget.hasAVX2())
14170    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14171      return Extract;
14172
14173  // Try to use shift instructions.
14174  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14175                                          Zeroable, Subtarget, DAG))
14176    return Shift;
14177
14178  // When loading a scalar and then shuffling it into a vector we can often do
14179  // the insertion cheaply.
14180  if (SDValue Insertion = lowerShuffleAsElementInsertion(
14181          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14182    return Insertion;
14183  // Try inverting the insertion since for v2 masks it is easy to do and we
14184  // can't reliably sort the mask one way or the other.
14185  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14186  if (SDValue Insertion = lowerShuffleAsElementInsertion(
14187          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14188    return Insertion;
14189
14190  // We have different paths for blend lowering, but they all must use the
14191  // *exact* same predicate.
14192  bool IsBlendSupported = Subtarget.hasSSE41();
14193  if (IsBlendSupported)
14194    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14195                                            Zeroable, Subtarget, DAG))
14196      return Blend;
14197
14198  // Use dedicated unpack instructions for masks that match their pattern.
14199  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14200    return V;
14201
14202  // Try to use byte rotation instructions.
14203  // It's more profitable for pre-SSSE3 targets to use shuffles/unpacks.
14204  if (Subtarget.hasSSSE3()) {
14205    if (Subtarget.hasVLX())
14206      if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14207                                                Subtarget, DAG))
14208        return Rotate;
14209
14210    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14211                                                  Subtarget, DAG))
14212      return Rotate;
14213  }
14214
14215  // If we have direct support for blends, we should lower by decomposing into
14216  // a permute. That will be faster than the domain cross.
14217  if (IsBlendSupported)
14218    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14219                                                Subtarget, DAG);
14220
14221  // We implement this with SHUFPD which is pretty lame because it will likely
14222  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14223  // However, all the alternatives are still more cycles and newer chips don't
14224  // have this problem. It would be really nice if x86 had better shuffles here.
14225  V1 = DAG.getBitcast(MVT::v2f64, V1);
14226  V2 = DAG.getBitcast(MVT::v2f64, V2);
14227  return DAG.getBitcast(MVT::v2i64,
14228                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14229}
14230
14231/// Lower a vector shuffle using the SHUFPS instruction.
14232///
14233/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14234/// It makes no assumptions about whether this is the *best* lowering; it
14235/// simply uses it.
14236static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14237                                      ArrayRef<int> Mask, SDValue V1,
14238                                      SDValue V2, SelectionDAG &DAG) {
14239  SDValue LowV = V1, HighV = V2;
14240  SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14241  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14242
14243  if (NumV2Elements == 1) {
14244    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14245
14246    // Compute the index adjacent to V2Index and in the same half by toggling
14247    // the low bit.
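    // (For V2Index == 2 this gives V2AdjIndex == 3.)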
14248    int V2AdjIndex = V2Index ^ 1;
14249
14250    if (Mask[V2AdjIndex] < 0) {
14251      // Handles all the cases where we have a single V2 element and an undef.
14252      // This will only ever happen in the high lanes because we commute the
14253      // vector otherwise.
14254      if (V2Index < 2)
14255        std::swap(LowV, HighV);
14256      NewMask[V2Index] -= 4;
14257    } else {
14258      // Handle the case where the V2 element ends up adjacent to a V1 element.
14259      // To make this work, blend them together as the first step.
14260      int V1Index = V2AdjIndex;
14261      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14262      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14263                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14264
14265      // Now proceed to reconstruct the final blend as we have the necessary
14266      // high or low half formed.
14267      if (V2Index < 2) {
14268        LowV = V2;
14269        HighV = V1;
14270      } else {
14271        HighV = V2;
14272      }
14273      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14274      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14275    }
14276  } else if (NumV2Elements == 2) {
14277    if (Mask[0] < 4 && Mask[1] < 4) {
14278      // Handle the easy case where we have V1 in the low lanes and V2 in the
14279      // high lanes.
14280      NewMask[2] -= 4;
14281      NewMask[3] -= 4;
14282    } else if (Mask[2] < 4 && Mask[3] < 4) {
14283      // We also handle the reversed case because this utility may get called
14284      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14285      // arrange things in the right direction.
14286      NewMask[0] -= 4;
14287      NewMask[1] -= 4;
14288      HighV = V1;
14289      LowV = V2;
14290    } else {
14291      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14292      // trying to place elements directly, just blend them and set up the final
14293      // shuffle to place them.
14294
14295      // The first two blend mask elements are for V1, the second two are for
14296      // V2.
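      // For example, Mask = {0, 5, 2, 7} blends to {V1[0], V1[2], V2[1], V2[3]}
      // with a SHUFPS immediate of {0, 2, 1, 3}, after which the final shuffle
      // below selects {0, 2, 1, 3} from that result.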
14297      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14298                          Mask[2] < 4 ? Mask[2] : Mask[3],
14299                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14300                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14301      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14302                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14303
14304      // Now we do a normal shuffle of V1 by giving V1 as both operands to
14305      // a blend.
14306      LowV = HighV = V1;
14307      NewMask[0] = Mask[0] < 4 ? 0 : 2;
14308      NewMask[1] = Mask[0] < 4 ? 2 : 0;
14309      NewMask[2] = Mask[2] < 4 ? 1 : 3;
14310      NewMask[3] = Mask[2] < 4 ? 3 : 1;
14311    }
14312  } else if (NumV2Elements == 3) {
14313    // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14314    // we can get here via other paths (e.g. repeated mask matching) where we
14315    // don't want to do another round of lowerVECTOR_SHUFFLE.
14316    ShuffleVectorSDNode::commuteMask(NewMask);
14317    return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14318  }
14319  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14320                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14321}
14322
14323/// Lower 4-lane 32-bit floating point shuffles.
14324///
14325/// Uses instructions exclusively from the floating point unit to minimize
14326/// domain crossing penalties, as these are sufficient to implement all v4f32
14327/// shuffles.
14328static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14329                                 const APInt &Zeroable, SDValue V1, SDValue V2,
14330                                 const X86Subtarget &Subtarget,
14331                                 SelectionDAG &DAG) {
14332  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14333  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14334  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14335
14336  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14337
14338  if (NumV2Elements == 0) {
14339    // Check for being able to broadcast a single element.
14340    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14341                                                    Mask, Subtarget, DAG))
14342      return Broadcast;
14343
14344    // Use even/odd duplicate instructions for masks that match their pattern.
14345    if (Subtarget.hasSSE3()) {
14346      if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14347        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14348      if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14349        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14350    }
14351
14352    if (Subtarget.hasAVX()) {
14353      // If we have AVX, we can use VPERMILPS which will allow folding a load
14354      // into the shuffle.
14355      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14356                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14357    }
14358
14359    // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14360    // in SSE1 because otherwise they are widened to v2f64 and never get here.
14361    if (!Subtarget.hasSSE2()) {
14362      if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14363        return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14364      if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14365        return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14366    }
14367
14368    // Otherwise, use a straight shuffle of a single input vector. We pass the
14369    // input vector to both operands to simulate this with a SHUFPS.
14370    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14371                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14372  }
14373
14374  if (Subtarget.hasAVX2())
14375    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14376      return Extract;
14377
14378  // There are special ways we can lower some single-element blends. However, we
14379  // have custom ways we can lower more complex single-element blends below that
14380  // we defer to if both this and BLENDPS fail to match, so restrict this to
14381  // when the V2 input is targeting element 0 of the mask -- that is the fast
14382  // case here.
14383  if (NumV2Elements == 1 && Mask[0] >= 4)
14384    if (SDValue V = lowerShuffleAsElementInsertion(
14385            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14386      return V;
14387
14388  if (Subtarget.hasSSE41()) {
14389    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14390                                            Zeroable, Subtarget, DAG))
14391      return Blend;
14392
14393    // Use INSERTPS if we can complete the shuffle efficiently.
14394    if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14395      return V;
14396
14397    if (!isSingleSHUFPSMask(Mask))
14398      if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14399                                                            V2, Mask, DAG))
14400        return BlendPerm;
14401  }
14402
14403  // Use low/high mov instructions. These are only valid in SSE1 because
14404  // otherwise they are widened to v2f64 and never get here.
14405  if (!Subtarget.hasSSE2()) {
14406    if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14407      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14408    if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14409      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14410  }
14411
14412  // Use dedicated unpack instructions for masks that match their pattern.
14413  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
14414    return V;
14415
14416  // Otherwise fall back to a SHUFPS lowering strategy.
14417  return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14418}
14419
14420/// Lower 4-lane i32 vector shuffles.
14421///
14422/// We try to handle these with integer-domain shuffles where we can, but for
14423/// blends we use the floating point domain blend instructions.
14424static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14425                                 const APInt &Zeroable, SDValue V1, SDValue V2,
14426                                 const X86Subtarget &Subtarget,
14427                                 SelectionDAG &DAG) {
14428  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14429  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14430  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14431
14432  // Whenever we can lower this as a zext, that instruction is strictly faster
14433  // than any alternative. It also allows us to fold memory operands into the
14434  // shuffle in many cases.
14435  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14436                                                   Zeroable, Subtarget, DAG))
14437    return ZExt;
14438
14439  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14440
14441  if (NumV2Elements == 0) {
14442    // Try to use broadcast unless the mask only has one non-undef element.
14443    if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14444      if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14445                                                      Mask, Subtarget, DAG))
14446        return Broadcast;
14447    }
14448
14449    // Straight shuffle of a single input vector. For everything from SSE2
14450    // onward this has a single fast instruction with no scary immediates.
14451    // We coerce the shuffle pattern to be compatible with UNPCK instructions
14452    // but we aren't actually going to use the UNPCK instruction because doing
14453    // so prevents folding a load into this instruction or making a copy.
14454    const int UnpackLoMask[] = {0, 0, 1, 1};
14455    const int UnpackHiMask[] = {2, 2, 3, 3};
14456    if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14457      Mask = UnpackLoMask;
14458    else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14459      Mask = UnpackHiMask;
14460
14461    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14462                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14463  }
14464
14465  if (Subtarget.hasAVX2())
14466    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14467      return Extract;
14468
14469  // Try to use shift instructions.
14470  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
14471                                          Zeroable, Subtarget, DAG))
14472    return Shift;
14473
14474  // There are special ways we can lower some single-element blends.
14475  if (NumV2Elements == 1)
14476    if (SDValue V = lowerShuffleAsElementInsertion(
14477            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14478      return V;
14479
14480  // We have different paths for blend lowering, but they all must use the
14481  // *exact* same predicate.
14482  bool IsBlendSupported = Subtarget.hasSSE41();
14483  if (IsBlendSupported)
14484    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14485                                            Zeroable, Subtarget, DAG))
14486      return Blend;
14487
14488  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14489                                             Zeroable, Subtarget, DAG))
14490    return Masked;
14491
14492  // Use dedicated unpack instructions for masks that match their pattern.
14493  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
14494    return V;
14495
14496  // Try to use byte rotation instructions.
14497  // It's more profitable for pre-SSSE3 targets to use shuffles/unpacks.
14498  if (Subtarget.hasSSSE3()) {
14499    if (Subtarget.hasVLX())
14500      if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14501                                                Subtarget, DAG))
14502        return Rotate;
14503
14504    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14505                                                  Subtarget, DAG))
14506      return Rotate;
14507  }
14508
14509  // Assume that a single SHUFPS is faster than an alternative sequence of
14510  // multiple instructions (even if the CPU has a domain penalty).
14511  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14512  if (!isSingleSHUFPSMask(Mask)) {
14513    // If we have direct support for blends, we should lower by decomposing into
14514    // a permute. That will be faster than the domain cross.
14515    if (IsBlendSupported)
14516      return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14517                                                  Subtarget, DAG);
14518
14519    // Try to lower by permuting the inputs into an unpack instruction.
14520    if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14521                                                        Mask, Subtarget, DAG))
14522      return Unpack;
14523  }
14524
14525  // We implement this with SHUFPS because it can blend from two vectors.
14526  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14527  // up the inputs, bypassing domain shift penalties that we would incur if we
14528  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14529  // relevant.
14530  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14531  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14532  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14533  return DAG.getBitcast(MVT::v4i32, ShufPS);
14534}
14535
14536/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14537/// shuffle lowering, and the most complex part.
14538///
14539/// The lowering strategy is to try to form pairs of input lanes which are
14540/// targeted at the same half of the final vector, and then use a dword shuffle
14541/// to place them onto the right half, and finally unpack the paired lanes into
14542/// their final position.
14543///
14544/// The exact breakdown of how to form these dword pairs and align them on the
14545/// correct sides is really tricky. See the comments within the function for
14546/// more of the details.
14547///
14548/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14549/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14550/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14551/// vector, form the analogous 128-bit 8-element Mask.
14552static SDValue lowerV8I16GeneralSingleInputShuffle(
14553    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14554    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14555  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14556  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14557
14558  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14559  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14560  MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14561
14562  // Attempt to directly match PSHUFLW or PSHUFHW.
14563  if (isUndefOrInRange(LoMask, 0, 4) &&
14564      isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14565    return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14566                       getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14567  }
14568  if (isUndefOrInRange(HiMask, 4, 8) &&
14569      isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14570    for (int i = 0; i != 4; ++i)
14571      HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14572    return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14573                       getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14574  }
14575
14576  SmallVector<int, 4> LoInputs;
14577  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14578  array_pod_sort(LoInputs.begin(), LoInputs.end());
14579  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
14580  SmallVector<int, 4> HiInputs;
14581  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14582  array_pod_sort(HiInputs.begin(), HiInputs.end());
14583  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
14584  int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14585  int NumHToL = LoInputs.size() - NumLToL;
14586  int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14587  int NumHToH = HiInputs.size() - NumLToH;
14588  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14589  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14590  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14591  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14592
14593  // If we are shuffling values from one half, check how many different DWORD
14594  // pairs we need to create. If only 1 or 2 are needed, we can perform this as a
14595  // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
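  // For example, the mask {0, 1, 0, 1, 2, 3, 2, 3} needs only two DWORD pairs
  // and lowers to an identity PSHUFLW followed by a PSHUFD with mask
  // {0, 0, 1, 1}.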
14596  auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14597                               ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14598    V = DAG.getNode(ShufWOp, DL, VT, V,
14599                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14600    V = DAG.getBitcast(PSHUFDVT, V);
14601    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14602                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14603    return DAG.getBitcast(VT, V);
14604  };
14605
14606  if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14607    int PSHUFDMask[4] = { -1, -1, -1, -1 };
14608    SmallVector<std::pair<int, int>, 4> DWordPairs;
14609    int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14610
14611    // Collect the different DWORD pairs.
14612    for (int DWord = 0; DWord != 4; ++DWord) {
14613      int M0 = Mask[2 * DWord + 0];
14614      int M1 = Mask[2 * DWord + 1];
14615      M0 = (M0 >= 0 ? M0 % 4 : M0);
14616      M1 = (M1 >= 0 ? M1 % 4 : M1);
14617      if (M0 < 0 && M1 < 0)
14618        continue;
14619
14620      bool Match = false;
14621      for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14622        auto &DWordPair = DWordPairs[j];
14623        if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14624            (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14625          DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14626          DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14627          PSHUFDMask[DWord] = DOffset + j;
14628          Match = true;
14629          break;
14630        }
14631      }
14632      if (!Match) {
14633        PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14634        DWordPairs.push_back(std::make_pair(M0, M1));
14635      }
14636    }
14637
14638    if (DWordPairs.size() <= 2) {
14639      DWordPairs.resize(2, std::make_pair(-1, -1));
14640      int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14641                              DWordPairs[1].first, DWordPairs[1].second};
14642      if ((NumHToL + NumHToH) == 0)
14643        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14644      if ((NumLToL + NumLToH) == 0)
14645        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14646    }
14647  }
14648
14649  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14650  // such inputs we can swap two of the dwords across the half mark and end up
14651  // with <=2 inputs to each half in each half. Once there, we can fall through
14652  // to the generic code below. For example:
14653  //
14654  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14655  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14656  //
14657  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14658  // and an existing 2-into-2 on the other half. In this case we may have to
14659  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14660  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14661  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14662  // because any other situation (including a 3-into-1 or 1-into-3 in the other
14663  // half than the one we target for fixing) will be fixed when we re-enter this
14664  // path. We will also combine any resulting sequence of PSHUFD instructions
14665  // into a single instruction. Here is an example of the tricky case:
14666  //
14667  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14668  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14669  //
14670  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14671  //
14672  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14673  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14674  //
14675  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14676  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14677  //
14678  // The result is fine to be handled by the generic logic.
14679  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14680                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14681                          int AOffset, int BOffset) {
14682    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14683           "Must call this with A having 3 or 1 inputs from the A half.");
14684    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14685           "Must call this with B having 1 or 3 inputs from the B half.");
14686    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14687           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14688
14689    bool ThreeAInputs = AToAInputs.size() == 3;
14690
14691    // Compute the index of dword with only one word among the three inputs in
14692    // a half by taking the sum of the half with three inputs and subtracting
14693    // the sum of the actual three inputs. The difference is the remaining
14694    // slot.
14695    int ADWord = 0, BDWord = 0;
14696    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14697    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14698    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14699    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14700    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14701    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14702    int TripleNonInputIdx =
14703        TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14704    TripleDWord = TripleNonInputIdx / 2;
14705
14706    // We use xor with one to compute the adjacent DWord to whichever one the
14707    // OneInput is in.
14708    OneInputDWord = (OneInput / 2) ^ 1;
14709
14710    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14711    // and BToA inputs. If there is also such a problem with the BToB and AToB
14712    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14713    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14714    // is essential that we don't *create* a 3<-1 as then we might oscillate.
14715    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14716      // Compute how many inputs will be flipped by swapping these DWords. We
14717      // need to balance this to ensure we don't form a 3-1 shuffle in the
14718      // other half.
14720      int NumFlippedAToBInputs =
14721          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
14722          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
14723      int NumFlippedBToBInputs =
14724          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
14725          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
14726      if ((NumFlippedAToBInputs == 1 &&
14727           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14728          (NumFlippedBToBInputs == 1 &&
14729           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14730        // We choose whether to fix the A half or B half based on whether that
14731        // half has zero flipped inputs. At zero, we may not be able to fix it
14732        // with that half. We also bias towards fixing the B half because that
14733        // will more commonly be the high half, and we have to bias one way.
14734        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14735                                                       ArrayRef<int> Inputs) {
14736          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14737          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14738          // Determine whether the free index is in the flipped dword or the
14739          // unflipped dword based on where the pinned index is. We use this bit
14740          // in an xor to conditionally select the adjacent dword.
14741          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14742          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14743          if (IsFixIdxInput == IsFixFreeIdxInput)
14744            FixFreeIdx += 1;
14745          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14746          assert(IsFixIdxInput != IsFixFreeIdxInput &&
14747                 "We need to be changing the number of flipped inputs!");
14748          int PSHUFHalfMask[] = {0, 1, 2, 3};
14749          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14750          V = DAG.getNode(
14751              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14752              MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14753              getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14754
14755          for (int &M : Mask)
14756            if (M >= 0 && M == FixIdx)
14757              M = FixFreeIdx;
14758            else if (M >= 0 && M == FixFreeIdx)
14759              M = FixIdx;
14760        };
14761        if (NumFlippedBToBInputs != 0) {
14762          int BPinnedIdx =
14763              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14764          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14765        } else {
14766          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14767          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14768          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14769        }
14770      }
14771    }
14772
14773    int PSHUFDMask[] = {0, 1, 2, 3};
14774    PSHUFDMask[ADWord] = BDWord;
14775    PSHUFDMask[BDWord] = ADWord;
14776    V = DAG.getBitcast(
14777        VT,
14778        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14779                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14780
14781    // Adjust the mask to match the new locations of A and B.
14782    for (int &M : Mask)
14783      if (M >= 0 && M/2 == ADWord)
14784        M = 2 * BDWord + M % 2;
14785      else if (M >= 0 && M/2 == BDWord)
14786        M = 2 * ADWord + M % 2;
14787
14788    // Recurse back into this routine to re-compute state now that this isn't
14789    // a 3 and 1 problem.
14790    return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14791  };
14792  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14793    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14794  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14795    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14796
14797  // At this point there are at most two inputs to the low and high halves from
14798  // each half. That means the inputs can always be grouped into dwords and
14799  // those dwords can then be moved to the correct half with a dword shuffle.
14800  // We use at most one low and one high word shuffle to collect these paired
14801  // inputs into dwords, and finally a dword shuffle to place them.
14802  int PSHUFLMask[4] = {-1, -1, -1, -1};
14803  int PSHUFHMask[4] = {-1, -1, -1, -1};
14804  int PSHUFDMask[4] = {-1, -1, -1, -1};
14805
14806  // First fix the masks for all the inputs that are staying in their
14807  // original halves. This will then dictate the targets of the cross-half
14808  // shuffles.
14809  auto fixInPlaceInputs =
14810      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14811                    MutableArrayRef<int> SourceHalfMask,
14812                    MutableArrayRef<int> HalfMask, int HalfOffset) {
14813    if (InPlaceInputs.empty())
14814      return;
14815    if (InPlaceInputs.size() == 1) {
14816      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14817          InPlaceInputs[0] - HalfOffset;
14818      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14819      return;
14820    }
14821    if (IncomingInputs.empty()) {
14822      // Just fix all of the in place inputs.
14823      for (int Input : InPlaceInputs) {
14824        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14825        PSHUFDMask[Input / 2] = Input / 2;
14826      }
14827      return;
14828    }
14829
14830    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14831    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14832        InPlaceInputs[0] - HalfOffset;
14833    // Put the second input next to the first so that they are packed into
14834    // a dword. We find the adjacent index by toggling the low bit.
14835    int AdjIndex = InPlaceInputs[0] ^ 1;
14836    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14837    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
14838    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14839  };
14840  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14841  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14842
14843  // Now gather the cross-half inputs and place them into a free dword of
14844  // their target half.
14845  // FIXME: This operation could almost certainly be simplified dramatically to
14846  // look more like the 3-1 fixing operation.
14847  auto moveInputsToRightHalf = [&PSHUFDMask](
14848      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14849      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14850      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14851      int DestOffset) {
14852    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14853      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14854    };
14855    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14856                                               int Word) {
14857      int LowWord = Word & ~1;
14858      int HighWord = Word | 1;
14859      return isWordClobbered(SourceHalfMask, LowWord) ||
14860             isWordClobbered(SourceHalfMask, HighWord);
14861    };
14862
14863    if (IncomingInputs.empty())
14864      return;
14865
14866    if (ExistingInputs.empty()) {
14867      // Map any dwords with inputs from them into the right half.
14868      for (int Input : IncomingInputs) {
14869        // If the source half mask maps over the inputs, turn those into
14870        // swaps and use the swapped lane.
14871        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14872          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14873            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14874                Input - SourceOffset;
14875            // We have to swap the uses in our half mask in one sweep.
14876            for (int &M : HalfMask)
14877              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14878                M = Input;
14879              else if (M == Input)
14880                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14881          } else {
14882            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14883                       Input - SourceOffset &&
14884                   "Previous placement doesn't match!");
14885          }
14886          // Note that this correctly re-maps both when we do a swap and when
14887          // we observe the other side of the swap above. We rely on that to
14888          // avoid swapping the members of the input list directly.
14889          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14890        }
14891
14892        // Map the input's dword into the correct half.
14893        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14894          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14895        else
14896          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14897                     Input / 2 &&
14898                 "Previous placement doesn't match!");
14899      }
14900
14901      // And just directly shift any other-half mask elements to be same-half
14902      // as we will have mirrored the dword containing the element into the
14903      // same position within that half.
14904      for (int &M : HalfMask)
14905        if (M >= SourceOffset && M < SourceOffset + 4) {
14906          M = M - SourceOffset + DestOffset;
14907          assert(M >= 0 && "This should never wrap below zero!");
14908        }
14909      return;
14910    }
14911
14912    // Ensure we have the input in a viable dword of its current half. This
14913    // is particularly tricky because the original position may be clobbered
14914    // by inputs being moved and *staying* in that half.
14915    if (IncomingInputs.size() == 1) {
14916      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14917        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14918                         SourceOffset;
14919        SourceHalfMask[InputFixed - SourceOffset] =
14920            IncomingInputs[0] - SourceOffset;
14921        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
14922                     InputFixed);
14923        IncomingInputs[0] = InputFixed;
14924      }
14925    } else if (IncomingInputs.size() == 2) {
14926      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14927          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14928        // We have two non-adjacent or clobbered inputs we need to extract from
14929        // the source half. To do this, we need to map them into some adjacent
14930        // dword slot in the source mask.
14931        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14932                              IncomingInputs[1] - SourceOffset};
14933
14934        // If there is a free slot in the source half mask adjacent to one of
14935        // the inputs, place the other input in it. We use (Index XOR 1) to
14936        // compute an adjacent index.
14937        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14938            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14939          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14940          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14941          InputsFixed[1] = InputsFixed[0] ^ 1;
14942        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14943                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14944          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14945          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14946          InputsFixed[0] = InputsFixed[1] ^ 1;
14947        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14948                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14949          // The two inputs are in the same DWord but it is clobbered and the
14950          // adjacent DWord isn't used at all. Move both inputs to the free
14951          // slot.
14952          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14953          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14954          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14955          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14956        } else {
14957          // The only way we hit this point is if there is no clobbering
14958          // (because there are no off-half inputs to this half) and there is no
14959          // free slot adjacent to one of the inputs. In this case, we have to
14960          // swap an input with a non-input.
14961          for (int i = 0; i < 4; ++i)
14962            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14963                   "We can't handle any clobbers here!");
14964          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14965                 "Cannot have adjacent inputs here!");
14966
14967          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14968          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14969
14970          // We also have to update the final source mask in this case because
14971          // it may need to undo the above swap.
14972          for (int &M : FinalSourceHalfMask)
14973            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14974              M = InputsFixed[1] + SourceOffset;
14975            else if (M == InputsFixed[1] + SourceOffset)
14976              M = (InputsFixed[0] ^ 1) + SourceOffset;
14977
14978          InputsFixed[1] = InputsFixed[0] ^ 1;
14979        }
14980
14981        // Point everything at the fixed inputs.
14982        for (int &M : HalfMask)
14983          if (M == IncomingInputs[0])
14984            M = InputsFixed[0] + SourceOffset;
14985          else if (M == IncomingInputs[1])
14986            M = InputsFixed[1] + SourceOffset;
14987
14988        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14989        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14990      }
14991    } else {
14992      llvm_unreachable("Unhandled input size!");
14993    }
14994
14995    // Now hoist the DWord down to the right half.
14996    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14997    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14998    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14999    for (int &M : HalfMask)
15000      for (int Input : IncomingInputs)
15001        if (M == Input)
15002          M = FreeDWord * 2 + Input % 2;
15003  };
15004  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15005                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
15006  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15007                        /*SourceOffset*/ 0, /*DestOffset*/ 4);
15008
15009  // Now enact all the shuffles we've computed to move the inputs into their
15010  // target half.
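  // (PSHUFLW/PSHUFHW each permute the words within one 64-bit half of the
  // register and leave the other half untouched; PSHUFD then moves whole
  // dwords between the halves.)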
15011  if (!isNoopShuffleMask(PSHUFLMask))
15012    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15013                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15014  if (!isNoopShuffleMask(PSHUFHMask))
15015    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15016                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15017  if (!isNoopShuffleMask(PSHUFDMask))
15018    V = DAG.getBitcast(
15019        VT,
15020        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15021                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15022
15023  // At this point, each half should contain all its inputs, and we can then
15024  // just shuffle them into their final position.
15025  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
15026         "Failed to lift all the high half inputs to the low mask!");
15027  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
15028         "Failed to lift all the low half inputs to the high mask!");
15029
15030  // Do a half shuffle for the low mask.
15031  if (!isNoopShuffleMask(LoMask))
15032    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15033                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15034
15035  // Do a half shuffle with the high mask after shifting its values down.
15036  for (int &M : HiMask)
15037    if (M >= 0)
15038      M -= 4;
15039  if (!isNoopShuffleMask(HiMask))
15040    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15041                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15042
15043  return V;
15044}
15045
15046/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15047/// blend if only one input is used.
15048static SDValue lowerShuffleAsBlendOfPSHUFBs(
15049    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15050    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15051  assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
15052         "Lane crossing shuffle masks not supported");
15053
15054  int NumBytes = VT.getSizeInBits() / 8;
15055  int Size = Mask.size();
15056  int Scale = NumBytes / Size;
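  // e.g. for v8i16 the mask has 8 elements but the PSHUFB control needs 16
  // bytes, so Scale == 2 and each mask element expands into two adjacent byte
  // selectors.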
15057
15058  SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15059  SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15060  V1InUse = false;
15061  V2InUse = false;
15062
15063  for (int i = 0; i < NumBytes; ++i) {
15064    int M = Mask[i / Scale];
15065    if (M < 0)
15066      continue;
15067
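    // PSHUFB zeroes a destination byte whenever bit 7 of its control byte is
    // set, so 0x80 acts as a "produce zero" selector.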
15068    const int ZeroMask = 0x80;
15069    int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15070    int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15071    if (Zeroable[i / Scale])
15072      V1Idx = V2Idx = ZeroMask;
15073
15074    V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15075    V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15076    V1InUse |= (ZeroMask != V1Idx);
15077    V2InUse |= (ZeroMask != V2Idx);
15078  }
15079
15080  MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15081  if (V1InUse)
15082    V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15083                     DAG.getBuildVector(ShufVT, DL, V1Mask));
15084  if (V2InUse)
15085    V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15086                     DAG.getBuildVector(ShufVT, DL, V2Mask));
15087
15088  // If we need shuffled inputs from both, blend the two.
15089  SDValue V;
15090  if (V1InUse && V2InUse)
15091    V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15092  else
15093    V = V1InUse ? V1 : V2;
15094
15095  // Cast the result back to the correct type.
15096  return DAG.getBitcast(VT, V);
15097}
15098
15099/// Generic lowering of 8-lane i16 shuffles.
15100///
15101/// This handles both single-input shuffles and combined shuffle/blends with
15102/// two inputs. The single input shuffles are immediately delegated to
15103/// a dedicated lowering routine.
15104///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, it tries to interleave them. Otherwise, it blends the low
/// and high halves of the inputs separately (making them have relatively few
/// inputs) and then concatenates them.
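/// e.g. the two-input mask <0,8,1,9,2,10,3,11> is an exact interleaving of
/// the low halves of the inputs and maps directly onto PUNPCKLWD.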
15111static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15112                                 const APInt &Zeroable, SDValue V1, SDValue V2,
15113                                 const X86Subtarget &Subtarget,
15114                                 SelectionDAG &DAG) {
15115  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15116  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15117  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15118
15119  // Whenever we can lower this as a zext, that instruction is strictly faster
15120  // than any alternative.
15121  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15122                                                   Zeroable, Subtarget, DAG))
15123    return ZExt;
15124
  // Try to lower using a truncation.
15126  if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15127                                        Subtarget, DAG))
15128    return V;
15129
15130  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15131
15132  if (NumV2Inputs == 0) {
15133    // Try to use shift instructions.
15134    if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15135                                            Zeroable, Subtarget, DAG))
15136      return Shift;
15137
15138    // Check for being able to broadcast a single element.
15139    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15140                                                    Mask, Subtarget, DAG))
15141      return Broadcast;
15142
15143    // Try to use bit rotation instructions.
15144    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15145                                                 Subtarget, DAG))
15146      return Rotate;
15147
15148    // Use dedicated unpack instructions for masks that match their pattern.
15149    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15150      return V;
15151
15152    // Use dedicated pack instructions for masks that match their pattern.
15153    if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15154                                         Subtarget))
15155      return V;
15156
15157    // Try to use byte rotation instructions.
15158    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15159                                                  Subtarget, DAG))
15160      return Rotate;
15161
15162    // Make a copy of the mask so it can be modified.
15163    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15164    return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15165                                               Subtarget, DAG);
15166  }
15167
15168  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15169         "All single-input shuffles should be canonicalized to be V1-input "
15170         "shuffles.");
15171
15172  // Try to use shift instructions.
15173  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15174                                          Zeroable, Subtarget, DAG))
15175    return Shift;
15176
15177  // See if we can use SSE4A Extraction / Insertion.
15178  if (Subtarget.hasSSE4A())
15179    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15180                                          Zeroable, DAG))
15181      return V;
15182
15183  // There are special ways we can lower some single-element blends.
15184  if (NumV2Inputs == 1)
15185    if (SDValue V = lowerShuffleAsElementInsertion(
15186            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15187      return V;
15188
15189  // We have different paths for blend lowering, but they all must use the
15190  // *exact* same predicate.
15191  bool IsBlendSupported = Subtarget.hasSSE41();
15192  if (IsBlendSupported)
15193    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15194                                            Zeroable, Subtarget, DAG))
15195      return Blend;
15196
15197  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15198                                             Zeroable, Subtarget, DAG))
15199    return Masked;
15200
15201  // Use dedicated unpack instructions for masks that match their pattern.
15202  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15203    return V;
15204
15205  // Use dedicated pack instructions for masks that match their pattern.
15206  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15207                                       Subtarget))
15208    return V;
15209
  // Try to lower using a truncation.
15211  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15212                                       Subtarget, DAG))
15213    return V;
15214
15215  // Try to use byte rotation instructions.
15216  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15217                                                Subtarget, DAG))
15218    return Rotate;
15219
15220  if (SDValue BitBlend =
15221          lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15222    return BitBlend;
15223
15224  // Try to use byte shift instructions to mask.
15225  if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15226                                              Zeroable, Subtarget, DAG))
15227    return V;
15228
  // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15230  // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15231  // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
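  // e.g. with NumEvenDrops == 1 every dword keeps only its low word after the
  // ANDs below, so PACKUSDW cannot saturate and simply produces words 0,2,4,6
  // of V1 followed by words 0,2,4,6 of V2.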
15232  int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
15233  if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15234      !Subtarget.hasVLX()) {
15235    SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
15236    for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15237      DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15238    SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15239    V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15240                     DWordClearMask);
15241    V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15242                     DWordClearMask);
15243    // Now pack things back together.
15244    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15245    if (NumEvenDrops == 2) {
15246      Result = DAG.getBitcast(MVT::v4i32, Result);
15247      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15248    }
15249    return Result;
15250  }
15251
15252  // Try to lower by permuting the inputs into an unpack instruction.
15253  if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15254                                                      Mask, Subtarget, DAG))
15255    return Unpack;
15256
15257  // If we can't directly blend but can use PSHUFB, that will be better as it
15258  // can both shuffle and set up the inefficient blend.
15259  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15260    bool V1InUse, V2InUse;
15261    return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15262                                        Zeroable, DAG, V1InUse, V2InUse);
15263  }
15264
15265  // We can always bit-blend if we have to so the fallback strategy is to
15266  // decompose into single-input permutes and blends/unpacks.
15267  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15268                                              Mask, Subtarget, DAG);
15269}
15270
// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
// the active subvector is extracted.
15274static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15275                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
15276                                     const X86Subtarget &Subtarget,
15277                                     SelectionDAG &DAG) {
15278  MVT MaskVT = VT.changeTypeToInteger();
15279  SDValue MaskNode;
15280  MVT ShuffleVT = VT;
15281  if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15282    V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15283    V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15284    ShuffleVT = V1.getSimpleValueType();
15285
15286    // Adjust mask to correct indices for the second input.
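    // e.g. a v8i16 shuffle widened to v32i16 has Scale == 4, so an original
    // V2 index of 8 becomes 8 + 3 * 8 == 32 in the VPERMV3 index space.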
15287    int NumElts = VT.getVectorNumElements();
15288    unsigned Scale = 512 / VT.getSizeInBits();
15289    SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15290    for (int &M : AdjustedMask)
15291      if (NumElts <= M)
15292        M += (Scale - 1) * NumElts;
15293    MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15294    MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15295  } else {
15296    MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15297  }
15298
15299  SDValue Result;
15300  if (V2.isUndef())
15301    Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15302  else
15303    Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15304
15305  if (VT != ShuffleVT)
15306    Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15307
15308  return Result;
15309}
15310
15311/// Generic lowering of v16i8 shuffles.
15312///
15313/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15314/// detect any complexity reducing interleaving. If that doesn't help, it uses
15315/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15316/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15317/// back together.
15318static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15319                                 const APInt &Zeroable, SDValue V1, SDValue V2,
15320                                 const X86Subtarget &Subtarget,
15321                                 SelectionDAG &DAG) {
15322  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15323  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15324  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15325
15326  // Try to use shift instructions.
15327  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15328                                          Zeroable, Subtarget, DAG))
15329    return Shift;
15330
15331  // Try to use byte rotation instructions.
15332  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15333                                                Subtarget, DAG))
15334    return Rotate;
15335
15336  // Use dedicated pack instructions for masks that match their pattern.
15337  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15338                                       Subtarget))
15339    return V;
15340
15341  // Try to use a zext lowering.
15342  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15343                                                   Zeroable, Subtarget, DAG))
15344    return ZExt;
15345
  // Try to lower using a truncation.
15347  if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15348                                        Subtarget, DAG))
15349    return V;
15350
15351  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15352                                       Subtarget, DAG))
15353    return V;
15354
15355  // See if we can use SSE4A Extraction / Insertion.
15356  if (Subtarget.hasSSE4A())
15357    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15358                                          Zeroable, DAG))
15359      return V;
15360
15361  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15362
15363  // For single-input shuffles, there are some nicer lowering tricks we can use.
15364  if (NumV2Elements == 0) {
15365    // Check for being able to broadcast a single element.
15366    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15367                                                    Mask, Subtarget, DAG))
15368      return Broadcast;
15369
15370    // Try to use bit rotation instructions.
15371    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15372                                                 Subtarget, DAG))
15373      return Rotate;
15374
15375    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15376      return V;
15377
15378    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15379    // Notably, this handles splat and partial-splat shuffles more efficiently.
15380    // However, it only makes sense if the pre-duplication shuffle simplifies
15381    // things significantly. Currently, this means we need to be able to
15382    // express the pre-duplication shuffle as an i16 shuffle.
15383    //
15384    // FIXME: We should check for other patterns which can be widened into an
15385    // i16 shuffle as well.
15386    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15387      for (int i = 0; i < 16; i += 2)
15388        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15389          return false;
15390
15391      return true;
15392    };
15393    auto tryToWidenViaDuplication = [&]() -> SDValue {
15394      if (!canWidenViaDuplication(Mask))
15395        return SDValue();
15396      SmallVector<int, 4> LoInputs;
15397      copy_if(Mask, std::back_inserter(LoInputs),
15398              [](int M) { return M >= 0 && M < 8; });
15399      array_pod_sort(LoInputs.begin(), LoInputs.end());
15400      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
15401                     LoInputs.end());
15402      SmallVector<int, 4> HiInputs;
15403      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15404      array_pod_sort(HiInputs.begin(), HiInputs.end());
15405      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
15406                     HiInputs.end());
15407
15408      bool TargetLo = LoInputs.size() >= HiInputs.size();
15409      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15410      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15411
15412      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15413      SmallDenseMap<int, int, 8> LaneMap;
15414      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;
15416        LaneMap[I] = I;
15417      }
15418      int j = TargetLo ? 0 : 4, je = j + 4;
15419      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15420        // Check if j is already a shuffle of this input. This happens when
15421        // there are two adjacent bytes after we move the low one.
15422        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15423          // If we haven't yet mapped the input, search for a slot into which
15424          // we can map it.
15425          while (j < je && PreDupI16Shuffle[j] >= 0)
15426            ++j;
15427
15428          if (j == je)
            // We can't place the inputs into a single half with a simple
            // i16 shuffle, so bail.
15430            return SDValue();
15431
15432          // Map this input with the i16 shuffle.
15433          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15434        }
15435
15436        // Update the lane map based on the mapping we ended up with.
15437        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15438      }
15439      V1 = DAG.getBitcast(
15440          MVT::v16i8,
15441          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15442                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15443
15444      // Unpack the bytes to form the i16s that will be shuffled into place.
15445      bool EvenInUse = false, OddInUse = false;
15446      for (int i = 0; i < 16; i += 2) {
15447        EvenInUse |= (Mask[i + 0] >= 0);
15448        OddInUse |= (Mask[i + 1] >= 0);
15449        if (EvenInUse && OddInUse)
15450          break;
15451      }
15452      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15453                       MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15454                       OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15455
15456      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15457      for (int i = 0; i < 16; ++i)
15458        if (Mask[i] >= 0) {
15459          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15460          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15461          if (PostDupI16Shuffle[i / 2] < 0)
15462            PostDupI16Shuffle[i / 2] = MappedMask;
15463          else
15464            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15465                   "Conflicting entries in the original shuffle!");
15466        }
15467      return DAG.getBitcast(
15468          MVT::v16i8,
15469          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15470                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15471    };
15472    if (SDValue V = tryToWidenViaDuplication())
15473      return V;
15474  }
15475
15476  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15477                                             Zeroable, Subtarget, DAG))
15478    return Masked;
15479
15480  // Use dedicated unpack instructions for masks that match their pattern.
15481  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15482    return V;
15483
15484  // Try to use byte shift instructions to mask.
15485  if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15486                                              Zeroable, Subtarget, DAG))
15487    return V;
15488
15489  // Check for compaction patterns.
15490  bool IsSingleInput = V2.isUndef();
15491  int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
15492
15493  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15494  // with PSHUFB. It is important to do this before we attempt to generate any
15495  // blends but after all of the single-input lowerings. If the single input
15496  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15497  // want to preserve that and we can DAG combine any longer sequences into
15498  // a PSHUFB in the end. But once we start blending from multiple inputs,
15499  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15500  // and there are *very* few patterns that would actually be faster than the
15501  // PSHUFB approach because of its ability to zero lanes.
15502  //
15503  // If the mask is a binary compaction, we can more efficiently perform this
15504  // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15505  //
15506  // FIXME: The only exceptions to the above are blends which are exact
15507  // interleavings with direct instructions supporting them. We currently don't
15508  // handle those well here.
15509  if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15510    bool V1InUse = false;
15511    bool V2InUse = false;
15512
15513    SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
15514        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15515
15516    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15517    // do so. This avoids using them to handle blends-with-zero which is
15518    // important as a single pshufb is significantly faster for that.
15519    if (V1InUse && V2InUse) {
15520      if (Subtarget.hasSSE41())
15521        if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15522                                                Zeroable, Subtarget, DAG))
15523          return Blend;
15524
15525      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very slightly) more efficient, we
      // prefer this lowering because there are common cases where part of
15528      // the complexity of the shuffles goes away when we do the final blend as
15529      // an unpack.
15530      // FIXME: It might be worth trying to detect if the unpack-feeding
15531      // shuffles will both be pshufb, in which case we shouldn't bother with
15532      // this.
15533      if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
15534              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15535        return Unpack;
15536
15537      // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15538      if (Subtarget.hasVBMI())
15539        return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15540                                     DAG);
15541
15542      // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15543      if (Subtarget.hasXOP()) {
15544        SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15545        return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15546      }
15547
15548      // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15549      // PALIGNR will be cheaper than the second PSHUFB+OR.
15550      if (SDValue V = lowerShuffleAsByteRotateAndPermute(
15551              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15552        return V;
15553    }
15554
15555    return PSHUFB;
15556  }
15557
15558  // There are special ways we can lower some single-element blends.
15559  if (NumV2Elements == 1)
15560    if (SDValue V = lowerShuffleAsElementInsertion(
15561            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15562      return V;
15563
15564  if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15565    return Blend;
15566
15567  // Check whether a compaction lowering can be done. This handles shuffles
15568  // which take every Nth element for some even N. See the helper function for
15569  // details.
15570  //
15571  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
15573  // rearranging bytes to truncate wide elements.
15574  if (NumEvenDrops) {
15575    // NumEvenDrops is the power of two stride of the elements. Another way of
15576    // thinking about it is that we need to drop the even elements this many
15577    // times to get the original input.
15578
15579    // First we need to zero all the dropped bytes.
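    // e.g. with NumEvenDrops == 1 every word keeps only its low byte, so the
    // PACKUS below cannot saturate and yields bytes 0,2,...,14 of each input.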
15580    assert(NumEvenDrops <= 3 &&
15581           "No support for dropping even elements more than 3 times.");
15582    SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15583    for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15584      WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15585    SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15586    V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15587                     WordClearMask);
15588    if (!IsSingleInput)
15589      V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15590                       WordClearMask);
15591
15592    // Now pack things back together.
15593    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15594                                 IsSingleInput ? V1 : V2);
15595    for (int i = 1; i < NumEvenDrops; ++i) {
15596      Result = DAG.getBitcast(MVT::v8i16, Result);
15597      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15598    }
15599    return Result;
15600  }
15601
15602  // Handle multi-input cases by blending/unpacking single-input shuffles.
15603  if (NumV2Elements > 0)
15604    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15605                                                Subtarget, DAG);
15606
15607  // The fallback path for single-input shuffles widens this into two v8i16
15608  // vectors with unpacks, shuffles those, and then pulls them back together
15609  // with a pack.
15610  SDValue V = V1;
15611
15612  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15613  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15614  for (int i = 0; i < 16; ++i)
15615    if (Mask[i] >= 0)
15616      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15617
15618  SDValue VLoHalf, VHiHalf;
15619  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15620  // them out and avoid using UNPCK{L,H} to extract the elements of V as
15621  // i16s.
15622  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15623      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15624    // Use a mask to drop the high bytes.
15625    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15626    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15627                          DAG.getConstant(0x00FF, DL, MVT::v8i16));
15628
15629    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15630    VHiHalf = DAG.getUNDEF(MVT::v8i16);
15631
15632    // Squash the masks to point directly into VLoHalf.
15633    for (int &M : LoBlendMask)
15634      if (M >= 0)
15635        M /= 2;
15636    for (int &M : HiBlendMask)
15637      if (M >= 0)
15638        M /= 2;
15639  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
15642    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15643
15644    VLoHalf = DAG.getBitcast(
15645        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15646    VHiHalf = DAG.getBitcast(
15647        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15648  }
15649
  SDValue LoV =
      DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
  SDValue HiV =
      DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15652
15653  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15654}
15655
15656/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15657///
15658/// This routine breaks down the specific type of 128-bit shuffle and
15659/// dispatches to the lowering routines accordingly.
15660static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15661                                  MVT VT, SDValue V1, SDValue V2,
15662                                  const APInt &Zeroable,
15663                                  const X86Subtarget &Subtarget,
15664                                  SelectionDAG &DAG) {
15665  switch (VT.SimpleTy) {
15666  case MVT::v2i64:
15667    return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15668  case MVT::v2f64:
15669    return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15670  case MVT::v4i32:
15671    return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15672  case MVT::v4f32:
15673    return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15674  case MVT::v8i16:
15675    return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15676  case MVT::v16i8:
15677    return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15678
15679  default:
15680    llvm_unreachable("Unimplemented!");
15681  }
15682}
15683
15684/// Generic routine to split vector shuffle into half-sized shuffles.
15685///
15686/// This routine just extracts two subvectors, shuffles them independently, and
15687/// then concatenates them back together. This should work effectively with all
15688/// AVX vector shuffle types.
15689static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15690                                    SDValue V2, ArrayRef<int> Mask,
15691                                    SelectionDAG &DAG) {
15692  assert(VT.getSizeInBits() >= 256 &&
15693         "Only for 256-bit or wider vector shuffles!");
15694  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15695  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15696
15697  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15698  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15699
15700  int NumElements = VT.getVectorNumElements();
15701  int SplitNumElements = NumElements / 2;
15702  MVT ScalarVT = VT.getVectorElementType();
15703  MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15704
15705  // Use splitVector/extractSubVector so that split build-vectors just build two
15706  // narrower build vectors. This helps shuffling with splats and zeros.
15707  auto SplitVector = [&](SDValue V) {
15708    SDValue LoV, HiV;
15709    std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15710    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15711                          DAG.getBitcast(SplitVT, HiV));
15712  };
15713
15714  SDValue LoV1, HiV1, LoV2, HiV2;
15715  std::tie(LoV1, HiV1) = SplitVector(V1);
15716  std::tie(LoV2, HiV2) = SplitVector(V2);
15717
15718  // Now create two 4-way blends of these half-width vectors.
15719  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15720    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
15721    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15722    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15723    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15724    for (int i = 0; i < SplitNumElements; ++i) {
15725      int M = HalfMask[i];
15726      if (M >= NumElements) {
15727        if (M >= NumElements + SplitNumElements)
15728          UseHiV2 = true;
15729        else
15730          UseLoV2 = true;
15731        V2BlendMask[i] = M - NumElements;
15732        BlendMask[i] = SplitNumElements + i;
15733      } else if (M >= 0) {
15734        if (M >= SplitNumElements)
15735          UseHiV1 = true;
15736        else
15737          UseLoV1 = true;
15738        V1BlendMask[i] = M;
15739        BlendMask[i] = i;
15740      }
15741    }
15742
15743    // Because the lowering happens after all combining takes place, we need to
15744    // manually combine these blend masks as much as possible so that we create
15745    // a minimal number of high-level vector shuffle nodes.
15746
15747    // First try just blending the halves of V1 or V2.
15748    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15749      return DAG.getUNDEF(SplitVT);
15750    if (!UseLoV2 && !UseHiV2)
15751      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15752    if (!UseLoV1 && !UseHiV1)
15753      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15754
15755    SDValue V1Blend, V2Blend;
15756    if (UseLoV1 && UseHiV1) {
15757      V1Blend =
15758        DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15759    } else {
15760      // We only use half of V1 so map the usage down into the final blend mask.
15761      V1Blend = UseLoV1 ? LoV1 : HiV1;
15762      for (int i = 0; i < SplitNumElements; ++i)
15763        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15764          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15765    }
15766    if (UseLoV2 && UseHiV2) {
15767      V2Blend =
15768        DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15769    } else {
15770      // We only use half of V2 so map the usage down into the final blend mask.
15771      V2Blend = UseLoV2 ? LoV2 : HiV2;
15772      for (int i = 0; i < SplitNumElements; ++i)
15773        if (BlendMask[i] >= SplitNumElements)
15774          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15775    }
15776    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15777  };
15778  SDValue Lo = HalfBlend(LoMask);
15779  SDValue Hi = HalfBlend(HiMask);
15780  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15781}
15782
15783/// Either split a vector in halves or decompose the shuffles and the
15784/// blend/unpack.
15785///
15786/// This is provided as a good fallback for many lowerings of non-single-input
15787/// shuffles with more than one 128-bit lane. In those cases, we want to select
15788/// between splitting the shuffle into 128-bit components and stitching those
15789/// back together vs. extracting the single-input shuffles and blending those
15790/// results.
15791static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15792                                          SDValue V2, ArrayRef<int> Mask,
15793                                          const X86Subtarget &Subtarget,
15794                                          SelectionDAG &DAG) {
15795  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15796         "shuffles as it could then recurse on itself.");
15797  int Size = Mask.size();
15798
15799  // If this can be modeled as a broadcast of two elements followed by a blend,
15800  // prefer that lowering. This is especially important because broadcasts can
15801  // often fold with memory operands.
15802  auto DoBothBroadcast = [&] {
15803    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15804    for (int M : Mask)
15805      if (M >= Size) {
15806        if (V2BroadcastIdx < 0)
15807          V2BroadcastIdx = M - Size;
15808        else if (M - Size != V2BroadcastIdx)
15809          return false;
15810      } else if (M >= 0) {
15811        if (V1BroadcastIdx < 0)
15812          V1BroadcastIdx = M;
15813        else if (M != V1BroadcastIdx)
15814          return false;
15815      }
15816    return true;
15817  };
15818  if (DoBothBroadcast())
15819    return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15820                                                DAG);
15821
15822  // If the inputs all stem from a single 128-bit lane of each input, then we
15823  // split them rather than blending because the split will decompose to
15824  // unusually few instructions.
15825  int LaneCount = VT.getSizeInBits() / 128;
15826  int LaneSize = Size / LaneCount;
15827  SmallBitVector LaneInputs[2];
15828  LaneInputs[0].resize(LaneCount, false);
15829  LaneInputs[1].resize(LaneCount, false);
15830  for (int i = 0; i < Size; ++i)
15831    if (Mask[i] >= 0)
15832      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15833  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15834    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15835
15836  // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15837  // requires that the decomposed single-input shuffles don't end up here.
15838  return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15839                                              DAG);
15840}
15841
15842// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15843// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15844static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15845                                                 SDValue V1, SDValue V2,
15846                                                 ArrayRef<int> Mask,
15847                                                 SelectionDAG &DAG) {
15848  assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15849
15850  int LHSMask[4] = {-1, -1, -1, -1};
15851  int RHSMask[4] = {-1, -1, -1, -1};
15852  unsigned SHUFPMask = 0;
15853
15854  // As SHUFPD uses a single LHS/RHS element per lane, we can always
15855  // perform the shuffle once the lanes have been shuffled in place.
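  // (VSHUFPD builds destination element 2*l+0 from the LHS and 2*l+1 from the
  // RHS of 128-bit lane l; immediate bit i selects the low (0) or high (1)
  // double of that lane.)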
15856  for (int i = 0; i != 4; ++i) {
15857    int M = Mask[i];
15858    if (M < 0)
15859      continue;
15860    int LaneBase = i & ~1;
15861    auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15862    LaneMask[LaneBase + (M & 1)] = M;
15863    SHUFPMask |= (M & 1) << i;
15864  }
15865
15866  SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15867  SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15868  return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15869                     DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
15870}
15871
15872/// Lower a vector shuffle crossing multiple 128-bit lanes as
15873/// a lane permutation followed by a per-lane permutation.
15874///
15875/// This is mainly for cases where we can have non-repeating permutes
15876/// in each lane.
15877///
15878/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15879/// we should investigate merging them.
15880static SDValue lowerShuffleAsLanePermuteAndPermute(
15881    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15882    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15883  int NumElts = VT.getVectorNumElements();
15884  int NumLanes = VT.getSizeInBits() / 128;
15885  int NumEltsPerLane = NumElts / NumLanes;
15886  bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15887
15888  /// Attempts to find a sublane permute with the given size
15889  /// that gets all elements into their target lanes.
15890  ///
  /// If successful, returns the cross-lane shuffle followed by the in-lane
  /// shuffle; otherwise returns SDValue().
15893  auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15894    int NumSublanesPerLane = NumSublanes / NumLanes;
15895    int NumEltsPerSublane = NumElts / NumSublanes;
15896
15897    SmallVector<int, 16> CrossLaneMask;
15898    SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15899    // CrossLaneMask but one entry == one sublane.
15900    SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15901
15902    for (int i = 0; i != NumElts; ++i) {
15903      int M = Mask[i];
15904      if (M < 0)
15905        continue;
15906
15907      int SrcSublane = M / NumEltsPerSublane;
15908      int DstLane = i / NumEltsPerLane;
15909
15910      // We only need to get the elements into the right lane, not sublane.
15911      // So search all sublanes that make up the destination lane.
15912      bool Found = false;
15913      int DstSubStart = DstLane * NumSublanesPerLane;
15914      int DstSubEnd = DstSubStart + NumSublanesPerLane;
15915      for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15916        if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15917          continue;
15918
15919        Found = true;
15920        CrossLaneMaskLarge[DstSublane] = SrcSublane;
15921        int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15922        InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15923        break;
15924      }
15925      if (!Found)
15926        return SDValue();
15927    }
15928
15929    // Fill CrossLaneMask using CrossLaneMaskLarge.
15930    narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15931
15932    if (!CanUseSublanes) {
15933      // If we're only shuffling a single lowest lane and the rest are identity
15934      // then don't bother.
15935      // TODO - isShuffleMaskInputInPlace could be extended to something like
15936      // this.
15937      int NumIdentityLanes = 0;
15938      bool OnlyShuffleLowestLane = true;
15939      for (int i = 0; i != NumLanes; ++i) {
15940        int LaneOffset = i * NumEltsPerLane;
15941        if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15942                                       i * NumEltsPerLane))
15943          NumIdentityLanes++;
15944        else if (CrossLaneMask[LaneOffset] != 0)
15945          OnlyShuffleLowestLane = false;
15946      }
15947      if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15948        return SDValue();
15949    }
15950
15951    SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15952    return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15953                                InLaneMask);
15954  };
15955
15956  // First attempt a solution with full lanes.
15957  if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15958    return V;
15959
15960  // The rest of the solutions use sublanes.
15961  if (!CanUseSublanes)
15962    return SDValue();
15963
15964  // Then attempt a solution with 64-bit sublanes (vpermq).
15965  if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15966    return V;
15967
15968  // If that doesn't work and we have fast variable shuffle,
15969  // attempt 32-bit sublanes (vpermd).
15970  if (!Subtarget.hasFastVariableShuffle())
15971    return SDValue();
15972
15973  return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15974}
15975
15976/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15977/// source with a lane permutation.
15978///
15979/// This lowering strategy results in four instructions in the worst case for a
15980/// single-input cross lane shuffle which is lower than any other fully general
15981/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15982/// shuffle pattern should be handled prior to trying this lowering.
15983static SDValue lowerShuffleAsLanePermuteAndShuffle(
15984    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15985    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15986  // FIXME: This should probably be generalized for 512-bit vectors as well.
15987  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15988  int Size = Mask.size();
15989  int LaneSize = Size / 2;
15990
15991  // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15992  // Only do this if the elements aren't all from the lower lane,
15993  // otherwise we're (probably) better off doing a split.
15994  if (VT == MVT::v4f64 &&
15995      !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15996    if (SDValue V =
15997            lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
15998      return V;
15999
16000  // If there are only inputs from one 128-bit lane, splitting will in fact be
16001  // less expensive. The flags track whether the given lane contains an element
16002  // that crosses to another lane.
16003  if (!Subtarget.hasAVX2()) {
16004    bool LaneCrossing[2] = {false, false};
16005    for (int i = 0; i < Size; ++i)
16006      if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16007        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16008    if (!LaneCrossing[0] || !LaneCrossing[1])
16009      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16010  } else {
16011    bool LaneUsed[2] = {false, false};
16012    for (int i = 0; i < Size; ++i)
16013      if (Mask[i] >= 0)
16014        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16015    if (!LaneUsed[0] || !LaneUsed[1])
16016      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16017  }
16018
16019  // TODO - we could support shuffling V2 in the Flipped input.
16020  assert(V2.isUndef() &&
16021         "This last part of this routine only works on single input shuffles");
16022
16023  SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16024  for (int i = 0; i < Size; ++i) {
16025    int &M = InLaneMask[i];
16026    if (M < 0)
16027      continue;
16028    if (((M % Size) / LaneSize) != (i / LaneSize))
16029      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16030  }
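  // Any element that has to cross a lane is now sourced from the lane-swapped
  // copy (operand index >= Size), so InLaneMask is in-lane by construction.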
16031  assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16032         "In-lane shuffle mask expected");
16033
16034  // Flip the lanes, and shuffle the results which should now be in-lane.
16035  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16036  SDValue Flipped = DAG.getBitcast(PVT, V1);
16037  Flipped =
16038      DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16039  Flipped = DAG.getBitcast(VT, Flipped);
16040  return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16041}
16042
16043/// Handle lowering 2-lane 128-bit shuffles.
16044static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16045                                  SDValue V2, ArrayRef<int> Mask,
16046                                  const APInt &Zeroable,
16047                                  const X86Subtarget &Subtarget,
16048                                  SelectionDAG &DAG) {
16049  // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16050  if (Subtarget.hasAVX2() && V2.isUndef())
16051    return SDValue();
16052
16053  bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16054
16055  SmallVector<int, 4> WidenedMask;
16056  if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16057    return SDValue();
16058
16059  bool IsLowZero = (Zeroable & 0x3) == 0x3;
16060  bool IsHighZero = (Zeroable & 0xc) == 0xc;
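  // Zeroable has one bit per element of the original 4-element mask, so 0x3
  // and 0xc test whether the low or high 128-bit half is entirely zeroable.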
16061
16062  // Try to use an insert into a zero vector.
16063  if (WidenedMask[0] == 0 && IsHighZero) {
16064    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16065    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16066                              DAG.getIntPtrConstant(0, DL));
16067    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16068                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
16069                       DAG.getIntPtrConstant(0, DL));
16070  }
16071
  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.
16075
16076  // Blends are faster and handle all the non-lane-crossing cases.
16077  if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16078                                          Subtarget, DAG))
16079    return Blend;
16080
16081  // If either input operand is a zero vector, use VPERM2X128 because its mask
16082  // allows us to replace the zero input with an implicit zero.
16083  if (!IsLowZero && !IsHighZero) {
16084    // Check for patterns which can be matched with a single insert of a 128-bit
16085    // subvector.
16086    bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16087    if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16088
16089      // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16090      // this will likely become vinsertf128 which can't fold a 256-bit memop.
16091      if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16092        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16093        SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16094                                     OnlyUsesV1 ? V1 : V2,
16095                                     DAG.getIntPtrConstant(0, DL));
16096        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16097                           DAG.getIntPtrConstant(2, DL));
16098      }
16099    }
16100
16101    // Try to use SHUF128 if possible.
16102    if (Subtarget.hasVLX()) {
16103      if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16104        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16105                            ((WidenedMask[1] % 2) << 1);
16106        return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16107                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
16108      }
16109    }
16110  }
16111
16112  // Otherwise form a 128-bit permutation. After accounting for undefs,
16113  // convert the 64-bit shuffle mask selection values into 128-bit
16114  // selection bits by dividing the indexes by 2 and shifting into positions
16115  // defined by a vperm2*128 instruction's immediate control byte.
16116
16117  // The immediate permute control byte looks like this:
16118  //    [1:0] - select 128 bits from sources for low half of destination
16119  //    [2]   - ignore
16120  //    [3]   - zero low half of destination
16121  //    [5:4] - select 128 bits from sources for high half of destination
16122  //    [6]   - ignore
16123  //    [7]   - zero high half of destination
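  //    e.g. a widened mask of {1, 2} encodes as 0x21: the low half of the
  //    result comes from V1's high 128 bits and the high half from V2's low
  //    128 bits.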
16124
16125  assert((WidenedMask[0] >= 0 || IsLowZero) &&
16126         (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16127
16128  unsigned PermMask = 0;
16129  PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
16130  PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16131
16132  // Check the immediate mask and replace unused sources with undef.
16133  if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16134    V1 = DAG.getUNDEF(VT);
16135  if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16136    V2 = DAG.getUNDEF(VT);
16137
16138  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16139                     DAG.getTargetConstant(PermMask, DL, MVT::i8));
16140}
16141
16142/// Lower a vector shuffle by first fixing the 128-bit lanes and then
16143/// shuffling each lane.
16144///
16145/// This attempts to create a repeated lane shuffle where each lane uses one
16146/// or two of the lanes of the inputs. The lanes of the input vectors are
16147/// shuffled in one or two independent shuffles to get the lanes into the
16148/// position needed by the final shuffle.
16149static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16150    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16151    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16152  assert(!V2.isUndef() && "This is only useful with multiple inputs.");
16153
16154  if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16155    return SDValue();
16156
16157  int NumElts = Mask.size();
16158  int NumLanes = VT.getSizeInBits() / 128;
16159  int NumLaneElts = 128 / VT.getScalarSizeInBits();
16160  SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16161  SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16162
16163  // First pass will try to fill in the RepeatMask from lanes that need two
16164  // sources.
16165  for (int Lane = 0; Lane != NumLanes; ++Lane) {
16166    int Srcs[2] = {-1, -1};
16167    SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16168    for (int i = 0; i != NumLaneElts; ++i) {
16169      int M = Mask[(Lane * NumLaneElts) + i];
16170      if (M < 0)
16171        continue;
16172      // Determine which of the possible input lanes (NumLanes from each source)
16173      // this element comes from. Assign that as one of the sources for this
      // lane. We can assign up to 2 sources for this lane. If we run out of
      // sources we can't do anything.
16176      int LaneSrc = M / NumLaneElts;
16177      int Src;
16178      if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16179        Src = 0;
16180      else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16181        Src = 1;
16182      else
16183        return SDValue();
16184
16185      Srcs[Src] = LaneSrc;
16186      InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16187    }
16188
16189    // If this lane has two sources, see if it fits with the repeat mask so far.
16190    if (Srcs[1] < 0)
16191      continue;
16192
16193    LaneSrcs[Lane][0] = Srcs[0];
16194    LaneSrcs[Lane][1] = Srcs[1];
16195
16196    auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16197      assert(M1.size() == M2.size() && "Unexpected mask size");
16198      for (int i = 0, e = M1.size(); i != e; ++i)
16199        if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16200          return false;
16201      return true;
16202    };
16203
16204    auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16205      assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16206      for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16207        int M = Mask[i];
16208        if (M < 0)
16209          continue;
16210        assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16211               "Unexpected mask element");
16212        MergedMask[i] = M;
16213      }
16214    };
16215
16216    if (MatchMasks(InLaneMask, RepeatMask)) {
16217      // Merge this lane mask into the final repeat mask.
16218      MergeMasks(InLaneMask, RepeatMask);
16219      continue;
16220    }
16221
16222    // Didn't find a match. Swap the operands and try again.
16223    std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16224    ShuffleVectorSDNode::commuteMask(InLaneMask);
16225
16226    if (MatchMasks(InLaneMask, RepeatMask)) {
16227      // Merge this lane mask into the final repeat mask.
16228      MergeMasks(InLaneMask, RepeatMask);
16229      continue;
16230    }
16231
16232    // Couldn't find a match with the operands in either order.
16233    return SDValue();
16234  }
16235
16236  // Now handle any lanes with only one source.
16237  for (int Lane = 0; Lane != NumLanes; ++Lane) {
16238    // If this lane has already been processed, skip it.
16239    if (LaneSrcs[Lane][0] >= 0)
16240      continue;
16241
16242    for (int i = 0; i != NumLaneElts; ++i) {
16243      int M = Mask[(Lane * NumLaneElts) + i];
16244      if (M < 0)
16245        continue;
16246
16247      // If RepeatMask isn't defined yet we can define it ourselves.
16248      if (RepeatMask[i] < 0)
16249        RepeatMask[i] = M % NumLaneElts;
16250
16251      if (RepeatMask[i] < NumElts) {
16252        if (RepeatMask[i] != M % NumLaneElts)
16253          return SDValue();
16254        LaneSrcs[Lane][0] = M / NumLaneElts;
16255      } else {
16256        if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16257          return SDValue();
16258        LaneSrcs[Lane][1] = M / NumLaneElts;
16259      }
16260    }
16261
16262    if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16263      return SDValue();
16264  }
16265
16266  SmallVector<int, 16> NewMask(NumElts, -1);
16267  for (int Lane = 0; Lane != NumLanes; ++Lane) {
16268    int Src = LaneSrcs[Lane][0];
16269    for (int i = 0; i != NumLaneElts; ++i) {
16270      int M = -1;
16271      if (Src >= 0)
16272        M = Src * NumLaneElts + i;
16273      NewMask[Lane * NumLaneElts + i] = M;
16274    }
16275  }
16276  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16277  // Ensure we didn't get back the shuffle we started with.
16278  // FIXME: This is a hack to make up for some splat handling code in
16279  // getVectorShuffle.
16280  if (isa<ShuffleVectorSDNode>(NewV1) &&
16281      cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16282    return SDValue();
16283
16284  for (int Lane = 0; Lane != NumLanes; ++Lane) {
16285    int Src = LaneSrcs[Lane][1];
16286    for (int i = 0; i != NumLaneElts; ++i) {
16287      int M = -1;
16288      if (Src >= 0)
16289        M = Src * NumLaneElts + i;
16290      NewMask[Lane * NumLaneElts + i] = M;
16291    }
16292  }
16293  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16294  // Ensure we didn't get back the shuffle we started with.
16295  // FIXME: This is a hack to make up for some splat handling code in
16296  // getVectorShuffle.
16297  if (isa<ShuffleVectorSDNode>(NewV2) &&
16298      cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16299    return SDValue();
16300
16301  for (int i = 0; i != NumElts; ++i) {
16302    NewMask[i] = RepeatMask[i % NumLaneElts];
16303    if (NewMask[i] < 0)
16304      continue;
16305
16306    NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16307  }
16308  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16309}
16310
16311/// If the input shuffle mask results in a vector that is undefined in all
16312/// upper or all lower half elements and that mask accesses only 2 halves of
16313/// the shuffle's operands, return true. A mask of half the width, with mask
16314/// indexes adjusted to access the extracted halves of the original shuffle
16315/// operands, is returned in HalfMask. HalfIdx1 and HalfIdx2 identify the
16316/// accessed input halves (0 = lo V1, 1 = hi V1, 2 = lo V2, 3 = hi V2).
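/// As an illustrative example, given a v8i32-style mask
/// <u, u, u, u, 1, 2, 9, 10> the lower half of the result is undef, HalfMask
/// becomes <1, 2, 5, 6>, HalfIdx1 = 0 (lower half of V1) and HalfIdx2 = 2
/// (lower half of V2).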
16317static bool
16318getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16319                   int &HalfIdx1, int &HalfIdx2) {
16320  assert((Mask.size() == HalfMask.size() * 2) &&
16321         "Expected input mask to be twice as long as output");
16322
16323  // Exactly one half of the result must be undef to allow narrowing.
16324  bool UndefLower = isUndefLowerHalf(Mask);
16325  bool UndefUpper = isUndefUpperHalf(Mask);
16326  if (UndefLower == UndefUpper)
16327    return false;
16328
16329  unsigned HalfNumElts = HalfMask.size();
16330  unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16331  HalfIdx1 = -1;
16332  HalfIdx2 = -1;
16333  for (unsigned i = 0; i != HalfNumElts; ++i) {
16334    int M = Mask[i + MaskIndexOffset];
16335    if (M < 0) {
16336      HalfMask[i] = M;
16337      continue;
16338    }
16339
16340    // Determine which of the 4 half vectors this element is from.
16341    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16342    int HalfIdx = M / HalfNumElts;
16343
16344    // Determine the element index into its half vector source.
16345    int HalfElt = M % HalfNumElts;
16346
16347    // We can shuffle with up to 2 half vectors, set the new 'half'
16348    // shuffle mask accordingly.
16349    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16350      HalfMask[i] = HalfElt;
16351      HalfIdx1 = HalfIdx;
16352      continue;
16353    }
16354    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16355      HalfMask[i] = HalfElt + HalfNumElts;
16356      HalfIdx2 = HalfIdx;
16357      continue;
16358    }
16359
16360    // Too many half vectors referenced.
16361    return false;
16362  }
16363
16364  return true;
16365}
16366
16367/// Given the output values from getHalfShuffleMask(), create a half width
16368/// shuffle of extracted vectors followed by an insert back to full width.
16369static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
16370                                     ArrayRef<int> HalfMask, int HalfIdx1,
16371                                     int HalfIdx2, bool UndefLower,
16372                                     SelectionDAG &DAG, bool UseConcat = false) {
16373  assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
16374  assert(V1.getValueType().isSimple() && "Expecting only simple types");
16375
16376  MVT VT = V1.getSimpleValueType();
16377  MVT HalfVT = VT.getHalfNumVectorElementsVT();
16378  unsigned HalfNumElts = HalfVT.getVectorNumElements();
16379
16380  auto getHalfVector = [&](int HalfIdx) {
16381    if (HalfIdx < 0)
16382      return DAG.getUNDEF(HalfVT);
16383    SDValue V = (HalfIdx < 2 ? V1 : V2);
16384    HalfIdx = (HalfIdx % 2) * HalfNumElts;
16385    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16386                       DAG.getIntPtrConstant(HalfIdx, DL));
16387  };
16388
16389  // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16390  SDValue Half1 = getHalfVector(HalfIdx1);
16391  SDValue Half2 = getHalfVector(HalfIdx2);
16392  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16393  if (UseConcat) {
16394    SDValue Op0 = V;
16395    SDValue Op1 = DAG.getUNDEF(HalfVT);
16396    if (UndefLower)
16397      std::swap(Op0, Op1);
16398    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16399  }
16400
16401  unsigned Offset = UndefLower ? HalfNumElts : 0;
16402  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16403                     DAG.getIntPtrConstant(Offset, DL));
16404}
16405
16406/// Lower shuffles where an entire half of a 256- or 512-bit vector is UNDEF.
16407/// This allows for fast cases such as subvector extraction/insertion
16408/// or shuffling smaller vector types which can lower more efficiently.
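/// For example (illustrative only), a v8f32 shuffle with mask
/// <1, 2, 9, 10, u, u, u, u> only touches the lower halves of both inputs, so
/// it can be lowered as a v4f32 shuffle of the two extracted lower halves
/// that is then inserted back into an undef v8f32.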
16409static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
16410                                         SDValue V2, ArrayRef<int> Mask,
16411                                         const X86Subtarget &Subtarget,
16412                                         SelectionDAG &DAG) {
16413  assert((VT.is256BitVector() || VT.is512BitVector()) &&
16414         "Expected 256-bit or 512-bit vector");
16415
16416  bool UndefLower = isUndefLowerHalf(Mask);
16417  if (!UndefLower && !isUndefUpperHalf(Mask))
16418    return SDValue();
16419
16420  assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
16421         "Completely undef shuffle mask should have been simplified already");
16422
16423  // Upper half is undef and lower half is whole upper subvector.
16424  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16425  MVT HalfVT = VT.getHalfNumVectorElementsVT();
16426  unsigned HalfNumElts = HalfVT.getVectorNumElements();
16427  if (!UndefLower &&
16428      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16429    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16430                             DAG.getIntPtrConstant(HalfNumElts, DL));
16431    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16432                       DAG.getIntPtrConstant(0, DL));
16433  }
16434
16435  // Lower half is undef and upper half is whole lower subvector.
16436  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16437  if (UndefLower &&
16438      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16439    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16440                             DAG.getIntPtrConstant(0, DL));
16441    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16442                       DAG.getIntPtrConstant(HalfNumElts, DL));
16443  }
16444
16445  int HalfIdx1, HalfIdx2;
16446  SmallVector<int, 8> HalfMask(HalfNumElts);
16447  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16448    return SDValue();
16449
16450  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
16451
16452  // Only shuffle the halves of the inputs when useful.
16453  unsigned NumLowerHalves =
16454      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16455  unsigned NumUpperHalves =
16456      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16457  assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
16458
16459  // Determine the larger pattern of undef/halves, then decide if it's worth
16460  // splitting the shuffle based on subtarget capabilities and types.
16461  unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16462  if (!UndefLower) {
16463    // XXXXuuuu: no insert is needed.
16464    // Always extract lowers when setting the lower half - free subreg ops.
16465    if (NumUpperHalves == 0)
16466      return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16467                                   UndefLower, DAG);
16468
16469    if (NumUpperHalves == 1) {
16470      // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16471      if (Subtarget.hasAVX2()) {
16472        // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16473        if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16474            !is128BitUnpackShuffleMask(HalfMask) &&
16475            (!isSingleSHUFPSMask(HalfMask) ||
16476             Subtarget.hasFastVariableShuffle()))
16477          return SDValue();
16478        // If this is a unary shuffle (assume that the 2nd operand is
16479        // canonicalized to undef), then we can use vpermpd. Otherwise, we
16480        // are better off extracting the upper half of 1 operand and using a
16481        // narrow shuffle.
16482        if (EltWidth == 64 && V2.isUndef())
16483          return SDValue();
16484      }
16485      // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16486      if (Subtarget.hasAVX512() && VT.is512BitVector())
16487        return SDValue();
16488      // Extract + narrow shuffle is better than the wide alternative.
16489      return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16490                                   UndefLower, DAG);
16491    }
16492
16493    // Don't extract both uppers; instead, shuffle and then extract.
16494    assert(NumUpperHalves == 2 && "Half vector count went wrong");
16495    return SDValue();
16496  }
16497
16498  // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16499  if (NumUpperHalves == 0) {
16500    // AVX2 has efficient 64-bit element cross-lane shuffles.
16501    // TODO: Refine to account for unary shuffle, splat, and other masks?
16502    if (Subtarget.hasAVX2() && EltWidth == 64)
16503      return SDValue();
16504    // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16505    if (Subtarget.hasAVX512() && VT.is512BitVector())
16506      return SDValue();
16507    // Narrow shuffle + insert is better than the wide alternative.
16508    return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16509                                 UndefLower, DAG);
16510  }
16511
16512  // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16513  return SDValue();
16514}
16515
16516/// Test whether the specified input (0 or 1) is in-place blended by the
16517/// given mask.
16518///
16519/// This returns true if the elements from a particular input are already in the
16520/// slot required by the given mask and require no permutation.
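/// For example, with a v4-element mask <0, 4, 2, 7> input 0 is in place
/// (elements 0 and 2 already sit in result slots 0 and 2), while input 1 is
/// not (element 4 sits in result slot 1 but would need to be in slot 0).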
16521static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
16522  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
16523  int Size = Mask.size();
16524  for (int i = 0; i < Size; ++i)
16525    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
16526      return false;
16527
16528  return true;
16529}
16530
16531/// Handle case where shuffle sources are coming from the same 128-bit lane and
16532/// every lane can be represented as the same repeating mask - allowing us to
16533/// shuffle the sources with the repeating shuffle and then permute the result
16534/// to the destination lanes.
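/// For example (illustrative only), on AVX1 a v4f64 mask such as <2, 2, 0, 0>
/// can be lowered as the in-lane splat <0, 0, 2, 2> followed by a permute
/// that swaps the two 128-bit lanes; on AVX2 the final permute may instead
/// operate on 64-bit sub-lanes.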
16535static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16536    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16537    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16538  int NumElts = VT.getVectorNumElements();
16539  int NumLanes = VT.getSizeInBits() / 128;
16540  int NumLaneElts = NumElts / NumLanes;
16541
16542  // On AVX2 we may be able to just shuffle the lowest elements and then
16543  // broadcast the result.
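  // For example (illustrative only), a v8i32 mask <1, 0, 1, 0, 1, 0, 1, 0>
  // repeats a 64-bit pattern drawn from the lowest 128-bit lane: shuffle
  // <1, 0> into the low 64 bits and then broadcast that quadword.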
16544  if (Subtarget.hasAVX2()) {
16545    for (unsigned BroadcastSize : {16, 32, 64}) {
16546      if (BroadcastSize <= VT.getScalarSizeInBits())
16547        continue;
16548      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16549
16550      // Attempt to match a repeating pattern every NumBroadcastElts,
16551      // accounting for UNDEFs, that only references the lowest 128-bit
16552      // lane of the inputs.
16553      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16554        for (int i = 0; i != NumElts; i += NumBroadcastElts)
16555          for (int j = 0; j != NumBroadcastElts; ++j) {
16556            int M = Mask[i + j];
16557            if (M < 0)
16558              continue;
16559            int &R = RepeatMask[j];
16560            if (0 != ((M % NumElts) / NumLaneElts))
16561              return false;
16562            if (0 <= R && R != M)
16563              return false;
16564            R = M;
16565          }
16566        return true;
16567      };
16568
16569      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16570      if (!FindRepeatingBroadcastMask(RepeatMask))
16571        continue;
16572
16573      // Shuffle the (lowest) repeated elements in place for broadcast.
16574      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16575
16576      // Shuffle the actual broadcast.
16577      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16578      for (int i = 0; i != NumElts; i += NumBroadcastElts)
16579        for (int j = 0; j != NumBroadcastElts; ++j)
16580          BroadcastMask[i + j] = j;
16581      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16582                                  BroadcastMask);
16583    }
16584  }
16585
16586  // Bail if the shuffle mask doesn't cross 128-bit lanes.
16587  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16588    return SDValue();
16589
16590  // Bail if we already have a repeated lane shuffle mask.
16591  SmallVector<int, 8> RepeatedShuffleMask;
16592  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
16593    return SDValue();
16594
16595  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16596  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
16597  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
16598  int NumSubLanes = NumLanes * SubLaneScale;
16599  int NumSubLaneElts = NumLaneElts / SubLaneScale;
16600
16601  // Check that all the sources are coming from the same lane and see if we can
16602  // form a repeating shuffle mask (local to each sub-lane). At the same time,
16603  // determine the source sub-lane for each destination sub-lane.
16604  int TopSrcSubLane = -1;
16605  SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16606  SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
16607      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
16608      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
16609
16610  for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16611    // Extract the sub-lane mask, check that it all comes from the same lane
16612    // and normalize the mask entries to come from the first lane.
16613    int SrcLane = -1;
16614    SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16615    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16616      int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16617      if (M < 0)
16618        continue;
16619      int Lane = (M % NumElts) / NumLaneElts;
16620      if ((0 <= SrcLane) && (SrcLane != Lane))
16621        return SDValue();
16622      SrcLane = Lane;
16623      int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16624      SubLaneMask[Elt] = LocalM;
16625    }
16626
16627    // Whole sub-lane is UNDEF.
16628    if (SrcLane < 0)
16629      continue;
16630
16631    // Attempt to match against the candidate repeated sub-lane masks.
16632    for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16633      auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16634        for (int i = 0; i != NumSubLaneElts; ++i) {
16635          if (M1[i] < 0 || M2[i] < 0)
16636            continue;
16637          if (M1[i] != M2[i])
16638            return false;
16639        }
16640        return true;
16641      };
16642
16643      auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16644      if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16645        continue;
16646
16647      // Merge the sub-lane mask into the matching repeated sub-lane mask.
16648      for (int i = 0; i != NumSubLaneElts; ++i) {
16649        int M = SubLaneMask[i];
16650        if (M < 0)
16651          continue;
16652        assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16653               "Unexpected mask element");
16654        RepeatedSubLaneMask[i] = M;
16655      }
16656
16657      // Track the topmost source sub-lane - by setting the remaining to UNDEF
16658      // we can greatly simplify shuffle matching.
16659      int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16660      TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16661      Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16662      break;
16663    }
16664
16665    // Bail if we failed to find a matching repeated sub-lane mask.
16666    if (Dst2SrcSubLanes[DstSubLane] < 0)
16667      return SDValue();
16668  }
16669  assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16670         "Unexpected source lane");
16671
16672  // Create a repeating shuffle mask for the entire vector.
16673  SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16674  for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16675    int Lane = SubLane / SubLaneScale;
16676    auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16677    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16678      int M = RepeatedSubLaneMask[Elt];
16679      if (M < 0)
16680        continue;
16681      int Idx = (SubLane * NumSubLaneElts) + Elt;
16682      RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16683    }
16684  }
16685  SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16686
16687  // Shuffle each source sub-lane to its destination.
16688  SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16689  for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16690    int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16691    if (SrcSubLane < 0)
16692      continue;
16693    for (int j = 0; j != NumSubLaneElts; ++j)
16694      SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16695  }
16696
16697  return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16698                              SubLaneMask);
16699}
16700
16701static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16702                                   bool &ForceV1Zero, bool &ForceV2Zero,
16703                                   unsigned &ShuffleImm, ArrayRef<int> Mask,
16704                                   const APInt &Zeroable) {
16705  int NumElts = VT.getVectorNumElements();
16706  assert(VT.getScalarSizeInBits() == 64 &&
16707         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16708         "Unexpected data type for VSHUFPD");
16709  assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16710         "Illegal shuffle mask");
16711
16712  bool ZeroLane[2] = { true, true };
16713  for (int i = 0; i < NumElts; ++i)
16714    ZeroLane[i & 1] &= Zeroable[i];
16715
16716  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ...
16717  // Mask for V4F64: 0/1,  4/5,  2/3,  6/7, ...
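  // Illustrative example: for v4f64 the mask <1, 5, 2, 7> selects element 1
  // of V1, element 1 of V2, element 2 of V1 and element 3 of V2, giving
  // ShuffleImm = 0b1011 (bit i holds Mask[i] % 2).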
16718  ShuffleImm = 0;
16719  bool ShufpdMask = true;
16720  bool CommutableMask = true;
16721  for (int i = 0; i < NumElts; ++i) {
16722    if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16723      continue;
16724    if (Mask[i] < 0)
16725      return false;
16726    int Val = (i & 6) + NumElts * (i & 1);
16727    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16728    if (Mask[i] < Val || Mask[i] > Val + 1)
16729      ShufpdMask = false;
16730    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16731      CommutableMask = false;
16732    ShuffleImm |= (Mask[i] % 2) << i;
16733  }
16734
16735  if (!ShufpdMask && !CommutableMask)
16736    return false;
16737
16738  if (!ShufpdMask && CommutableMask)
16739    std::swap(V1, V2);
16740
16741  ForceV1Zero = ZeroLane[0];
16742  ForceV2Zero = ZeroLane[1];
16743  return true;
16744}
16745
16746static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16747                                      SDValue V2, ArrayRef<int> Mask,
16748                                      const APInt &Zeroable,
16749                                      const X86Subtarget &Subtarget,
16750                                      SelectionDAG &DAG) {
16751  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16752         "Unexpected data type for VSHUFPD");
16753
16754  unsigned Immediate = 0;
16755  bool ForceV1Zero = false, ForceV2Zero = false;
16756  if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16757                              Mask, Zeroable))
16758    return SDValue();
16759
16760  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16761  if (ForceV1Zero)
16762    V1 = getZeroVector(VT, Subtarget, DAG, DL);
16763  if (ForceV2Zero)
16764    V2 = getZeroVector(VT, Subtarget, DAG, DL);
16765
16766  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16767                     DAG.getTargetConstant(Immediate, DL, MVT::i8));
16768}
16769
16770// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16771// by zeroable elements in the remaining 24 elements. Turn this into two
16772// vmovqb instructions shuffled together.
16773static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16774                                             SDValue V1, SDValue V2,
16775                                             ArrayRef<int> Mask,
16776                                             const APInt &Zeroable,
16777                                             SelectionDAG &DAG) {
16778  assert(VT == MVT::v32i8 && "Unexpected type!");
16779
16780  // The first 8 indices should be every 8th element.
16781  if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16782    return SDValue();
16783
16784  // Remaining elements need to be zeroable.
16785  if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
16786    return SDValue();
16787
16788  V1 = DAG.getBitcast(MVT::v4i64, V1);
16789  V2 = DAG.getBitcast(MVT::v4i64, V2);
16790
16791  V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16792  V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16793
16794  // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16795  // the upper bits of the result using an unpckldq.
16796  SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16797                                        { 0, 1, 2, 3, 16, 17, 18, 19,
16798                                          4, 5, 6, 7, 20, 21, 22, 23 });
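  // After the unpack, bytes 0-3 hold the low bytes of V1's four quadwords
  // (original elements 0, 8, 16, 24), bytes 4-7 hold those of V2 (original
  // elements 32, 40, 48, 56), and the remaining bytes are zero.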
16799  // Insert the unpckldq into a zero vector to widen to v32i8.
16800  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16801                     DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16802                     DAG.getIntPtrConstant(0, DL));
16803}
16804
16805
16806/// Handle lowering of 4-lane 64-bit floating point shuffles.
16807///
16808/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16809/// isn't available.
16810static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16811                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16812                                 const X86Subtarget &Subtarget,
16813                                 SelectionDAG &DAG) {
16814  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16815  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16816  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16817
16818  if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16819                                     Subtarget, DAG))
16820    return V;
16821
16822  if (V2.isUndef()) {
16823    // Check for being able to broadcast a single element.
16824    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16825                                                    Mask, Subtarget, DAG))
16826      return Broadcast;
16827
16828    // Use low duplicate instructions for masks that match their pattern.
16829    if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16830      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16831
16832    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16833      // Non-half-crossing single input shuffles can be lowered with an
16834      // interleaved permutation.
16835      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16836                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
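      // For example (illustrative only), the mask <1, 0, 3, 2> yields
      // VPERMILPMask = 0b0101, i.e. swap the two doubles within each 128-bit
      // lane.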
16837      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16838                         DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16839    }
16840
16841    // With AVX2 we have direct support for this permutation.
16842    if (Subtarget.hasAVX2())
16843      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16844                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16845
16846    // Try to create an in-lane repeating shuffle mask and then shuffle the
16847    // results into the target lanes.
16848    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16849            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16850      return V;
16851
16852    // Try to permute the lanes and then use a per-lane permute.
16853    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16854                                                        Mask, DAG, Subtarget))
16855      return V;
16856
16857    // Otherwise, fall back.
16858    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16859                                               DAG, Subtarget);
16860  }
16861
16862  // Use dedicated unpack instructions for masks that match their pattern.
16863  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
16864    return V;
16865
16866  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16867                                          Zeroable, Subtarget, DAG))
16868    return Blend;
16869
16870  // Check if the blend happens to exactly fit that of SHUFPD.
16871  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16872                                          Zeroable, Subtarget, DAG))
16873    return Op;
16874
16875  // If we have lane crossing shuffles AND they don't all come from the lower
16876  // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16877  // TODO: Handle BUILD_VECTOR sources, which getVectorShuffle currently
16878  // canonicalizes to a blend of splats, which isn't needed for this combine.
16879  if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16880      !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16881      (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16882      (V2.getOpcode() != ISD::BUILD_VECTOR))
16883    if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
16884                                                       Mask, DAG))
16885      return Op;
16886
16887  // If we have one input in place, then we can permute the other input and
16888  // blend the result.
16889  if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16890    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16891                                                Subtarget, DAG);
16892
16893  // Try to create an in-lane repeating shuffle mask and then shuffle the
16894  // results into the target lanes.
16895  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16896          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16897    return V;
16898
16899  // Try to simplify this by merging 128-bit lanes to enable a lane-based
16900  // shuffle. However, if we have AVX2 and either input is already in place,
16901  // we will be able to shuffle the other input across lanes in a single
16902  // instruction, so skip this pattern.
16903  if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
16904                                isShuffleMaskInputInPlace(1, Mask))))
16905    if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16906            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16907      return V;
16908
16909  // If we have VLX support, we can use VEXPAND.
16910  if (Subtarget.hasVLX())
16911    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
16912                                         DAG, Subtarget))
16913      return V;
16914
16915  // If we have AVX2 then we always want to lower with a blend because at v4 we
16916  // can fully permute the elements.
16917  if (Subtarget.hasAVX2())
16918    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16919                                                Subtarget, DAG);
16920
16921  // Otherwise fall back on generic lowering.
16922  return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
16923                                    Subtarget, DAG);
16924}
16925
16926/// Handle lowering of 4-lane 64-bit integer shuffles.
16927///
16928/// This routine is only called when we have AVX2 and thus a reasonable
16929/// instruction set for v4i64 shuffling.
16930static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16931                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16932                                 const X86Subtarget &Subtarget,
16933                                 SelectionDAG &DAG) {
16934  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16935  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16936  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16937  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16938
16939  if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16940                                     Subtarget, DAG))
16941    return V;
16942
16943  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16944                                          Zeroable, Subtarget, DAG))
16945    return Blend;
16946
16947  // Check for being able to broadcast a single element.
16948  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16949                                                  Subtarget, DAG))
16950    return Broadcast;
16951
16952  if (V2.isUndef()) {
16953    // When the shuffle is mirrored between the two 128-bit lanes of the input,
16954    // we can use lower-latency instructions that will operate on both lanes.
16955    SmallVector<int, 2> RepeatedMask;
16956    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16957      SmallVector<int, 4> PSHUFDMask;
16958      narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
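      // For example (illustrative only), the v4i64 mask <1, 0, 3, 2> has the
      // repeated lane mask <1, 0>, which widens to the v8i32 PSHUFD mask
      // <2, 3, 0, 1>.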
16959      return DAG.getBitcast(
16960          MVT::v4i64,
16961          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16962                      DAG.getBitcast(MVT::v8i32, V1),
16963                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16964    }
16965
16966    // AVX2 provides a direct instruction for permuting a single input across
16967    // lanes.
16968    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16969                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16970  }
16971
16972  // Try to use shift instructions.
16973  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
16974                                          Zeroable, Subtarget, DAG))
16975    return Shift;
16976
16977  // If we have VLX support, we can use VALIGN or VEXPAND.
16978  if (Subtarget.hasVLX()) {
16979    if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16980                                              Subtarget, DAG))
16981      return Rotate;
16982
16983    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
16984                                         DAG, Subtarget))
16985      return V;
16986  }
16987
16988  // Try to use PALIGNR.
16989  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16990                                                Subtarget, DAG))
16991    return Rotate;
16992
16993  // Use dedicated unpack instructions for masks that match their pattern.
16994  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
16995    return V;
16996
16997  // If we have one input in place, then we can permute the other input and
16998  // blend the result.
16999  if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17000    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17001                                                Subtarget, DAG);
17002
17003  // Try to create an in-lane repeating shuffle mask and then shuffle the
17004  // results into the target lanes.
17005  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17006          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17007    return V;
17008
17009  // Try to simplify this by merging 128-bit lanes to enable a lane-based
17010  // shuffle. However, if we have AVX2 and either input is already in place,
17011  // we will be able to shuffle the other input across lanes in a single
17012  // instruction, so skip this pattern.
17013  if (!isShuffleMaskInputInPlace(0, Mask) &&
17014      !isShuffleMaskInputInPlace(1, Mask))
17015    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17016            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17017      return Result;
17018
17019  // Otherwise fall back on generic blend lowering.
17020  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17021                                              Subtarget, DAG);
17022}
17023
17024/// Handle lowering of 8-lane 32-bit floating point shuffles.
17025///
17026/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17027/// isn't available.
17028static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17029                                 const APInt &Zeroable, SDValue V1, SDValue V2,
17030                                 const X86Subtarget &Subtarget,
17031                                 SelectionDAG &DAG) {
17032  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17033  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17034  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17035
17036  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17037                                          Zeroable, Subtarget, DAG))
17038    return Blend;
17039
17040  // Check for being able to broadcast a single element.
17041  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17042                                                  Subtarget, DAG))
17043    return Broadcast;
17044
17045  // If the shuffle mask is repeated in each 128-bit lane, we have many more
17046  // options to efficiently lower the shuffle.
17047  SmallVector<int, 4> RepeatedMask;
17048  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17049    assert(RepeatedMask.size() == 4 &&
17050           "Repeated masks must be half the mask width!");
17051
17052    // Use even/odd duplicate instructions for masks that match their pattern.
17053    if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17054      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17055    if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17056      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17057
17058    if (V2.isUndef())
17059      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17060                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17061
17062    // Use dedicated unpack instructions for masks that match their pattern.
17063    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17064      return V;
17065
17066    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17067    // have already handled any direct blends.
17068    return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17069  }
17070
17071  // Try to create an in-lane repeating shuffle mask and then shuffle the
17072  // results into the target lanes.
17073  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17074          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17075    return V;
17076
17077  // If we have a single input shuffle with different shuffle patterns in the
17078  // two 128-bit lanes use the variable mask to VPERMILPS.
17079  if (V2.isUndef()) {
17080    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17081      SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17082      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17083    }
17084    if (Subtarget.hasAVX2()) {
17085      SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17086      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17087    }
17088    // Otherwise, fall back.
17089    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17090                                               DAG, Subtarget);
17091  }
17092
17093  // Try to simplify this by merging 128-bit lanes to enable a lane-based
17094  // shuffle.
17095  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17096          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17097    return Result;
17098
17099  // If we have VLX support, we can use VEXPAND.
17100  if (Subtarget.hasVLX())
17101    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17102                                         DAG, Subtarget))
17103      return V;
17104
17105  // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern then
17106  // try to split, since after splitting we get more efficient code using the
17107  // vpunpcklwd and vpunpckhwd instructions instead of vblend.
17108  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
17109    return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
17110                                      DAG);
17111
17112  // If we have AVX2 then we always want to lower with a blend because at v8 we
17113  // can fully permute the elements.
17114  if (Subtarget.hasAVX2())
17115    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17116                                                Subtarget, DAG);
17117
17118  // Otherwise fall back on generic lowering.
17119  return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17120                                    Subtarget, DAG);
17121}
17122
17123/// Handle lowering of 8-lane 32-bit integer shuffles.
17124///
17125/// This routine is only called when we have AVX2 and thus a reasonable
17126/// instruction set for v8i32 shuffling.
17127static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17128                                 const APInt &Zeroable, SDValue V1, SDValue V2,
17129                                 const X86Subtarget &Subtarget,
17130                                 SelectionDAG &DAG) {
17131  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17132  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17133  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17134  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17135
17136  // Whenever we can lower this as a zext, that instruction is strictly faster
17137  // than any alternative. It also allows us to fold memory operands into the
17138  // shuffle in many cases.
17139  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17140                                                   Zeroable, Subtarget, DAG))
17141    return ZExt;
17142
17143  // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern then
17144  // try to split, since after splitting we get more efficient code than vblend
17145  // by using the vpunpcklwd and vpunpckhwd instructions.
17146  if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17147      !Subtarget.hasAVX512())
17148    return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17149                                      DAG);
17150
17151  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17152                                          Zeroable, Subtarget, DAG))
17153    return Blend;
17154
17155  // Check for being able to broadcast a single element.
17156  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17157                                                  Subtarget, DAG))
17158    return Broadcast;
17159
17160  // If the shuffle mask is repeated in each 128-bit lane we can use more
17161  // efficient instructions that mirror the shuffles across the two 128-bit
17162  // lanes.
17163  SmallVector<int, 4> RepeatedMask;
17164  bool Is128BitLaneRepeatedShuffle =
17165      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17166  if (Is128BitLaneRepeatedShuffle) {
17167    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17168    if (V2.isUndef())
17169      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17170                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17171
17172    // Use dedicated unpack instructions for masks that match their pattern.
17173    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17174      return V;
17175  }
17176
17177  // Try to use shift instructions.
17178  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17179                                          Zeroable, Subtarget, DAG))
17180    return Shift;
17181
17182  // If we have VLX support, we can use VALIGN or EXPAND.
17183  if (Subtarget.hasVLX()) {
17184    if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17185                                              Subtarget, DAG))
17186      return Rotate;
17187
17188    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17189                                         DAG, Subtarget))
17190      return V;
17191  }
17192
17193  // Try to use byte rotation instructions.
17194  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17195                                                Subtarget, DAG))
17196    return Rotate;
17197
17198  // Try to create an in-lane repeating shuffle mask and then shuffle the
17199  // results into the target lanes.
17200  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17201          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17202    return V;
17203
17204  if (V2.isUndef()) {
17205    // Try to produce a fixed cross-128-bit lane permute followed by unpack
17206    // because that should be faster than the variable permute alternatives.
17207    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17208      return V;
17209
17210    // If the shuffle patterns aren't repeated but it's a single input, directly
17211    // generate a cross-lane VPERMD instruction.
17212    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17213    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17214  }
17215
17216  // Assume that a single SHUFPS is faster than an alternative sequence of
17217  // multiple instructions (even if the CPU has a domain penalty).
17218  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17219  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17220    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17221    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17222    SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17223                                            CastV1, CastV2, DAG);
17224    return DAG.getBitcast(MVT::v8i32, ShufPS);
17225  }
17226
17227  // Try to simplify this by merging 128-bit lanes to enable a lane-based
17228  // shuffle.
17229  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17230          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17231    return Result;
17232
17233  // Otherwise fall back on generic blend lowering.
17234  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17235                                              Subtarget, DAG);
17236}
17237
17238/// Handle lowering of 16-lane 16-bit integer shuffles.
17239///
17240/// This routine is only called when we have AVX2 and thus a reasonable
17241/// instruction set for v16i16 shuffling.
17242static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17243                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17244                                  const X86Subtarget &Subtarget,
17245                                  SelectionDAG &DAG) {
17246  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17247  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17248  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17249  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17250
17251  // Whenever we can lower this as a zext, that instruction is strictly faster
17252  // than any alternative. It also allows us to fold memory operands into the
17253  // shuffle in many cases.
17254  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17255          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17256    return ZExt;
17257
17258  // Check for being able to broadcast a single element.
17259  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17260                                                  Subtarget, DAG))
17261    return Broadcast;
17262
17263  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17264                                          Zeroable, Subtarget, DAG))
17265    return Blend;
17266
17267  // Use dedicated unpack instructions for masks that match their pattern.
17268  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17269    return V;
17270
17271  // Use dedicated pack instructions for masks that match their pattern.
17272  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17273                                       Subtarget))
17274    return V;
17275
17276  // Try to lower using a truncation.
17277  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17278                                       Subtarget, DAG))
17279    return V;
17280
17281  // Try to use shift instructions.
17282  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17283                                          Zeroable, Subtarget, DAG))
17284    return Shift;
17285
17286  // Try to use byte rotation instructions.
17287  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17288                                                Subtarget, DAG))
17289    return Rotate;
17290
17291  // Try to create an in-lane repeating shuffle mask and then shuffle the
17292  // results into the target lanes.
17293  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17294          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17295    return V;
17296
17297  if (V2.isUndef()) {
17298    // Try to use bit rotation instructions.
17299    if (SDValue Rotate =
17300            lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17301      return Rotate;
17302
17303    // Try to produce a fixed cross-128-bit lane permute followed by unpack
17304    // because that should be faster than the variable permute alternatives.
17305    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17306      return V;
17307
17308    // There are no generalized cross-lane shuffle operations available on i16
17309    // element types.
17310    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17311      if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17312              DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17313        return V;
17314
17315      return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17316                                                 DAG, Subtarget);
17317    }
17318
17319    SmallVector<int, 8> RepeatedMask;
17320    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17321      // As this is a single-input shuffle, the repeated mask should be
17322      // a strictly valid v8i16 mask that we can pass through to the v8i16
17323      // lowering to handle even the v16 case.
17324      return lowerV8I16GeneralSingleInputShuffle(
17325          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17326    }
17327  }
17328
17329  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17330                                              Zeroable, Subtarget, DAG))
17331    return PSHUFB;
17332
17333  // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17334  if (Subtarget.hasBWI())
17335    return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17336
17337  // Try to simplify this by merging 128-bit lanes to enable a lane-based
17338  // shuffle.
17339  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17340          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17341    return Result;
17342
17343  // Try to permute the lanes and then use a per-lane permute.
17344  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17345          DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17346    return V;
17347
17348  // Otherwise fall back on generic lowering.
17349  return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
17350                                    Subtarget, DAG);
17351}
17352
17353/// Handle lowering of 32-lane 8-bit integer shuffles.
17354///
17355/// This routine is only called when we have AVX2 and thus a reasonable
17356/// instruction set for v32i8 shuffling.
17357static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17358                                 const APInt &Zeroable, SDValue V1, SDValue V2,
17359                                 const X86Subtarget &Subtarget,
17360                                 SelectionDAG &DAG) {
17361  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17362  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17363  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17364  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17365
17366  // Whenever we can lower this as a zext, that instruction is strictly faster
17367  // than any alternative. It also allows us to fold memory operands into the
17368  // shuffle in many cases.
17369  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17370                                                   Zeroable, Subtarget, DAG))
17371    return ZExt;
17372
17373  // Check for being able to broadcast a single element.
17374  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17375                                                  Subtarget, DAG))
17376    return Broadcast;
17377
17378  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17379                                          Zeroable, Subtarget, DAG))
17380    return Blend;
17381
17382  // Use dedicated unpack instructions for masks that match their pattern.
17383  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
17384    return V;
17385
17386  // Use dedicated pack instructions for masks that match their pattern.
17387  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
17388                                       Subtarget))
17389    return V;
17390
17391  // Try to lower using a truncation.
17392  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17393                                       Subtarget, DAG))
17394    return V;
17395
17396  // Try to use shift instructions.
17397  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
17398                                          Zeroable, Subtarget, DAG))
17399    return Shift;
17400
17401  // Try to use byte rotation instructions.
17402  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17403                                                Subtarget, DAG))
17404    return Rotate;
17405
17406  // Try to use bit rotation instructions.
17407  if (V2.isUndef())
17408    if (SDValue Rotate =
17409            lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17410      return Rotate;
17411
17412  // Try to create an in-lane repeating shuffle mask and then shuffle the
17413  // results into the target lanes.
17414  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17415          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17416    return V;
17417
17418  // There are no generalized cross-lane shuffle operations available on i8
17419  // element types.
17420  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17421    // Try to produce a fixed cross-128-bit lane permute followed by unpack
17422    // because that should be faster than the variable permute alternatives.
17423    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
17424      return V;
17425
17426    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17427            DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17428      return V;
17429
17430    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17431                                               DAG, Subtarget);
17432  }
17433
17434  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17435                                              Zeroable, Subtarget, DAG))
17436    return PSHUFB;
17437
17438  // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17439  if (Subtarget.hasVBMI())
17440    return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17441
17442  // Try to simplify this by merging 128-bit lanes to enable a lane-based
17443  // shuffle.
17444  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17445          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17446    return Result;
17447
17448  // Try to permute the lanes and then use a per-lane permute.
17449  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17450          DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17451    return V;
17452
  // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17454  // by zeroable elements in the remaining 24 elements. Turn this into two
17455  // vmovqb instructions shuffled together.
17456  if (Subtarget.hasVLX())
17457    if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17458                                                  Mask, Zeroable, DAG))
17459      return V;
17460
17461  // Otherwise fall back on generic lowering.
17462  return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
17463                                    Subtarget, DAG);
17464}
17465
17466/// High-level routine to lower various 256-bit x86 vector shuffles.
17467///
17468/// This routine either breaks down the specific type of a 256-bit x86 vector
17469/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17470/// together based on the available instructions.
17471static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17472                                  SDValue V1, SDValue V2, const APInt &Zeroable,
17473                                  const X86Subtarget &Subtarget,
17474                                  SelectionDAG &DAG) {
17475  // If we have a single input to the zero element, insert that into V1 if we
17476  // can do so cheaply.
17477  int NumElts = VT.getVectorNumElements();
17478  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17479
17480  if (NumV2Elements == 1 && Mask[0] >= NumElts)
17481    if (SDValue Insertion = lowerShuffleAsElementInsertion(
17482            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17483      return Insertion;
17484
17485  // Handle special cases where the lower or upper half is UNDEF.
17486  if (SDValue V =
17487          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17488    return V;
17489
17490  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17491  // can check for those subtargets here and avoid much of the subtarget
17492  // querying in the per-vector-type lowering routines. With AVX1 we have
17493  // essentially *zero* ability to manipulate a 256-bit vector with integer
17494  // types. Since we'll use floating point types there eventually, just
17495  // immediately cast everything to a float and operate entirely in that domain.
17496  if (VT.isInteger() && !Subtarget.hasAVX2()) {
17497    int ElementBits = VT.getScalarSizeInBits();
17498    if (ElementBits < 32) {
17499      // No floating point type available, if we can't use the bit operations
17500      // for masking/blending then decompose into 128-bit vectors.
17501      if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17502                                            Subtarget, DAG))
17503        return V;
17504      if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17505        return V;
17506      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17507    }
17508
17509    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17510                                VT.getVectorNumElements());
17511    V1 = DAG.getBitcast(FpVT, V1);
17512    V2 = DAG.getBitcast(FpVT, V2);
17513    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17514  }
17515
17516  switch (VT.SimpleTy) {
17517  case MVT::v4f64:
17518    return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17519  case MVT::v4i64:
17520    return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17521  case MVT::v8f32:
17522    return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17523  case MVT::v8i32:
17524    return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17525  case MVT::v16i16:
17526    return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17527  case MVT::v32i8:
17528    return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17529
17530  default:
17531    llvm_unreachable("Not a valid 256-bit x86 vector type!");
17532  }
17533}
17534
/// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
17536static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17537                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17538                                  const X86Subtarget &Subtarget,
17539                                  SelectionDAG &DAG) {
17540  assert(VT.getScalarSizeInBits() == 64 &&
17541         "Unexpected element type size for 128bit shuffle.");
17542
  // Handling a 256-bit vector here would require VLX, and
  // lowerV2X128VectorShuffle() is most likely the better solution for that.
17545  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17546
17547  // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17548  SmallVector<int, 4> Widened128Mask;
17549  if (!canWidenShuffleElements(Mask, Widened128Mask))
17550    return SDValue();
17551  assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17552
17553  // Try to use an insert into a zero vector.
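  // Zeroable has one bit per 64-bit element: 0xf0 covers elements 4-7 (the
  // upper 256 bits) and 0x0c covers elements 2-3, so this matches keeping the
  // low 128 or 256 bits of V1 and zeroing everything above them.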
17554  if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17555      (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17556    unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17557    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17558    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17559                              DAG.getIntPtrConstant(0, DL));
17560    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17561                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
17562                       DAG.getIntPtrConstant(0, DL));
17563  }
17564
17565  // Check for patterns which can be matched with a single insert of a 256-bit
17566  // subvector.
17567  bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17568  if (OnlyUsesV1 ||
17569      isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17570    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17571    SDValue SubVec =
17572        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17573                    DAG.getIntPtrConstant(0, DL));
17574    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17575                       DAG.getIntPtrConstant(4, DL));
17576  }
17577
17578  // See if this is an insertion of the lower 128-bits of V2 into V1.
17579  bool IsInsert = true;
17580  int V2Index = -1;
17581  for (int i = 0; i < 4; ++i) {
17582    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17583    if (Widened128Mask[i] < 0)
17584      continue;
17585
17586    // Make sure all V1 subvectors are in place.
17587    if (Widened128Mask[i] < 4) {
17588      if (Widened128Mask[i] != i) {
17589        IsInsert = false;
17590        break;
17591      }
17592    } else {
      // Make sure we only have a single V2 index and that it's the lowest 128 bits.
17594      if (V2Index >= 0 || Widened128Mask[i] != 4) {
17595        IsInsert = false;
17596        break;
17597      }
17598      V2Index = i;
17599    }
17600  }
17601  if (IsInsert && V2Index >= 0) {
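    // Widened128Mask indexes 128-bit chunks, but insert128BitVector takes an
    // element index in the original 64-bit element type, so each chunk
    // corresponds to two elements (hence the V2Index * 2 below).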
17602    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17603    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17604                                 DAG.getIntPtrConstant(0, DL));
17605    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17606  }
17607
  // See if we can widen to a 256-bit lane shuffle. We're going to lose
  // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
  // widening where possible we at least ensure the lanes stay sequential to
  // help later combines.
17612  SmallVector<int, 2> Widened256Mask;
17613  if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17614    Widened128Mask.clear();
17615    narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17616  }
17617
17618  // Try to lower to vshuf64x2/vshuf32x4.
17619  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17620  unsigned PermMask = 0;
  // Ensure elements come from the same Op.
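  // SHUF128 selects its low two result lanes from the first operand and its
  // high two lanes from the second, so result lanes {0,1} and {2,3} must each
  // draw from a single source.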
17622  for (int i = 0; i < 4; ++i) {
17623    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17624    if (Widened128Mask[i] < 0)
17625      continue;
17626
17627    SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17628    unsigned OpIndex = i / 2;
17629    if (Ops[OpIndex].isUndef())
17630      Ops[OpIndex] = Op;
17631    else if (Ops[OpIndex] != Op)
17632      return SDValue();
17633
17634    // Convert the 128-bit shuffle mask selection values into 128-bit selection
17635    // bits defined by a vshuf64x2 instruction's immediate control byte.
17636    PermMask |= (Widened128Mask[i] % 4) << (i * 2);
17637  }
17638
17639  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17640                     DAG.getTargetConstant(PermMask, DL, MVT::i8));
17641}
17642
17643/// Handle lowering of 8-lane 64-bit floating point shuffles.
17644static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17645                                 const APInt &Zeroable, SDValue V1, SDValue V2,
17646                                 const X86Subtarget &Subtarget,
17647                                 SelectionDAG &DAG) {
17648  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17649  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17650  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17651
17652  if (V2.isUndef()) {
17653    // Use low duplicate instructions for masks that match their pattern.
17654    if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17655      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17656
17657    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17658      // Non-half-crossing single input shuffles can be lowered with an
17659      // interleaved permutation.
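      // VPERMILPD takes one immediate bit per element: bit i selects the odd
      // (set) or even (clear) 64-bit element within element i's 128-bit lane.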
17660      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17661                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17662                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17663                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17664      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17665                         DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17666    }
17667
17668    SmallVector<int, 4> RepeatedMask;
17669    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17670      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17671                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17672  }
17673
17674  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17675                                           V2, Subtarget, DAG))
17676    return Shuf128;
17677
17678  if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
17679    return Unpck;
17680
17681  // Check if the blend happens to exactly fit that of SHUFPD.
17682  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17683                                          Zeroable, Subtarget, DAG))
17684    return Op;
17685
17686  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
17687                                       DAG, Subtarget))
17688    return V;
17689
17690  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17691                                          Zeroable, Subtarget, DAG))
17692    return Blend;
17693
17694  return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17695}
17696
17697/// Handle lowering of 16-lane 32-bit floating point shuffles.
17698static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17699                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17700                                  const X86Subtarget &Subtarget,
17701                                  SelectionDAG &DAG) {
17702  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17703  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17704  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17705
17706  // If the shuffle mask is repeated in each 128-bit lane, we have many more
17707  // options to efficiently lower the shuffle.
17708  SmallVector<int, 4> RepeatedMask;
17709  if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17710    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17711
17712    // Use even/odd duplicate instructions for masks that match their pattern.
17713    if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17714      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17715    if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17716      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17717
17718    if (V2.isUndef())
17719      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17720                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17721
17722    // Use dedicated unpack instructions for masks that match their pattern.
17723    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
17724      return V;
17725
17726    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17727                                            Zeroable, Subtarget, DAG))
17728      return Blend;
17729
17730    // Otherwise, fall back to a SHUFPS sequence.
17731    return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17732  }
17733
17734  // Try to create an in-lane repeating shuffle mask and then shuffle the
17735  // results into the target lanes.
17736  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17737          DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17738    return V;
17739
17740  // If we have a single input shuffle with different shuffle patterns in the
17741  // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17742  if (V2.isUndef() &&
17743      !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17744    SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17745    return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17746  }
17747
17748  // If we have AVX512F support, we can use VEXPAND.
17749  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
17750                                             V1, V2, DAG, Subtarget))
17751    return V;
17752
17753  return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17754}
17755
17756/// Handle lowering of 8-lane 64-bit integer shuffles.
17757static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17758                                 const APInt &Zeroable, SDValue V1, SDValue V2,
17759                                 const X86Subtarget &Subtarget,
17760                                 SelectionDAG &DAG) {
17761  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17762  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17763  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17764
17765  if (V2.isUndef()) {
17766    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17767    // can use lower latency instructions that will operate on all four
17768    // 128-bit lanes.
17769    SmallVector<int, 2> Repeated128Mask;
17770    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
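      // Split each 64-bit mask element into its two dword halves so the
      // repeated mask can be expressed as a v16i32 PSHUFD immediate.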
17771      SmallVector<int, 4> PSHUFDMask;
17772      narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17773      return DAG.getBitcast(
17774          MVT::v8i64,
17775          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17776                      DAG.getBitcast(MVT::v16i32, V1),
17777                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17778    }
17779
17780    SmallVector<int, 4> Repeated256Mask;
17781    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17782      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17783                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17784  }
17785
17786  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17787                                           V2, Subtarget, DAG))
17788    return Shuf128;
17789
17790  // Try to use shift instructions.
17791  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
17792                                          Zeroable, Subtarget, DAG))
17793    return Shift;
17794
17795  // Try to use VALIGN.
17796  if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17797                                            Subtarget, DAG))
17798    return Rotate;
17799
17800  // Try to use PALIGNR.
17801  if (Subtarget.hasBWI())
17802    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17803                                                  Subtarget, DAG))
17804      return Rotate;
17805
17806  if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
17807    return Unpck;
17808
17809  // If we have AVX512F support, we can use VEXPAND.
17810  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
17811                                       DAG, Subtarget))
17812    return V;
17813
17814  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17815                                          Zeroable, Subtarget, DAG))
17816    return Blend;
17817
17818  return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17819}
17820
17821/// Handle lowering of 16-lane 32-bit integer shuffles.
17822static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17823                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17824                                  const X86Subtarget &Subtarget,
17825                                  SelectionDAG &DAG) {
17826  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17827  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17828  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17829
17830  // Whenever we can lower this as a zext, that instruction is strictly faster
17831  // than any alternative. It also allows us to fold memory operands into the
17832  // shuffle in many cases.
17833  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17834          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17835    return ZExt;
17836
17837  // If the shuffle mask is repeated in each 128-bit lane we can use more
17838  // efficient instructions that mirror the shuffles across the four 128-bit
17839  // lanes.
17840  SmallVector<int, 4> RepeatedMask;
17841  bool Is128BitLaneRepeatedShuffle =
17842      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17843  if (Is128BitLaneRepeatedShuffle) {
17844    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17845    if (V2.isUndef())
17846      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17847                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17848
17849    // Use dedicated unpack instructions for masks that match their pattern.
17850    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
17851      return V;
17852  }
17853
17854  // Try to use shift instructions.
17855  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
17856                                          Zeroable, Subtarget, DAG))
17857    return Shift;
17858
17859  // Try to use VALIGN.
17860  if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17861                                            Subtarget, DAG))
17862    return Rotate;
17863
17864  // Try to use byte rotation instructions.
17865  if (Subtarget.hasBWI())
17866    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17867                                                  Subtarget, DAG))
17868      return Rotate;
17869
17870  // Assume that a single SHUFPS is faster than using a permv shuffle.
17871  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17872  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17873    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17874    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17875    SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17876                                            CastV1, CastV2, DAG);
17877    return DAG.getBitcast(MVT::v16i32, ShufPS);
17878  }
17879
17880  // Try to create an in-lane repeating shuffle mask and then shuffle the
17881  // results into the target lanes.
17882  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17883          DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17884    return V;
17885
17886  // If we have AVX512F support, we can use VEXPAND.
17887  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17888                                       DAG, Subtarget))
17889    return V;
17890
17891  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17892                                          Zeroable, Subtarget, DAG))
17893    return Blend;
17894
17895  return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17896}
17897
17898/// Handle lowering of 32-lane 16-bit integer shuffles.
17899static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17900                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17901                                  const X86Subtarget &Subtarget,
17902                                  SelectionDAG &DAG) {
17903  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17904  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17905  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17906  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17907
17908  // Whenever we can lower this as a zext, that instruction is strictly faster
17909  // than any alternative. It also allows us to fold memory operands into the
17910  // shuffle in many cases.
17911  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17912          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17913    return ZExt;
17914
17915  // Use dedicated unpack instructions for masks that match their pattern.
17916  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17917    return V;
17918
17919  // Use dedicated pack instructions for masks that match their pattern.
17920  if (SDValue V =
17921          lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17922    return V;
17923
17924  // Try to use shift instructions.
17925  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
17926                                          Zeroable, Subtarget, DAG))
17927    return Shift;
17928
17929  // Try to use byte rotation instructions.
17930  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17931                                                Subtarget, DAG))
17932    return Rotate;
17933
17934  if (V2.isUndef()) {
17935    // Try to use bit rotation instructions.
17936    if (SDValue Rotate =
17937            lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17938      return Rotate;
17939
17940    SmallVector<int, 8> RepeatedMask;
17941    if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17942      // As this is a single-input shuffle, the repeated mask should be
17943      // a strictly valid v8i16 mask that we can pass through to the v8i16
17944      // lowering to handle even the v32 case.
17945      return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17946                                                 RepeatedMask, Subtarget, DAG);
17947    }
17948  }
17949
17950  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17951                                          Zeroable, Subtarget, DAG))
17952    return Blend;
17953
17954  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17955                                              Zeroable, Subtarget, DAG))
17956    return PSHUFB;
17957
17958  return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17959}
17960
17961/// Handle lowering of 64-lane 8-bit integer shuffles.
17962static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17963                                 const APInt &Zeroable, SDValue V1, SDValue V2,
17964                                 const X86Subtarget &Subtarget,
17965                                 SelectionDAG &DAG) {
17966  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17967  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17968  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17969  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17970
17971  // Whenever we can lower this as a zext, that instruction is strictly faster
17972  // than any alternative. It also allows us to fold memory operands into the
17973  // shuffle in many cases.
17974  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17975          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17976    return ZExt;
17977
17978  // Use dedicated unpack instructions for masks that match their pattern.
17979  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
17980    return V;
17981
17982  // Use dedicated pack instructions for masks that match their pattern.
17983  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
17984                                       Subtarget))
17985    return V;
17986
17987  // Try to use shift instructions.
17988  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
17989                                          Zeroable, Subtarget, DAG))
17990    return Shift;
17991
17992  // Try to use byte rotation instructions.
17993  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17994                                                Subtarget, DAG))
17995    return Rotate;
17996
17997  // Try to use bit rotation instructions.
17998  if (V2.isUndef())
17999    if (SDValue Rotate =
18000            lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18001      return Rotate;
18002
18003  // Lower as AND if possible.
18004  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18005                                             Zeroable, Subtarget, DAG))
18006    return Masked;
18007
18008  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18009                                              Zeroable, Subtarget, DAG))
18010    return PSHUFB;
18011
18012  // VBMI can use VPERMV/VPERMV3 byte shuffles.
18013  if (Subtarget.hasVBMI())
18014    return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18015
18016  // Try to create an in-lane repeating shuffle mask and then shuffle the
18017  // results into the target lanes.
18018  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18019          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18020    return V;
18021
18022  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18023                                          Zeroable, Subtarget, DAG))
18024    return Blend;
18025
18026  // Try to simplify this by merging 128-bit lanes to enable a lane-based
18027  // shuffle.
18028  if (!V2.isUndef())
18029    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18030            DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18031      return Result;
18032
18033  // FIXME: Implement direct support for this type!
18034  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18035}
18036
18037/// High-level routine to lower various 512-bit x86 vector shuffles.
18038///
18039/// This routine either breaks down the specific type of a 512-bit x86 vector
18040/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18041/// together based on the available instructions.
18042static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18043                                  MVT VT, SDValue V1, SDValue V2,
18044                                  const APInt &Zeroable,
18045                                  const X86Subtarget &Subtarget,
18046                                  SelectionDAG &DAG) {
18047  assert(Subtarget.hasAVX512() &&
18048         "Cannot lower 512-bit vectors w/ basic ISA!");
18049
18050  // If we have a single input to the zero element, insert that into V1 if we
18051  // can do so cheaply.
18052  int NumElts = Mask.size();
18053  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18054
18055  if (NumV2Elements == 1 && Mask[0] >= NumElts)
18056    if (SDValue Insertion = lowerShuffleAsElementInsertion(
18057            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18058      return Insertion;
18059
18060  // Handle special cases where the lower or upper half is UNDEF.
18061  if (SDValue V =
18062          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18063    return V;
18064
18065  // Check for being able to broadcast a single element.
18066  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18067                                                  Subtarget, DAG))
18068    return Broadcast;
18069
18070  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18071    // Try using bit ops for masking and blending before falling back to
18072    // splitting.
18073    if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18074                                          Subtarget, DAG))
18075      return V;
18076    if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18077      return V;
18078
18079    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18080  }
18081
18082  // Dispatch to each element type for lowering. If we don't have support for
18083  // specific element type shuffles at 512 bits, immediately split them and
18084  // lower them. Each lowering routine of a given type is allowed to assume that
18085  // the requisite ISA extensions for that element type are available.
18086  switch (VT.SimpleTy) {
18087  case MVT::v8f64:
18088    return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18089  case MVT::v16f32:
18090    return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18091  case MVT::v8i64:
18092    return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18093  case MVT::v16i32:
18094    return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18095  case MVT::v32i16:
18096    return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18097  case MVT::v64i8:
18098    return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18099
18100  default:
18101    llvm_unreachable("Not a valid 512-bit x86 vector type!");
18102  }
18103}
18104
18105static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18106                                         MVT VT, SDValue V1, SDValue V2,
18107                                         const X86Subtarget &Subtarget,
18108                                         SelectionDAG &DAG) {
18109  // Shuffle should be unary.
18110  if (!V2.isUndef())
18111    return SDValue();
18112
18113  int ShiftAmt = -1;
18114  int NumElts = Mask.size();
18115  for (int i = 0; i != NumElts; ++i) {
18116    int M = Mask[i];
18117    assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18118           "Unexpected mask index.");
18119    if (M < 0)
18120      continue;
18121
18122    // The first non-undef element determines our shift amount.
18123    if (ShiftAmt < 0) {
18124      ShiftAmt = M - i;
18125      // Need to be shifting right.
18126      if (ShiftAmt <= 0)
18127        return SDValue();
18128    }
18129    // All non-undef elements must shift by the same amount.
18130    if (ShiftAmt != M - i)
18131      return SDValue();
18132  }
18133  assert(ShiftAmt >= 0 && "All undef?");
18134
18135  // Great we found a shift right.
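  // KSHIFT only operates on legal mask register widths: the byte form (v8i1)
  // needs AVX512DQ, so v8i1 without DQI and anything narrower is widened to
  // v8i1 (with DQI) or v16i1 first.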
18136  MVT WideVT = VT;
18137  if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18138    WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18139  SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18140                            DAG.getUNDEF(WideVT), V1,
18141                            DAG.getIntPtrConstant(0, DL));
18142  Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18143                    DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18144  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18145                     DAG.getIntPtrConstant(0, DL));
18146}
18147
18148// Determine if this shuffle can be implemented with a KSHIFT instruction.
18149// Returns the shift amount if possible or -1 if not. This is a simplified
18150// version of matchShuffleAsShift.
18151static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18152                                    int MaskOffset, const APInt &Zeroable) {
18153  int Size = Mask.size();
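  // A shift candidate is valid when the elements shifted in are zeroable (the
  // low elements for KSHIFTL, the high elements for KSHIFTR) and the surviving
  // elements form a sequential run from the source selected by MaskOffset.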
18154
18155  auto CheckZeros = [&](int Shift, bool Left) {
18156    for (int j = 0; j < Shift; ++j)
18157      if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18158        return false;
18159
18160    return true;
18161  };
18162
18163  auto MatchShift = [&](int Shift, bool Left) {
18164    unsigned Pos = Left ? Shift : 0;
18165    unsigned Low = Left ? 0 : Shift;
18166    unsigned Len = Size - Shift;
18167    return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18168  };
18169
18170  for (int Shift = 1; Shift != Size; ++Shift)
18171    for (bool Left : {true, false})
18172      if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18173        Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18174        return Shift;
18175      }
18176
18177  return -1;
18178}
18179
18180
18181// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle it, and then truncate it back.
18185static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18186                                MVT VT, SDValue V1, SDValue V2,
18187                                const APInt &Zeroable,
18188                                const X86Subtarget &Subtarget,
18189                                SelectionDAG &DAG) {
18190  assert(Subtarget.hasAVX512() &&
18191         "Cannot lower 512-bit vectors w/o basic ISA!");
18192
18193  int NumElts = Mask.size();
18194
18195  // Try to recognize shuffles that are just padding a subvector with zeros.
18196  int SubvecElts = 0;
18197  int Src = -1;
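  // Src records which input the prefix comes from (0 for V1, 1 for V2); undef
  // mask elements simply extend the prefix.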
18198  for (int i = 0; i != NumElts; ++i) {
18199    if (Mask[i] >= 0) {
18200      // Grab the source from the first valid mask. All subsequent elements need
18201      // to use this same source.
18202      if (Src < 0)
18203        Src = Mask[i] / NumElts;
18204      if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18205        break;
18206    }
18207
18208    ++SubvecElts;
18209  }
18210  assert(SubvecElts != NumElts && "Identity shuffle?");
18211
  // Clip to a power of 2.
18213  SubvecElts = PowerOf2Floor(SubvecElts);
18214
18215  // Make sure the number of zeroable bits in the top at least covers the bits
18216  // not covered by the subvector.
18217  if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18218    assert(Src >= 0 && "Expected a source!");
18219    MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18220    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18221                                  Src == 0 ? V1 : V2,
18222                                  DAG.getIntPtrConstant(0, DL));
18223    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18224                       DAG.getConstant(0, DL, VT),
18225                       Extract, DAG.getIntPtrConstant(0, DL));
18226  }
18227
18228  // Try a simple shift right with undef elements. Later we'll try with zeros.
18229  if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18230                                                DAG))
18231    return Shift;
18232
18233  // Try to match KSHIFTs.
18234  unsigned Offset = 0;
18235  for (SDValue V : { V1, V2 }) {
18236    unsigned Opcode;
18237    int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18238    if (ShiftAmt >= 0) {
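      // As in lower1BitShuffleAsKSHIFTR above, widen to a legal KSHIFT type
      // when the mask is narrower than the natively supported widths.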
18239      MVT WideVT = VT;
18240      if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18241        WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18242      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18243                                DAG.getUNDEF(WideVT), V,
18244                                DAG.getIntPtrConstant(0, DL));
18245      // Widened right shifts need two shifts to ensure we shift in zeroes.
18246      if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18247        int WideElts = WideVT.getVectorNumElements();
18248        // Shift left to put the original vector in the MSBs of the new size.
18249        Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18250                          DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18251        // Increase the shift amount to account for the left shift.
18252        ShiftAmt += WideElts - NumElts;
18253      }
18254
18255      Res = DAG.getNode(Opcode, DL, WideVT, Res,
18256                        DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18257      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18258                         DAG.getIntPtrConstant(0, DL));
18259    }
18260    Offset += NumElts; // Increment for next iteration.
18261  }
18262
18263
18264
18265  MVT ExtVT;
18266  switch (VT.SimpleTy) {
18267  default:
18268    llvm_unreachable("Expected a vector of i1 elements");
18269  case MVT::v2i1:
18270    ExtVT = MVT::v2i64;
18271    break;
18272  case MVT::v4i1:
18273    ExtVT = MVT::v4i32;
18274    break;
18275  case MVT::v8i1:
    // Take a 512-bit type so more shuffles are available on KNL. If we have
    // VLX, use a 256-bit shuffle.
18278    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18279    break;
18280  case MVT::v16i1:
18281    // Take 512-bit type, unless we are avoiding 512-bit types and have the
18282    // 256-bit operation available.
18283    ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18284    break;
18285  case MVT::v32i1:
18286    // Take 512-bit type, unless we are avoiding 512-bit types and have the
18287    // 256-bit operation available.
18288    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18289    ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18290    break;
18291  case MVT::v64i1:
18292    // Fall back to scalarization. FIXME: We can do better if the shuffle
18293    // can be partitioned cleanly.
18294    if (!Subtarget.useBWIRegs())
18295      return SDValue();
18296    ExtVT = MVT::v64i8;
18297    break;
18298  }
18299
18300  V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18301  V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18302
18303  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // The i1 elements were sign-extended, so we can recover the mask by
  // comparing against zero (a sign-bit test).
18305  int NumElems = VT.getVectorNumElements();
18306  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18307      (Subtarget.hasDQI() && (NumElems < 32)))
18308    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18309                       Shuffle, ISD::SETGT);
18310
18311  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18312}
18313
18314/// Helper function that returns true if the shuffle mask should be
18315/// commuted to improve canonicalization.
18316static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18317  int NumElements = Mask.size();
18318
18319  int NumV1Elements = 0, NumV2Elements = 0;
18320  for (int M : Mask)
18321    if (M < 0)
18322      continue;
18323    else if (M < NumElements)
18324      ++NumV1Elements;
18325    else
18326      ++NumV2Elements;
18327
18328  // Commute the shuffle as needed such that more elements come from V1 than
18329  // V2. This allows us to match the shuffle pattern strictly on how many
18330  // elements come from V1 without handling the symmetric cases.
18331  if (NumV2Elements > NumV1Elements)
18332    return true;
18333
18334  assert(NumV1Elements > 0 && "No V1 indices");
18335
18336  if (NumV2Elements == 0)
18337    return false;
18338
18339  // When the number of V1 and V2 elements are the same, try to minimize the
18340  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum of
  // indices for V2. When those are equal, try to ensure that the number of odd
18343  // indices for V1 is lower than the number of odd indices for V2.
18344  if (NumV1Elements == NumV2Elements) {
18345    int LowV1Elements = 0, LowV2Elements = 0;
18346    for (int M : Mask.slice(0, NumElements / 2))
18347      if (M >= NumElements)
18348        ++LowV2Elements;
18349      else if (M >= 0)
18350        ++LowV1Elements;
18351    if (LowV2Elements > LowV1Elements)
18352      return true;
18353    if (LowV2Elements == LowV1Elements) {
18354      int SumV1Indices = 0, SumV2Indices = 0;
18355      for (int i = 0, Size = Mask.size(); i < Size; ++i)
18356        if (Mask[i] >= NumElements)
18357          SumV2Indices += i;
18358        else if (Mask[i] >= 0)
18359          SumV1Indices += i;
18360      if (SumV2Indices < SumV1Indices)
18361        return true;
18362      if (SumV2Indices == SumV1Indices) {
18363        int NumV1OddIndices = 0, NumV2OddIndices = 0;
18364        for (int i = 0, Size = Mask.size(); i < Size; ++i)
18365          if (Mask[i] >= NumElements)
18366            NumV2OddIndices += i % 2;
18367          else if (Mask[i] >= 0)
18368            NumV1OddIndices += i % 2;
18369        if (NumV2OddIndices < NumV1OddIndices)
18370          return true;
18371      }
18372    }
18373  }
18374
18375  return false;
18376}
18377
18378/// Top-level lowering for x86 vector shuffles.
18379///
18380/// This handles decomposition, canonicalization, and lowering of all x86
18381/// vector shuffles. Most of the specific lowering strategies are encapsulated
18382/// above in helper routines. The canonicalization attempts to widen shuffles
18383/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18384/// s.t. only one of the two inputs needs to be tested, etc.
18385static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18386                                   SelectionDAG &DAG) {
18387  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18388  ArrayRef<int> OrigMask = SVOp->getMask();
18389  SDValue V1 = Op.getOperand(0);
18390  SDValue V2 = Op.getOperand(1);
18391  MVT VT = Op.getSimpleValueType();
18392  int NumElements = VT.getVectorNumElements();
18393  SDLoc DL(Op);
18394  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18395
18396  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18397         "Can't lower MMX shuffles");
18398
18399  bool V1IsUndef = V1.isUndef();
18400  bool V2IsUndef = V2.isUndef();
18401  if (V1IsUndef && V2IsUndef)
18402    return DAG.getUNDEF(VT);
18403
  // When we create a shuffle node we put the UNDEF node as the second operand,
18405  // but in some cases the first operand may be transformed to UNDEF.
18406  // In this case we should just commute the node.
18407  if (V1IsUndef)
18408    return DAG.getCommutedVectorShuffle(*SVOp);
18409
18410  // Check for non-undef masks pointing at an undef vector and make the masks
18411  // undef as well. This makes it easier to match the shuffle based solely on
18412  // the mask.
18413  if (V2IsUndef &&
18414      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18415    SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
18416    for (int &M : NewMask)
18417      if (M >= NumElements)
18418        M = -1;
18419    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18420  }
18421
18422  // Check for illegal shuffle mask element index values.
18423  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18424  (void)MaskUpperLimit;
18425  assert(llvm::all_of(OrigMask,
18426                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18427         "Out of bounds shuffle index");
18428
18429  // We actually see shuffles that are entirely re-arrangements of a set of
18430  // zero inputs. This mostly happens while decomposing complex shuffles into
18431  // simple ones. Directly lower these as a buildvector of zeros.
18432  APInt KnownUndef, KnownZero;
18433  computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18434
18435  APInt Zeroable = KnownUndef | KnownZero;
18436  if (Zeroable.isAllOnesValue())
18437    return getZeroVector(VT, Subtarget, DAG, DL);
18438
18439  bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18440
18441  // Try to collapse shuffles into using a vector type with fewer elements but
18442  // wider element types. We cap this to not form integers or floating point
18443  // elements wider than 64 bits, but it might be interesting to form i128
18444  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18445  SmallVector<int, 16> WidenedMask;
18446  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18447      canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18448    // Shuffle mask widening should not interfere with a broadcast opportunity
18449    // by obfuscating the operands with bitcasts.
18450    // TODO: Avoid lowering directly from this top-level function: make this
18451    // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18452    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18453                                                    Subtarget, DAG))
18454      return Broadcast;
18455
18456    MVT NewEltVT = VT.isFloatingPoint()
18457                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18458                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18459    int NewNumElts = NumElements / 2;
18460    MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18461    // Make sure that the new vector type is legal. For example, v2f64 isn't
18462    // legal on SSE1.
18463    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18464      if (V2IsZero) {
18465        // Modify the new Mask to take all zeros from the all-zero vector.
18466        // Choose indices that are blend-friendly.
18467        bool UsedZeroVector = false;
18468        assert(is_contained(WidenedMask, SM_SentinelZero) &&
18469               "V2's non-undef elements are used?!");
18470        for (int i = 0; i != NewNumElts; ++i)
18471          if (WidenedMask[i] == SM_SentinelZero) {
18472            WidenedMask[i] = i + NewNumElts;
18473            UsedZeroVector = true;
18474          }
18475        // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18476        // some elements to be undef.
18477        if (UsedZeroVector)
18478          V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18479      }
18480      V1 = DAG.getBitcast(NewVT, V1);
18481      V2 = DAG.getBitcast(NewVT, V2);
18482      return DAG.getBitcast(
18483          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18484    }
18485  }
18486
18487  // Commute the shuffle if it will improve canonicalization.
18488  SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
18489  if (canonicalizeShuffleMaskWithCommute(Mask)) {
18490    ShuffleVectorSDNode::commuteMask(Mask);
18491    std::swap(V1, V2);
18492  }
18493
18494  // For each vector width, delegate to a specialized lowering routine.
18495  if (VT.is128BitVector())
18496    return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18497
18498  if (VT.is256BitVector())
18499    return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18500
18501  if (VT.is512BitVector())
18502    return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18503
18504  if (Is1BitVector)
18505    return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18506
18507  llvm_unreachable("Unimplemented!");
18508}
18509
18510/// Try to lower a VSELECT instruction to a vector shuffle.
18511static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18512                                           const X86Subtarget &Subtarget,
18513                                           SelectionDAG &DAG) {
18514  SDValue Cond = Op.getOperand(0);
18515  SDValue LHS = Op.getOperand(1);
18516  SDValue RHS = Op.getOperand(2);
18517  MVT VT = Op.getSimpleValueType();
18518
  // Only non-legal VSELECTs reach this lowering; convert those into generic
  // shuffles and re-use the shuffle lowering path for blends.
18521  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18522    SmallVector<int, 32> Mask;
18523    if (createShuffleMaskFromVSELECT(Mask, Cond))
18524      return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18525  }
18526
18527  return SDValue();
18528}
18529
18530SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18531  SDValue Cond = Op.getOperand(0);
18532  SDValue LHS = Op.getOperand(1);
18533  SDValue RHS = Op.getOperand(2);
18534
18535  // A vselect where all conditions and data are constants can be optimized into
18536  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18537  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18538      ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18539      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18540    return SDValue();
18541
18542  // Try to lower this to a blend-style vector shuffle. This can handle all
18543  // constant condition cases.
18544  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18545    return BlendOp;
18546
  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18548  // with patterns on the mask registers on AVX-512.
18549  MVT CondVT = Cond.getSimpleValueType();
18550  unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18551  if (CondEltSize == 1)
18552    return Op;
18553
18554  // Variable blends are only legal from SSE4.1 onward.
18555  if (!Subtarget.hasSSE41())
18556    return SDValue();
18557
18558  SDLoc dl(Op);
18559  MVT VT = Op.getSimpleValueType();
18560  unsigned EltSize = VT.getScalarSizeInBits();
18561  unsigned NumElts = VT.getVectorNumElements();
18562
18563  // Expand v32i16/v64i8 without BWI.
18564  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18565    return SDValue();
18566
18567  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18568  // into an i1 condition so that we can use the mask-based 512-bit blend
18569  // instructions.
18570  if (VT.getSizeInBits() == 512) {
18571    // Build a mask by testing the condition against zero.
18572    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18573    SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18574                                DAG.getConstant(0, dl, CondVT),
18575                                ISD::SETNE);
18576    // Now return a new VSELECT using the mask.
18577    return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18578  }
18579
18580  // SEXT/TRUNC cases where the mask doesn't match the destination size.
18581  if (CondEltSize != EltSize) {
18582    // If we don't have a sign splat, rely on the expansion.
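    // (A full sign splat means every condition element is all-ones or
    // all-zeros, so resizing it preserves the per-element select.)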
18583    if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18584      return SDValue();
18585
18586    MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18587    MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18588    Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18589    return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18590  }
18591
18592  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return
18594  // a null value.
18595  switch (VT.SimpleTy) {
18596  default:
18597    // Most of the vector types have blends past SSE4.1.
18598    return Op;
18599
18600  case MVT::v32i8:
18601    // The byte blends for AVX vectors were introduced only in AVX2.
18602    if (Subtarget.hasAVX2())
18603      return Op;
18604
18605    return SDValue();
18606
18607  case MVT::v8i16:
18608  case MVT::v16i16: {
18609    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18610    MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18611    Cond = DAG.getBitcast(CastVT, Cond);
18612    LHS = DAG.getBitcast(CastVT, LHS);
18613    RHS = DAG.getBitcast(CastVT, RHS);
18614    SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18615    return DAG.getBitcast(VT, Select);
18616  }
18617  }
18618}
18619
18620static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18621  MVT VT = Op.getSimpleValueType();
18622  SDValue Vec = Op.getOperand(0);
18623  SDValue Idx = Op.getOperand(1);
18624  assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18625  SDLoc dl(Op);
18626
18627  if (!Vec.getSimpleValueType().is128BitVector())
18628    return SDValue();
18629
18630  if (VT.getSizeInBits() == 8) {
18631    // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18632    // we're going to zero extend the register or fold the store.
18633    if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
18634        !MayFoldIntoStore(Op))
18635      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18636                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18637                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));
18638
18639    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
18640    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18641                                  DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18642    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18643  }
18644
18645  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to an FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
18651    if (!Op.hasOneUse())
18652      return SDValue();
18653    SDNode *User = *Op.getNode()->use_begin();
18654    if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18655        (User->getOpcode() != ISD::BITCAST ||
18656         User->getValueType(0) != MVT::i32))
18657      return SDValue();
18658    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18659                                  DAG.getBitcast(MVT::v4i32, Vec), Idx);
18660    return DAG.getBitcast(MVT::f32, Extract);
18661  }
18662
  if (VT == MVT::i32 || VT == MVT::i64)
    return Op;
18665
18666  return SDValue();
18667}
18668
18669/// Extract one bit from mask vector, like v16i1 or v8i1.
18670/// AVX-512 feature.
18671static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18672                                        const X86Subtarget &Subtarget) {
18673  SDValue Vec = Op.getOperand(0);
18674  SDLoc dl(Vec);
18675  MVT VecVT = Vec.getSimpleValueType();
18676  SDValue Idx = Op.getOperand(1);
18677  auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18678  MVT EltVT = Op.getSimpleValueType();
18679
18680  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18681         "Unexpected vector type in ExtractBitFromMaskVector");
18682
  // A variable index can't be handled in mask registers,
  // so extend the vector to VR512/VR128.
18685  if (!IdxC) {
18686    unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
    // than extending to 128/256 bits.
18689    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18690    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18691    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18692    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18693    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18694  }
18695
18696  unsigned IdxVal = IdxC->getZExtValue();
18697  if (IdxVal == 0) // the operation is legal
18698    return Op;
18699
18700  // Extend to natively supported kshift.
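  // E.g. extracting bit 3 of a v4i1 without DQI widens the mask to v16i1,
  // shifts it right by 3 with KSHIFTR, and extracts element 0.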
18701  unsigned NumElems = VecVT.getVectorNumElements();
18702  MVT WideVecVT = VecVT;
18703  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
18704    WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18705    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
18706                      DAG.getUNDEF(WideVecVT), Vec,
18707                      DAG.getIntPtrConstant(0, dl));
18708  }
18709
18710  // Use kshiftr instruction to move to the lower element.
18711  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
18712                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18713
18714  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18715                     DAG.getIntPtrConstant(0, dl));
18716}
18717
18718SDValue
18719X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18720                                           SelectionDAG &DAG) const {
18721  SDLoc dl(Op);
18722  SDValue Vec = Op.getOperand(0);
18723  MVT VecVT = Vec.getSimpleValueType();
18724  SDValue Idx = Op.getOperand(1);
18725  auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18726
18727  if (VecVT.getVectorElementType() == MVT::i1)
18728    return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18729
18730  if (!IdxC) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // The IACA tool was used to get the performance estimates
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18735    //
18736    // example : extractelement <16 x i8> %a, i32 %i
18737    //
18738    // Block Throughput: 3.00 Cycles
18739    // Throughput Bottleneck: Port5
18740    //
18741    // | Num Of |   Ports pressure in cycles  |    |
18742    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
18743    // ---------------------------------------------
18744    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
18745    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
18746    // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
18747    // Total Num Of Uops: 4
18748    //
18749    //
18750    // Block Throughput: 1.00 Cycles
18751    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18752    //
18753    // |    |  Ports pressure in cycles   |  |
18754    // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
18755    // ---------------------------------------------------------
18756    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18757    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
18758    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
18759    // Total Num Of Uops: 4
18760
18761    return SDValue();
18762  }
18763
18764  unsigned IdxVal = IdxC->getZExtValue();
18765
18766  // If this is a 256-bit vector result, first extract the 128-bit vector and
18767  // then extract the element from the 128-bit vector.
18768  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18769    // Get the 128-bit vector.
18770    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18771    MVT EltVT = VecVT.getVectorElementType();
18772
18773    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18774    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18775
18776    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18777    // this can be done with a mask.
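    // E.g. a v8f32 extract at index 5 pulls out the upper 128-bit half and
    // the index becomes 5 & 3 == 1 within that half.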
18778    IdxVal &= ElemsPerChunk - 1;
18779    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18780                       DAG.getIntPtrConstant(IdxVal, dl));
18781  }
18782
18783  assert(VecVT.is128BitVector() && "Unexpected vector length");
18784
18785  MVT VT = Op.getSimpleValueType();
18786
18787  if (VT.getSizeInBits() == 16) {
18788    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18789    // we're going to zero extend the register or fold the store (SSE41 only).
18790    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
18791        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
18792      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18793                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18794                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));
18795
18796    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18797                                  DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18798    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18799  }
18800
18801  if (Subtarget.hasSSE41())
18802    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18803      return Res;
18804
  // TODO: We only extract a single element from v16i8; we can probably afford
  // to be more aggressive here before using the default approach of spilling
  // to the stack.
18808  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
18809    // Extract either the lowest i32 or any i16, and extract the sub-byte.
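    // E.g. byte 6 of a v16i8 is word 3 of the v8i16 bitcast (no shift needed),
    // while byte 3 is dword 0 shifted right by 24.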
18810    int DWordIdx = IdxVal / 4;
18811    if (DWordIdx == 0) {
18812      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18813                                DAG.getBitcast(MVT::v4i32, Vec),
18814                                DAG.getIntPtrConstant(DWordIdx, dl));
18815      int ShiftVal = (IdxVal % 4) * 8;
18816      if (ShiftVal != 0)
18817        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18818                          DAG.getConstant(ShiftVal, dl, MVT::i8));
18819      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18820    }
18821
18822    int WordIdx = IdxVal / 2;
18823    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18824                              DAG.getBitcast(MVT::v8i16, Vec),
18825                              DAG.getIntPtrConstant(WordIdx, dl));
18826    int ShiftVal = (IdxVal % 2) * 8;
18827    if (ShiftVal != 0)
18828      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18829                        DAG.getConstant(ShiftVal, dl, MVT::i8));
18830    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18831  }
18832
18833  if (VT.getSizeInBits() == 32) {
18834    if (IdxVal == 0)
18835      return Op;
18836
18837    // SHUFPS the element to the lowest double word, then movss.
18838    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
18839    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18840    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18841                       DAG.getIntPtrConstant(0, dl));
18842  }
18843
18844  if (VT.getSizeInBits() == 64) {
18845    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18846    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18847    //        to match extract_elt for f64.
18848    if (IdxVal == 0)
18849      return Op;
18850
18851    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note that if the lower 64 bits of the UNPCKHPD result are then stored
    // to f64mem, the whole operation is folded into a single MOVHPDmr.
18854    int Mask[2] = { 1, -1 };
18855    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18856    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18857                       DAG.getIntPtrConstant(0, dl));
18858  }
18859
18860  return SDValue();
18861}
18862
18863/// Insert one bit to mask vector, like v16i1 or v8i1.
18864/// AVX-512 feature.
18865static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18866                                     const X86Subtarget &Subtarget) {
18867  SDLoc dl(Op);
18868  SDValue Vec = Op.getOperand(0);
18869  SDValue Elt = Op.getOperand(1);
18870  SDValue Idx = Op.getOperand(2);
18871  MVT VecVT = Vec.getSimpleValueType();
18872
18873  if (!isa<ConstantSDNode>(Idx)) {
    // Non-constant index. Extend the source and destination,
    // insert the element and then truncate the result.
18876    unsigned NumElts = VecVT.getVectorNumElements();
18877    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18878    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18879    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18880      DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18881      DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18882    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18883  }
18884
18885  // Copy into a k-register, extract to v1i1 and insert_subvector.
18886  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18887  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18888}
18889
18890SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18891                                                  SelectionDAG &DAG) const {
18892  MVT VT = Op.getSimpleValueType();
18893  MVT EltVT = VT.getVectorElementType();
18894  unsigned NumElts = VT.getVectorNumElements();
18895  unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18896
18897  if (EltVT == MVT::i1)
18898    return InsertBitToMaskVector(Op, DAG, Subtarget);
18899
18900  SDLoc dl(Op);
18901  SDValue N0 = Op.getOperand(0);
18902  SDValue N1 = Op.getOperand(1);
18903  SDValue N2 = Op.getOperand(2);
18904  auto *N2C = dyn_cast<ConstantSDNode>(N2);
18905
18906  if (!N2C) {
    // With variable insertion indices we're usually better off spilling to
    // the stack, but AVX512 can use a variable compare+select by comparing
    // against all possible vector indices, and FP insertion has less
    // gpr->simd traffic.
18910    if (!(Subtarget.hasBWI() ||
18911          (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18912          (Subtarget.hasSSE41() && VT.isFloatingPoint())))
18913      return SDValue();
18914
18915    MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18916    MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18917    SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18918    SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18919    SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18920
18921    SmallVector<SDValue, 16> RawIndices;
18922    for (unsigned I = 0; I != NumElts; ++I)
18923      RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18924    SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18925
18926    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
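    // E.g. for v4i32 this compares the splatted runtime index against
    // <0,1,2,3>; only the matching lane takes the splatted scalar, the rest
    // keep their original N0 value.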
18927    return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18928                           ISD::CondCode::SETEQ);
18929  }
18930
18931  if (N2C->getAPIntValue().uge(NumElts))
18932    return SDValue();
18933  uint64_t IdxVal = N2C->getZExtValue();
18934
18935  bool IsZeroElt = X86::isZeroNode(N1);
18936  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18937
  // If we are inserting an element, see if we can do this more efficiently
  // with a blend shuffle against a rematerializable vector than with a costly
  // integer insertion.
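  // E.g. inserting zero into lane 2 of a v4i32 becomes a shuffle of N0 with a
  // zero vector using the mask <0,1,6,3>, where index 6 selects lane 2 of the
  // zero operand.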
18941  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
18942      (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
18943    SmallVector<int, 8> BlendMask;
18944    for (unsigned i = 0; i != NumElts; ++i)
18945      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18946    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18947                                  : getOnesVector(VT, DAG, dl);
18948    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18949  }
18950
18951  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18952  // into that, and then insert the subvector back into the result.
18953  if (VT.is256BitVector() || VT.is512BitVector()) {
18954    // With a 256-bit vector, we can insert into the zero element efficiently
18955    // using a blend if we have AVX or AVX2 and the right data type.
18956    if (VT.is256BitVector() && IdxVal == 0) {
18957      // TODO: It is worthwhile to cast integer to floating point and back
18958      // and incur a domain crossing penalty if that's what we'll end up
18959      // doing anyway after extracting to a 128-bit vector.
18960      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18961          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
18962        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18963        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18964                           DAG.getTargetConstant(1, dl, MVT::i8));
18965      }
18966    }
18967
18968    // Get the desired 128-bit vector chunk.
18969    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18970
18971    // Insert the element into the desired chunk.
18972    unsigned NumEltsIn128 = 128 / EltSizeInBits;
18973    assert(isPowerOf2_32(NumEltsIn128));
    // Since NumEltsIn128 is a power of 2 we can use a mask instead of a modulo.
18975    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18976
18977    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18978                    DAG.getIntPtrConstant(IdxIn128, dl));
18979
18980    // Insert the changed part back into the bigger vector
18981    return insert128BitVector(N0, V, IdxVal, DAG, dl);
18982  }
18983  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18984
18985  // This will be just movd/movq/movss/movsd.
18986  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18987    if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18988        EltVT == MVT::i64) {
18989      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18990      return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18991    }
18992
18993    // We can't directly insert an i8 or i16 into a vector, so zero extend
18994    // it to i32 first.
18995    if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18996      N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18997      MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18998      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18999      N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19000      return DAG.getBitcast(VT, N1);
19001    }
19002  }
19003
  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
  // argument. SSE41 is required for pinsrb.
19006  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19007    unsigned Opc;
19008    if (VT == MVT::v8i16) {
19009      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19010      Opc = X86ISD::PINSRW;
19011    } else {
19012      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19013      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19014      Opc = X86ISD::PINSRB;
19015    }
19016
19017    assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19018    N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19019    N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19020    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19021  }
19022
19023  if (Subtarget.hasSSE41()) {
19024    if (EltVT == MVT::f32) {
19025      // Bits [7:6] of the constant are the source select. This will always be
19026      //   zero here. The DAG Combiner may combine an extract_elt index into
19027      //   these bits. For example (insert (extract, 3), 2) could be matched by
19028      //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19029      // Bits [5:4] of the constant are the destination select. This is the
19030      //   value of the incoming immediate.
19031      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19032      //   combine either bitwise AND or insert of float 0.0 to set these bits.
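      // E.g. an insertion into lane 2 uses the immediate (2 << 4) == 0x20:
      // source select 0, destination select 2, and an all-zero zero mask.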
19033
19034      bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19035      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
19036        // If this is an insertion of 32-bits into the low 32-bits of
19037        // a vector, we prefer to generate a blend with immediate rather
19038        // than an insertps. Blends are simpler operations in hardware and so
19039        // will always have equal or better performance than insertps.
19040        // But if optimizing for size and there's a load folding opportunity,
19041        // generate insertps because blendps does not have a 32-bit memory
19042        // operand form.
19043        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19044        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19045                           DAG.getTargetConstant(1, dl, MVT::i8));
19046      }
      // Create this as a scalar to vector.
19048      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19049      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19050                         DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19051    }
19052
19053    // PINSR* works with constant index.
19054    if (EltVT == MVT::i32 || EltVT == MVT::i64)
19055      return Op;
19056  }
19057
19058  return SDValue();
19059}
19060
19061static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19062                                     SelectionDAG &DAG) {
19063  SDLoc dl(Op);
19064  MVT OpVT = Op.getSimpleValueType();
19065
  // It's always cheaper to replace an xor+movd with xorps, and it simplifies
  // further combines.
19068  if (X86::isZeroNode(Op.getOperand(0)))
19069    return getZeroVector(OpVT, Subtarget, DAG, dl);
19070
19071  // If this is a 256-bit vector result, first insert into a 128-bit
19072  // vector and then insert into the 256-bit vector.
19073  if (!OpVT.is128BitVector()) {
19074    // Insert into a 128-bit vector.
19075    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19076    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19077                                 OpVT.getVectorNumElements() / SizeFactor);
19078
19079    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19080
19081    // Insert the 128-bit vector.
19082    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19083  }
19084  assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19085         "Expected an SSE type!");
19086
19087  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
19088  if (OpVT == MVT::v4i32)
19089    return Op;
19090
19091  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19092  return DAG.getBitcast(
19093      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19094}
19095
19096// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
19097// simple superregister reference or explicit instructions to insert
19098// the upper bits of a vector.
19099static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19100                                     SelectionDAG &DAG) {
19101  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19102
19103  return insert1BitVector(Op, DAG, Subtarget);
19104}
19105
19106static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19107                                      SelectionDAG &DAG) {
19108  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19109         "Only vXi1 extract_subvectors need custom lowering");
19110
19111  SDLoc dl(Op);
19112  SDValue Vec = Op.getOperand(0);
19113  uint64_t IdxVal = Op.getConstantOperandVal(1);
19114
19115  if (IdxVal == 0) // the operation is legal
19116    return Op;
19117
19118  MVT VecVT = Vec.getSimpleValueType();
19119  unsigned NumElems = VecVT.getVectorNumElements();
19120
19121  // Extend to natively supported kshift.
19122  MVT WideVecVT = VecVT;
19123  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19124    WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19125    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19126                      DAG.getUNDEF(WideVecVT), Vec,
19127                      DAG.getIntPtrConstant(0, dl));
19128  }
19129
19130  // Shift to the LSB.
19131  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19132                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19133
19134  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19135                     DAG.getIntPtrConstant(0, dl));
19136}
19137
19138// Returns the appropriate wrapper opcode for a global reference.
19139unsigned X86TargetLowering::getGlobalWrapperKind(
19140    const GlobalValue *GV, const unsigned char OpFlags) const {
19141  // References to absolute symbols are never PC-relative.
19142  if (GV && GV->isAbsoluteSymbolRef())
19143    return X86ISD::Wrapper;
19144
19145  CodeModel::Model M = getTargetMachine().getCodeModel();
19146  if (Subtarget.isPICStyleRIPRel() &&
19147      (M == CodeModel::Small || M == CodeModel::Kernel))
19148    return X86ISD::WrapperRIP;
19149
19150  // GOTPCREL references must always use RIP.
19151  if (OpFlags == X86II::MO_GOTPCREL)
19152    return X86ISD::WrapperRIP;
19153
19154  return X86ISD::Wrapper;
19155}
19156
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
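// For example, a constant-pool reference becomes a TargetConstantPool wrapped
// in an X86ISD::Wrapper (or WrapperRIP) node, optionally added to the PIC
// base register below.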
19163SDValue
19164X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19165  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19166
19167  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19168  // global base reg.
19169  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19170
19171  auto PtrVT = getPointerTy(DAG.getDataLayout());
19172  SDValue Result = DAG.getTargetConstantPool(
19173      CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19174  SDLoc DL(CP);
19175  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19176  // With PIC, the address is actually $g + Offset.
19177  if (OpFlag) {
19178    Result =
19179        DAG.getNode(ISD::ADD, DL, PtrVT,
19180                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19181  }
19182
19183  return Result;
19184}
19185
19186SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19187  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19188
19189  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19190  // global base reg.
19191  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19192
19193  auto PtrVT = getPointerTy(DAG.getDataLayout());
19194  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19195  SDLoc DL(JT);
19196  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19197
19198  // With PIC, the address is actually $g + Offset.
19199  if (OpFlag)
19200    Result =
19201        DAG.getNode(ISD::ADD, DL, PtrVT,
19202                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19203
19204  return Result;
19205}
19206
19207SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19208                                               SelectionDAG &DAG) const {
19209  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19210}
19211
19212SDValue
19213X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19214  // Create the TargetBlockAddressAddress node.
19215  unsigned char OpFlags =
19216    Subtarget.classifyBlockAddressReference();
19217  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19218  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19219  SDLoc dl(Op);
19220  auto PtrVT = getPointerTy(DAG.getDataLayout());
19221  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19222  Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
19223
19224  // With PIC, the address is actually $g + Offset.
19225  if (isGlobalRelativeToPICBase(OpFlags)) {
19226    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19227                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19228  }
19229
19230  return Result;
19231}
19232
19233/// Creates target global address or external symbol nodes for calls or
19234/// other uses.
19235SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19236                                                 bool ForCall) const {
19237  // Unpack the global address or external symbol.
19238  const SDLoc &dl = SDLoc(Op);
19239  const GlobalValue *GV = nullptr;
19240  int64_t Offset = 0;
19241  const char *ExternalSym = nullptr;
19242  if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19243    GV = G->getGlobal();
19244    Offset = G->getOffset();
19245  } else {
19246    const auto *ES = cast<ExternalSymbolSDNode>(Op);
19247    ExternalSym = ES->getSymbol();
19248  }
19249
19250  // Calculate some flags for address lowering.
19251  const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19252  unsigned char OpFlags;
19253  if (ForCall)
19254    OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19255  else
19256    OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19257  bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19258  bool NeedsLoad = isGlobalStubReference(OpFlags);
19259
19260  CodeModel::Model M = DAG.getTarget().getCodeModel();
19261  auto PtrVT = getPointerTy(DAG.getDataLayout());
19262  SDValue Result;
19263
19264  if (GV) {
19265    // Create a target global address if this is a global. If possible, fold the
19266    // offset into the global address reference. Otherwise, ADD it on later.
19267    // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19268    // allowed because if the address of foo is 0, the ELF R_X86_64_32
19269    // relocation will compute to a negative value, which is invalid.
19270    int64_t GlobalOffset = 0;
19271    if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19272        X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19273      std::swap(GlobalOffset, Offset);
19274    }
19275    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19276  } else {
19277    // If this is not a global address, this must be an external symbol.
19278    Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19279  }
19280
19281  // If this is a direct call, avoid the wrapper if we don't need to do any
19282  // loads or adds. This allows SDAG ISel to match direct calls.
19283  if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19284    return Result;
19285
19286  Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19287
19288  // With PIC, the address is actually $g + Offset.
19289  if (HasPICReg) {
19290    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19291                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19292  }
19293
19294  // For globals that require a load from a stub to get the address, emit the
19295  // load.
19296  if (NeedsLoad)
19297    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19298                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19299
19300  // If there was a non-zero offset that we didn't fold, create an explicit
19301  // addition for it.
19302  if (Offset != 0)
19303    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19304                         DAG.getConstant(Offset, dl, PtrVT));
19305
19306  return Result;
19307}
19308
19309SDValue
19310X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19311  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19312}
19313
19314static SDValue
19315GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
19316           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
19317           unsigned char OperandFlags, bool LocalDynamic = false) {
19318  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19319  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19320  SDLoc dl(GA);
19321  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19322                                           GA->getValueType(0),
19323                                           GA->getOffset(),
19324                                           OperandFlags);
19325
19326  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
19327                                           : X86ISD::TLSADDR;
19328
19329  if (InFlag) {
19330    SDValue Ops[] = { Chain,  TGA, *InFlag };
19331    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19332  } else {
19333    SDValue Ops[]  = { Chain, TGA };
19334    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19335  }
19336
  // TLSADDR will be codegen'ed as a call. Inform MFI that the function has
  // calls.
19338  MFI.setAdjustsStack(true);
19339  MFI.setHasCalls(true);
19340
19341  SDValue Flag = Chain.getValue(1);
19342  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
19343}
19344
19345// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19346static SDValue
19347LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19348                                const EVT PtrVT) {
19349  SDValue InFlag;
19350  SDLoc dl(GA);  // ? function entry point might be better
19351  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19352                                   DAG.getNode(X86ISD::GlobalBaseReg,
19353                                               SDLoc(), PtrVT), InFlag);
19354  InFlag = Chain.getValue(1);
19355
19356  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
19357}
19358
19359// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19360static SDValue
19361LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19362                                const EVT PtrVT) {
19363  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19364                    X86::RAX, X86II::MO_TLSGD);
19365}
19366
19367// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19368static SDValue
19369LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19370                                 const EVT PtrVT) {
19371  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19372                    X86::EAX, X86II::MO_TLSGD);
19373}
19374
19375static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19376                                           SelectionDAG &DAG, const EVT PtrVT,
19377                                           bool Is64Bit, bool Is64BitLP64) {
19378  SDLoc dl(GA);
19379
19380  // Get the start address of the TLS block for this module.
19381  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19382      .getInfo<X86MachineFunctionInfo>();
19383  MFI->incNumLocalDynamicTLSAccesses();
19384
19385  SDValue Base;
19386  if (Is64Bit) {
19387    unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19388    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
19389                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
19390  } else {
19391    SDValue InFlag;
19392    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19393        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
19394    InFlag = Chain.getValue(1);
19395    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
19396                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
19397  }
19398
19399  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19400  // of Base.
19401
19402  // Build x@dtpoff.
19403  unsigned char OperandFlags = X86II::MO_DTPOFF;
19404  unsigned WrapperKind = X86ISD::Wrapper;
19405  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19406                                           GA->getValueType(0),
19407                                           GA->getOffset(), OperandFlags);
19408  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19409
19410  // Add x@dtpoff with the base.
19411  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19412}
19413
19414// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19415static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19416                                   const EVT PtrVT, TLSModel::Model model,
19417                                   bool is64Bit, bool isPIC) {
19418  SDLoc dl(GA);
19419
19420  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19421  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
19422                                                         is64Bit ? 257 : 256));
19423
19424  SDValue ThreadPointer =
19425      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19426                  MachinePointerInfo(Ptr));
19427
19428  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initial exec.
19431  unsigned WrapperKind = X86ISD::Wrapper;
19432  if (model == TLSModel::LocalExec) {
19433    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19434  } else if (model == TLSModel::InitialExec) {
19435    if (is64Bit) {
19436      OperandFlags = X86II::MO_GOTTPOFF;
19437      WrapperKind = X86ISD::WrapperRIP;
19438    } else {
19439      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19440    }
19441  } else {
19442    llvm_unreachable("Unexpected model");
19443  }
19444
19445  // emit "addl x@ntpoff,%eax" (local exec)
19446  // or "addl x@indntpoff,%eax" (initial exec)
  // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
19448  SDValue TGA =
19449      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19450                                 GA->getOffset(), OperandFlags);
19451  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19452
19453  if (model == TLSModel::InitialExec) {
19454    if (isPIC && !is64Bit) {
19455      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19456                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19457                           Offset);
19458    }
19459
19460    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19461                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19462  }
19463
19464  // The address of the thread local variable is the add of the thread
19465  // pointer with the offset of the variable.
19466  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19467}
19468
19469SDValue
19470X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19471
19472  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19473
19474  if (DAG.getTarget().useEmulatedTLS())
19475    return LowerToTLSEmulatedModel(GA, DAG);
19476
19477  const GlobalValue *GV = GA->getGlobal();
19478  auto PtrVT = getPointerTy(DAG.getDataLayout());
19479  bool PositionIndependent = isPositionIndependent();
19480
19481  if (Subtarget.isTargetELF()) {
19482    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19483    switch (model) {
19484      case TLSModel::GeneralDynamic:
19485        if (Subtarget.is64Bit()) {
19486          if (Subtarget.isTarget64BitLP64())
19487            return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19488          return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19489        }
19490        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19491      case TLSModel::LocalDynamic:
19492        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19493                                           Subtarget.isTarget64BitLP64());
19494      case TLSModel::InitialExec:
19495      case TLSModel::LocalExec:
19496        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19497                                   PositionIndependent);
19498    }
19499    llvm_unreachable("Unknown TLS model.");
19500  }
19501
19502  if (Subtarget.isTargetDarwin()) {
19503    // Darwin only has one model of TLS.  Lower to that.
19504    unsigned char OpFlag = 0;
19505    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
19506                           X86ISD::WrapperRIP : X86ISD::Wrapper;
19507
19508    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19509    // global base reg.
19510    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19511    if (PIC32)
19512      OpFlag = X86II::MO_TLVP_PIC_BASE;
19513    else
19514      OpFlag = X86II::MO_TLVP;
19515    SDLoc DL(Op);
19516    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19517                                                GA->getValueType(0),
19518                                                GA->getOffset(), OpFlag);
19519    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19520
19521    // With PIC32, the address is actually $g + Offset.
19522    if (PIC32)
19523      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19524                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19525                           Offset);
19526
    // Lowering the machine ISD will make sure everything is in the right
    // location.
19529    SDValue Chain = DAG.getEntryNode();
19530    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19531    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19532    SDValue Args[] = { Chain, Offset };
19533    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19534    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
19535                               DAG.getIntPtrConstant(0, DL, true),
19536                               Chain.getValue(1), DL);
19537
    // TLSCALL will be codegen'ed as a call. Inform MFI that the function has
    // calls.
19539    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19540    MFI.setAdjustsStack(true);
19541
19542    // And our return value (tls address) is in the standard call return value
19543    // location.
19544    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19545    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19546  }
19547
19548  if (Subtarget.isOSWindows()) {
    // Just use the implicit TLS architecture.
19550    // Need to generate something similar to:
19551    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19552    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
19554    //   mov     rcx, qword [rdx+rcx*8]
19555    //   mov     eax, .tls$:tlsvar
19556    //   [rax+rcx] contains the address
19557    // Windows 64bit: gs:0x58
19558    // Windows 32bit: fs:__tls_array
19559
19560    SDLoc dl(GA);
19561    SDValue Chain = DAG.getEntryNode();
19562
19563    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19564    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19565    // use its literal value of 0x2C.
19566    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
19567                                        ? Type::getInt8PtrTy(*DAG.getContext(),
19568                                                             256)
19569                                        : Type::getInt32PtrTy(*DAG.getContext(),
19570                                                              257));
19571
19572    SDValue TlsArray = Subtarget.is64Bit()
19573                           ? DAG.getIntPtrConstant(0x58, dl)
19574                           : (Subtarget.isTargetWindowsGNU()
19575                                  ? DAG.getIntPtrConstant(0x2C, dl)
19576                                  : DAG.getExternalSymbol("_tls_array", PtrVT));
19577
19578    SDValue ThreadPointer =
19579        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19580
19581    SDValue res;
19582    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19583      res = ThreadPointer;
19584    } else {
19585      // Load the _tls_index variable
19586      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19587      if (Subtarget.is64Bit())
19588        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19589                             MachinePointerInfo(), MVT::i32);
19590      else
19591        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19592
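      // Scale the index by the pointer size (a shift by 3 on 64-bit targets)
      // to form a byte offset into the array of per-module TLS blocks.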
19593      const DataLayout &DL = DAG.getDataLayout();
19594      SDValue Scale =
19595          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19596      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19597
19598      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19599    }
19600
19601    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19602
    // Get the offset of the start of the .tls section.
19604    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19605                                             GA->getValueType(0),
19606                                             GA->getOffset(), X86II::MO_SECREL);
19607    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19608
19609    // The address of the thread local variable is the add of the thread
19610    // pointer with the offset of the variable.
19611    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19612  }
19613
19614  llvm_unreachable("TLS not implemented for this target.");
19615}
19616
19617/// Lower SRA_PARTS and friends, which return two i32 values
19618/// and take a 2 x i32 value to shift plus a shift amount.
19619/// TODO: Can this be moved to general expansion code?
19620static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19621  SDValue Lo, Hi;
19622  DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19623  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19624}
19625
19626static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
19627                                SelectionDAG &DAG) {
19628  MVT VT = Op.getSimpleValueType();
19629  assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
19630         "Unexpected funnel shift opcode!");
19631
19632  SDLoc DL(Op);
19633  SDValue Op0 = Op.getOperand(0);
19634  SDValue Op1 = Op.getOperand(1);
19635  SDValue Amt = Op.getOperand(2);
19636
19637  bool IsFSHR = Op.getOpcode() == ISD::FSHR;
19638
19639  if (VT.isVector()) {
19640    assert(Subtarget.hasVBMI2() && "Expected VBMI2");
19641
19642    if (IsFSHR)
19643      std::swap(Op0, Op1);
19644
    // With AVX512, but not VLX, we need to widen to get a 512-bit result type.
19646    if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
19647      Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
19648      Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
19649    }
19650
19651    SDValue Funnel;
19652    APInt APIntShiftAmt;
19653    MVT ResultVT = Op0.getSimpleValueType();
19654    if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
19655      uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
19656      Funnel =
19657          DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
19658                      Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19659    } else {
19660      if (!Subtarget.hasVLX() && !VT.is512BitVector())
19661        Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
19662      Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
19663                           ResultVT, Op0, Op1, Amt);
19664    }
19665    if (!Subtarget.hasVLX() && !VT.is512BitVector())
19666      Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
19667    return Funnel;
19668  }
19669  assert(
19670      (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
19671      "Unexpected funnel shift type!");
19672
19673  // Expand slow SHLD/SHRD cases if we are not optimizing for size.
19674  bool OptForSize = DAG.shouldOptForSize();
19675  bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
19676
19677  // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
19678  // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
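  // E.g. for i8, fshl(x, y, 3) becomes (((x << 8) | zext(y)) << 3) >> 8,
  // truncated back to i8, i.e. (x << 3) | (y >> 5).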
19679  if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
19680      !isa<ConstantSDNode>(Amt)) {
19681    unsigned EltSizeInBits = VT.getScalarSizeInBits();
19682    SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
19683    SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
19684    Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
19685    Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
19686    Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
19687    SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
19688    Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
19689    if (IsFSHR) {
19690      Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
19691    } else {
19692      Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
19693      Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
19694    }
19695    return DAG.getZExtOrTrunc(Res, DL, VT);
19696  }
19697
19698  if (VT == MVT::i8 || ExpandFunnel)
19699    return SDValue();
19700
19701  // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
19702  if (VT == MVT::i16) {
19703    Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
19704                      DAG.getConstant(15, DL, Amt.getValueType()));
19705    unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
19706    return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
19707  }
19708
19709  return Op;
19710}
19711
19712// Try to use a packed vector operation to handle i64 on 32-bit targets when
19713// AVX512DQ is enabled.
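// For example, (sint_to_fp i64 -> f64) is packed into element 0 of a v8i64
// (v4i64 with VLX), converted with the wide vector instruction, and element 0
// of the result is extracted.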
19714static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
19715                                        const X86Subtarget &Subtarget) {
19716  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19717          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19718          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19719          Op.getOpcode() == ISD::UINT_TO_FP) &&
19720         "Unexpected opcode!");
19721  bool IsStrict = Op->isStrictFPOpcode();
19722  unsigned OpNo = IsStrict ? 1 : 0;
19723  SDValue Src = Op.getOperand(OpNo);
19724  MVT SrcVT = Src.getSimpleValueType();
19725  MVT VT = Op.getSimpleValueType();
19726
  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
      (VT != MVT::f32 && VT != MVT::f64))
19729    return SDValue();
19730
19731  // Pack the i64 into a vector, do the operation and extract.
19732
  // Using a 256-bit (or wider) source ensures the result is a full 128 bits
  // for the f32 case.
19734  unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19735  MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19736  MVT VecVT = MVT::getVectorVT(VT, NumElts);
19737
19738  SDLoc dl(Op);
19739  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19740  if (IsStrict) {
19741    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19742                                 {Op.getOperand(0), InVec});
19743    SDValue Chain = CvtVec.getValue(1);
19744    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19745                                DAG.getIntPtrConstant(0, dl));
19746    return DAG.getMergeValues({Value, Chain}, dl);
19747  }
19748
19749  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19750
19751  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19752                     DAG.getIntPtrConstant(0, dl));
19753}
19754
19755static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19756                          const X86Subtarget &Subtarget) {
19757  switch (Opcode) {
19758    case ISD::SINT_TO_FP:
19759      // TODO: Handle wider types with AVX/AVX512.
19760      if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19761        return false;
19762      // CVTDQ2PS or (V)CVTDQ2PD
19763      return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19764
19765    case ISD::UINT_TO_FP:
19766      // TODO: Handle wider types and i64 elements.
19767      if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19768        return false;
19769      // VCVTUDQ2PS or VCVTUDQ2PD
19770      return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19771
19772    default:
19773      return false;
19774  }
19775}
19776
19777/// Given a scalar cast operation that is extracted from a vector, try to
19778/// vectorize the cast op followed by extraction. This will avoid an expensive
19779/// round-trip between XMM and GPR.
19780static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
19781                                      const X86Subtarget &Subtarget) {
19782  // TODO: This could be enhanced to handle smaller integer types by peeking
19783  // through an extend.
19784  SDValue Extract = Cast.getOperand(0);
19785  MVT DestVT = Cast.getSimpleValueType();
19786  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19787      !isa<ConstantSDNode>(Extract.getOperand(1)))
19788    return SDValue();
19789
19790  // See if we have a 128-bit vector cast op for this type of cast.
19791  SDValue VecOp = Extract.getOperand(0);
19792  MVT FromVT = VecOp.getSimpleValueType();
19793  unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19794  MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19795  MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19796  if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19797    return SDValue();
19798
19799  // If we are extracting from a non-zero element, first shuffle the source
19800  // vector to allow extracting from element zero.
19801  SDLoc DL(Cast);
19802  if (!isNullConstant(Extract.getOperand(1))) {
19803    SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19804    Mask[0] = Extract.getConstantOperandVal(1);
19805    VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19806  }
19807  // If the source vector is wider than 128-bits, extract the low part. Do not
19808  // create an unnecessarily wide vector cast op.
19809  if (FromVT != Vec128VT)
19810    VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19811
19812  // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19813  // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19814  SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19815  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19816                     DAG.getIntPtrConstant(0, DL));
19817}
19818
19819/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19820/// try to vectorize the cast ops. This will avoid an expensive round-trip
19821/// between XMM and GPR.
19822static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
19823                                const X86Subtarget &Subtarget) {
19824  // TODO: Allow FP_TO_UINT.
19825  SDValue CastToInt = CastToFP.getOperand(0);
19826  MVT VT = CastToFP.getSimpleValueType();
19827  if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19828    return SDValue();
19829
19830  MVT IntVT = CastToInt.getSimpleValueType();
19831  SDValue X = CastToInt.getOperand(0);
19832  MVT SrcVT = X.getSimpleValueType();
19833  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19834    return SDValue();
19835
19836  // See if we have 128-bit vector cast instructions for this type of cast.
19837  // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19838  if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19839      IntVT != MVT::i32)
19840    return SDValue();
19841
19842  unsigned SrcSize = SrcVT.getSizeInBits();
19843  unsigned IntSize = IntVT.getSizeInBits();
19844  unsigned VTSize = VT.getSizeInBits();
19845  MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19846  MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19847  MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19848
19849  // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19850  unsigned ToIntOpcode =
19851      SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19852  unsigned ToFPOpcode =
19853      IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19854
19855  // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19856  //
  // We are not defining the high elements (for example, by zeroing them)
  // because that could nullify any performance advantage that we hoped to gain
  // from this vector op hack. We do not expect any adverse effects (like
  // denorm penalties) with cast ops.
19861  SDLoc DL(CastToFP);
19862  SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19863  SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19864  SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19865  SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19866  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19867}
19868
19869static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
19870                                    const X86Subtarget &Subtarget) {
19871  SDLoc DL(Op);
19872  bool IsStrict = Op->isStrictFPOpcode();
19873  MVT VT = Op->getSimpleValueType(0);
19874  SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19875
19876  if (Subtarget.hasDQI()) {
19877    assert(!Subtarget.hasVLX() && "Unexpected features");
19878
19879    assert((Src.getSimpleValueType() == MVT::v2i64 ||
19880            Src.getSimpleValueType() == MVT::v4i64) &&
19881           "Unsupported custom type");
19882
    // With AVX512DQ, but not VLX, we need to widen to get a 512-bit result
    // type.
19884    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19885           "Unexpected VT!");
19886    MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19887
19888    // Need to concat with zero vector for strict fp to avoid spurious
19889    // exceptions.
19890    SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19891                           : DAG.getUNDEF(MVT::v8i64);
19892    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19893                      DAG.getIntPtrConstant(0, DL));
19894    SDValue Res, Chain;
19895    if (IsStrict) {
19896      Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19897                        {Op->getOperand(0), Src});
19898      Chain = Res.getValue(1);
19899    } else {
19900      Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19901    }
19902
19903    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19904                      DAG.getIntPtrConstant(0, DL));
19905
19906    if (IsStrict)
19907      return DAG.getMergeValues({Res, Chain}, DL);
19908    return Res;
19909  }
19910
19911  bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19912                  Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19913  if (VT != MVT::v4f32 || IsSigned)
19914    return SDValue();
19915
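  // Unsigned v4i64 -> v4f32: convert each element with a scalar sint_to_fp.
  // Elements with the sign bit set are first halved with "round to odd"
  // ((Src >> 1) | (Src & 1)) so they fit in a signed i64, converted, and then
  // doubled with an fadd; the final select keeps the direct conversion for
  // elements that were already non-negative.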
19916  SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19917  SDValue One  = DAG.getConstant(1, DL, MVT::v4i64);
19918  SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19919                             DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19920                             DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19921  SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19922  SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19923  SmallVector<SDValue, 4> SignCvts(4);
19924  SmallVector<SDValue, 4> Chains(4);
19925  for (int i = 0; i != 4; ++i) {
19926    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19927                              DAG.getIntPtrConstant(i, DL));
19928    if (IsStrict) {
19929      SignCvts[i] =
19930          DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19931                      {Op.getOperand(0), Elt});
19932      Chains[i] = SignCvts[i].getValue(1);
19933    } else {
19934      SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19935    }
19936  }
19937  SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19938
19939  SDValue Slow, Chain;
19940  if (IsStrict) {
19941    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19942    Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19943                       {Chain, SignCvt, SignCvt});
19944    Chain = Slow.getValue(1);
19945  } else {
19946    Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19947  }
19948
19949  IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19950  SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19951
19952  if (IsStrict)
19953    return DAG.getMergeValues({Cvt, Chain}, DL);
19954
19955  return Cvt;
19956}
19957
19958SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19959                                           SelectionDAG &DAG) const {
19960  bool IsStrict = Op->isStrictFPOpcode();
19961  unsigned OpNo = IsStrict ? 1 : 0;
19962  SDValue Src = Op.getOperand(OpNo);
19963  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19964  MVT SrcVT = Src.getSimpleValueType();
19965  MVT VT = Op.getSimpleValueType();
19966  SDLoc dl(Op);
19967
19968  if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
19969    return Extract;
19970
19971  if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
19972    return R;
19973
19974  if (SrcVT.isVector()) {
19975    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
      // Note: Since v2f64 is a legal type, we don't need to zero extend the
      // source for strict FP.
19978      if (IsStrict)
19979        return DAG.getNode(
19980            X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19981            {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19982                                DAG.getUNDEF(SrcVT))});
19983      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19984                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19985                                     DAG.getUNDEF(SrcVT)));
19986    }
19987    if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19988      return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
19989
19990    return SDValue();
19991  }
19992
19993  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19994         "Unknown SINT_TO_FP to lower!");
19995
19996  bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19997
19998  // These are really Legal; return the operand so the caller accepts it as
19999  // Legal.
20000  if (SrcVT == MVT::i32 && UseSSEReg)
20001    return Op;
20002  if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20003    return Op;
20004
20005  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20006    return V;
20007
20008  // SSE doesn't have an i16 conversion so we need to promote.
20009  if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20010    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20011    if (IsStrict)
20012      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20013                         {Chain, Ext});
20014
20015    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20016  }
20017
20018  if (VT == MVT::f128)
20019    return SDValue();
20020
20021  SDValue ValueToStore = Src;
20022  if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20023    // Bitcasting to f64 here allows us to do a single 64-bit store from
20024    // an SSE register, avoiding the store forwarding penalty that would come
20025    // with two 32-bit stores.
20026    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20027
20028  unsigned Size = SrcVT.getStoreSize();
20029  Align Alignment(Size);
20030  MachineFunction &MF = DAG.getMachineFunction();
20031  auto PtrVT = getPointerTy(MF.getDataLayout());
20032  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20033  MachinePointerInfo MPI =
20034      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20035  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20036  Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20037  std::pair<SDValue, SDValue> Tmp =
20038      BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20039
20040  if (IsStrict)
20041    return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20042
20043  return Tmp.first;
20044}
20045
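/// Build an X86ISD::FILD node that loads an SrcVT integer from the given
/// memory location and converts it to DstVT. If DstVT is held in an SSE
/// register, the conversion is performed in x87 (f80), stored back to a stack
/// slot as DstVT, and reloaded so the value ends up in an SSE register.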
20046std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20047    EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20048    MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20049  // Build the FILD
20050  SDVTList Tys;
20051  bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20052  if (useSSE)
20053    Tys = DAG.getVTList(MVT::f80, MVT::Other);
20054  else
20055    Tys = DAG.getVTList(DstVT, MVT::Other);
20056
20057  SDValue FILDOps[] = {Chain, Pointer};
20058  SDValue Result =
20059      DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20060                              Alignment, MachineMemOperand::MOLoad);
20061  Chain = Result.getValue(1);
20062
20063  if (useSSE) {
20064    MachineFunction &MF = DAG.getMachineFunction();
20065    unsigned SSFISize = DstVT.getStoreSize();
20066    int SSFI =
20067        MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20068    auto PtrVT = getPointerTy(MF.getDataLayout());
20069    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20070    Tys = DAG.getVTList(MVT::Other);
20071    SDValue FSTOps[] = {Chain, Result, StackSlot};
20072    MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20073        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20074        MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20075
20076    Chain =
20077        DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20078    Result = DAG.getLoad(
20079        DstVT, DL, Chain, StackSlot,
20080        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20081    Chain = Result.getValue(1);
20082  }
20083
20084  return { Result, Chain };
20085}
20086
20087/// Horizontal vector math instructions may be slower than normal math with
20088/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20089/// implementation, and likely shuffle complexity of the alternate sequence.
20090static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20091                                  const X86Subtarget &Subtarget) {
20092  bool IsOptimizingSize = DAG.shouldOptForSize();
20093  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20094  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20095}
20096
20097/// 64-bit unsigned integer to double expansion.
20098static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20099                                   const X86Subtarget &Subtarget) {
  // We can't use this algorithm for strict fp. It produces -0.0 instead of
  // +0.0 for an input of 0 when rounding toward negative infinity. In that
  // case the caller falls back to Expand when i64 is legal (64-bit targets)
  // or uses FILD in 32-bit mode.
20103  assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
  // This algorithm is not obvious. Here is what we're trying to output:
20105  /*
20106     movq       %rax,  %xmm0
20107     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20108     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20109     #ifdef __SSE3__
20110       haddpd   %xmm0, %xmm0
20111     #else
20112       pshufd   $0x4e, %xmm0, %xmm1
20113       addpd    %xmm1, %xmm0
20114     #endif
20115  */
20116
20117  SDLoc dl(Op);
20118  LLVMContext *Context = DAG.getContext();
20119
20120  // Build some magic constants.
20121  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20122  Constant *C0 = ConstantDataVector::get(*Context, CV0);
20123  auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20124  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20125
20126  SmallVector<Constant*,2> CV1;
20127  CV1.push_back(
20128    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20129                                      APInt(64, 0x4330000000000000ULL))));
20130  CV1.push_back(
20131    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20132                                      APInt(64, 0x4530000000000000ULL))));
20133  Constant *C1 = ConstantVector::get(CV1);
20134  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20135
20136  // Load the 64-bit value into an XMM register.
20137  SDValue XR1 =
20138      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20139  SDValue CLod0 = DAG.getLoad(
20140      MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20141      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20142  SDValue Unpck1 =
20143      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20144
20145  SDValue CLod1 = DAG.getLoad(
20146      MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20147      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20148  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20149  // TODO: Are there any fast-math-flags to propagate here?
20150  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20151  SDValue Result;
20152
20153  if (Subtarget.hasSSE3() &&
20154      shouldUseHorizontalOp(true, DAG, Subtarget)) {
20155    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20156  } else {
20157    SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20158    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20159  }
20160  Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20161                       DAG.getIntPtrConstant(0, dl));
20162  return Result;
20163}
20164
20165/// 32-bit unsigned integer to float expansion.
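/// The idea, roughly: OR the zero-extended u32 into the low mantissa bits of
/// the f64 constant 0x1.0p52 (bit pattern 0x4330000000000000). The result is
/// exactly 2^52 + x as a double, so subtracting the 2^52 bias recovers x.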
20166static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20167                                   const X86Subtarget &Subtarget) {
20168  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20169  SDLoc dl(Op);
20170  // FP constant to bias correct the final result.
20171  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20172                                   MVT::f64);
20173
20174  // Load the 32-bit value into an XMM register.
20175  SDValue Load =
20176      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20177
20178  // Zero out the upper parts of the register.
20179  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20180
20181  // Or the load with the bias.
20182  SDValue Or = DAG.getNode(
20183      ISD::OR, dl, MVT::v2i64,
20184      DAG.getBitcast(MVT::v2i64, Load),
20185      DAG.getBitcast(MVT::v2i64,
20186                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20187  Or =
20188      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20189                  DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20190
20191  if (Op.getNode()->isStrictFPOpcode()) {
20192    // Subtract the bias.
20193    // TODO: Are there any fast-math-flags to propagate here?
20194    SDValue Chain = Op.getOperand(0);
20195    SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20196                              {Chain, Or, Bias});
20197
20198    if (Op.getValueType() == Sub.getValueType())
20199      return Sub;
20200
20201    // Handle final rounding.
20202    std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20203        Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20204
20205    return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20206  }
20207
20208  // Subtract the bias.
20209  // TODO: Are there any fast-math-flags to propagate here?
20210  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20211
20212  // Handle final rounding.
20213  return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20214}
20215
20216static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20217                                     const X86Subtarget &Subtarget,
20218                                     const SDLoc &DL) {
20219  if (Op.getSimpleValueType() != MVT::v2f64)
20220    return SDValue();
20221
20222  bool IsStrict = Op->isStrictFPOpcode();
20223
20224  SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20225  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20226
20227  if (Subtarget.hasAVX512()) {
20228    if (!Subtarget.hasVLX()) {
20229      // Let generic type legalization widen this.
20230      if (!IsStrict)
20231        return SDValue();
20232      // Otherwise pad the integer input with 0s and widen the operation.
20233      N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20234                       DAG.getConstant(0, DL, MVT::v2i32));
20235      SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20236                                {Op.getOperand(0), N0});
20237      SDValue Chain = Res.getValue(1);
20238      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20239                        DAG.getIntPtrConstant(0, DL));
20240      return DAG.getMergeValues({Res, Chain}, DL);
20241    }
20242
20243    // Legalize to v4i32 type.
20244    N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20245                     DAG.getUNDEF(MVT::v2i32));
20246    if (IsStrict)
20247      return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20248                         {Op.getOperand(0), N0});
20249    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20250  }
20251
20252  // Zero extend to 2i64, OR with the floating point representation of 2^52.
20253  // This gives us the floating point equivalent of 2^52 + the i32 integer
20254  // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20255  // point leaving just our i32 integers in double format.
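  // For example, an input of 7 produces the double with bit pattern
  // 0x4330000000000007, i.e. exactly 2^52 + 7.0; subtracting 2^52 gives 7.0.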
20256  SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20257  SDValue VBias =
20258      DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
20259  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20260                           DAG.getBitcast(MVT::v2i64, VBias));
20261  Or = DAG.getBitcast(MVT::v2f64, Or);
20262
20263  if (IsStrict)
20264    return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20265                       {Op.getOperand(0), Or, VBias});
20266  return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20267}
20268
20269static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
20270                                     const X86Subtarget &Subtarget) {
20271  SDLoc DL(Op);
20272  bool IsStrict = Op->isStrictFPOpcode();
20273  SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20274  MVT VecIntVT = V.getSimpleValueType();
20275  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20276         "Unsupported custom type");
20277
20278  if (Subtarget.hasAVX512()) {
    // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
20280    assert(!Subtarget.hasVLX() && "Unexpected features");
20281    MVT VT = Op->getSimpleValueType(0);
20282
20283    // v8i32->v8f64 is legal with AVX512 so just return it.
20284    if (VT == MVT::v8f64)
20285      return Op;
20286
20287    assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20288           "Unexpected VT!");
20289    MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20290    MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20291    // Need to concat with zero vector for strict fp to avoid spurious
20292    // exceptions.
20293    SDValue Tmp =
20294        IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20295    V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20296                    DAG.getIntPtrConstant(0, DL));
20297    SDValue Res, Chain;
20298    if (IsStrict) {
20299      Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20300                        {Op->getOperand(0), V});
20301      Chain = Res.getValue(1);
20302    } else {
20303      Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20304    }
20305
20306    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20307                      DAG.getIntPtrConstant(0, DL));
20308
20309    if (IsStrict)
20310      return DAG.getMergeValues({Res, Chain}, DL);
20311    return Res;
20312  }
20313
20314  if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20315      Op->getSimpleValueType(0) == MVT::v4f64) {
20316    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20317    Constant *Bias = ConstantFP::get(
20318        *DAG.getContext(),
20319        APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20320    auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20321    SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20322    SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20323    SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20324    SDValue VBias = DAG.getMemIntrinsicNode(
20325        X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20326        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20327        MachineMemOperand::MOLoad);
20328
20329    SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20330                             DAG.getBitcast(MVT::v4i64, VBias));
20331    Or = DAG.getBitcast(MVT::v4f64, Or);
20332
20333    if (IsStrict)
20334      return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20335                         {Op.getOperand(0), Or, VBias});
20336    return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20337  }
20338
20339  // The algorithm is the following:
20340  // #ifdef __SSE4_1__
20341  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20342  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20343  //                                 (uint4) 0x53000000, 0xaa);
20344  // #else
20345  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20346  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
20347  // #endif
20348  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20349  //     return (float4) lo + fhi;
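  //
  // The magic numbers are the usual exponent-bias trick applied to each
  // 16-bit half: 0x4b000000 is 2^23 as a float, so lo reinterpreted as float
  // is exactly 2^23 + (v & 0xffff); 0x53000000 is 2^39, so hi is
  // 2^39 + (v >> 16) * 2^16. The combined bias 0x53000080 is 2^39 + 2^23.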
20350
20351  bool Is128 = VecIntVT == MVT::v4i32;
20352  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something other than the supported type, e.g., to v4f64,
20354  // abort early.
20355  if (VecFloatVT != Op->getSimpleValueType(0))
20356    return SDValue();
20357
  // In the #ifdef/#else code, we have in common:
20359  // - The vector of constants:
20360  // -- 0x4b000000
20361  // -- 0x53000000
20362  // - A shift:
20363  // -- v >> 16
20364
20365  // Create the splat vector for 0x4b000000.
20366  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20367  // Create the splat vector for 0x53000000.
20368  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20369
20370  // Create the right shift.
20371  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20372  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20373
20374  SDValue Low, High;
20375  if (Subtarget.hasSSE41()) {
20376    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20377    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20378    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20379    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20380    // Low will be bitcasted right away, so do not bother bitcasting back to its
20381    // original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast,
                      DAG.getTargetConstant(0xaa, DL, MVT::i8));
20384    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20385    //                                 (uint4) 0x53000000, 0xaa);
20386    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20387    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20388    // High will be bitcasted right away, so do not bother bitcasting back to
20389    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast,
                       DAG.getTargetConstant(0xaa, DL, MVT::i8));
20392  } else {
20393    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20394    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20395    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20396    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20397
20398    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
20399    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20400  }
20401
20402  // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20403  SDValue VecCstFSub = DAG.getConstantFP(
20404      APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20405
20406  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20407  // NOTE: By using fsub of a positive constant instead of fadd of a negative
20408  // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20409  // enabled. See PR24512.
20410  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20411  // TODO: Are there any fast-math-flags to propagate here?
20412  //     (float4) lo;
20413  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20414  //     return (float4) lo + fhi;
20415  if (IsStrict) {
20416    SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20417                                {Op.getOperand(0), HighBitcast, VecCstFSub});
20418    return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20419                       {FHigh.getValue(1), LowBitcast, FHigh});
20420  }
20421
20422  SDValue FHigh =
20423      DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20424  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20425}
20426
20427static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
20428                                   const X86Subtarget &Subtarget) {
20429  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20430  SDValue N0 = Op.getOperand(OpNo);
20431  MVT SrcVT = N0.getSimpleValueType();
20432  SDLoc dl(Op);
20433
20434  switch (SrcVT.SimpleTy) {
20435  default:
20436    llvm_unreachable("Custom UINT_TO_FP is not supported!");
20437  case MVT::v2i32:
20438    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
20439  case MVT::v4i32:
20440  case MVT::v8i32:
20441    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
20442  case MVT::v2i64:
20443  case MVT::v4i64:
20444    return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20445  }
20446}
20447
20448SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20449                                           SelectionDAG &DAG) const {
20450  bool IsStrict = Op->isStrictFPOpcode();
20451  unsigned OpNo = IsStrict ? 1 : 0;
20452  SDValue Src = Op.getOperand(OpNo);
20453  SDLoc dl(Op);
20454  auto PtrVT = getPointerTy(DAG.getDataLayout());
20455  MVT SrcVT = Src.getSimpleValueType();
20456  MVT DstVT = Op->getSimpleValueType(0);
20457  SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20458
20459  if (DstVT == MVT::f128)
20460    return SDValue();
20461
20462  if (DstVT.isVector())
20463    return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
20464
20465  if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20466    return Extract;
20467
20468  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20469      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20470    // Conversions from unsigned i32 to f32/f64 are legal,
20471    // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
20472    return Op;
20473  }
20474
20475  // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20476  if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20477    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20478    if (IsStrict)
20479      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20480                         {Chain, Src});
20481    return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20482  }
20483
20484  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20485    return V;
20486
20487  // The transform for i64->f64 isn't correct for 0 when rounding to negative
20488  // infinity. It produces -0.0, so disable under strictfp.
20489  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
20490    return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
20491  if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
20492    return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
20493  if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20494      (DstVT == MVT::f32 || DstVT == MVT::f64))
20495    return SDValue();
20496
20497  // Make a 64-bit buffer, and use it to build an FILD.
20498  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20499  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20500  Align SlotAlign(8);
20501  MachinePointerInfo MPI =
20502    MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20503  if (SrcVT == MVT::i32) {
20504    SDValue OffsetSlot =
20505        DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
20506    SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20507    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20508                                  OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20509    std::pair<SDValue, SDValue> Tmp =
20510        BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20511    if (IsStrict)
20512      return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20513
20514    return Tmp.first;
20515  }
20516
20517  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20518  SDValue ValueToStore = Src;
20519  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20520    // Bitcasting to f64 here allows us to do a single 64-bit store from
20521    // an SSE register, avoiding the store forwarding penalty that would come
20522    // with two 32-bit stores.
20523    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20524  }
20525  SDValue Store =
20526      DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20527  // For i64 source, we need to add the appropriate power of 2 if the input
20528  // was negative. We must be careful to do the computation in x87 extended
20529  // precision, not in SSE.
20530  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20531  SDValue Ops[] = { Store, StackSlot };
20532  SDValue Fild =
20533      DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20534                              SlotAlign, MachineMemOperand::MOLoad);
  Chain = Fild.getValue(1);

  // Check whether the sign bit is set.
20539  SDValue SignSet = DAG.getSetCC(
20540      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20541      Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20542
20543  // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20544  APInt FF(64, 0x5F80000000000000ULL);
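  // 0x5F800000 is 2^64 as an f32. If the original u64 had its sign bit set,
  // FILD interpreted it as (value - 2^64), so adding this fudge factor
  // restores the intended unsigned value; otherwise we add 0.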
20545  SDValue FudgePtr = DAG.getConstantPool(
20546      ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20547  Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20548
20549  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20550  SDValue Zero = DAG.getIntPtrConstant(0, dl);
20551  SDValue Four = DAG.getIntPtrConstant(4, dl);
20552  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20553  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20554
20555  // Load the value out, extending it from f32 to f80.
20556  SDValue Fudge = DAG.getExtLoad(
20557      ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20558      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20559      CPAlignment);
20560  Chain = Fudge.getValue(1);
20561  // Extend everything to 80 bits to force it to be done on x87.
20562  // TODO: Are there any fast-math-flags to propagate here?
20563  if (IsStrict) {
20564    SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
20565                              {Chain, Fild, Fudge});
20566    // STRICT_FP_ROUND can't handle equal types.
20567    if (DstVT == MVT::f80)
20568      return Add;
20569    return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20570                       {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
20571  }
20572  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
20573  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20574                     DAG.getIntPtrConstant(0, dl));
20575}
20576
20577// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20578// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20579// just return an SDValue().
20580// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20581// to i16, i32 or i64, and we lower it to a legal sequence and return the
20582// result.
20583SDValue
20584X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20585                                   bool IsSigned, SDValue &Chain) const {
20586  bool IsStrict = Op->isStrictFPOpcode();
20587  SDLoc DL(Op);
20588
20589  EVT DstTy = Op.getValueType();
20590  SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20591  EVT TheVT = Value.getValueType();
20592  auto PtrVT = getPointerTy(DAG.getDataLayout());
20593
20594  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20595    // f16 must be promoted before using the lowering in this routine.
20596    // fp128 does not use this lowering.
20597    return SDValue();
20598  }
20599
20600  // If using FIST to compute an unsigned i64, we'll need some fixup
20601  // to handle values above the maximum signed i64.  A FIST is always
20602  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20603  bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20604
20605  // FIXME: This does not generate an invalid exception if the input does not
20606  // fit in i32. PR44019
20607  if (!IsSigned && DstTy != MVT::i64) {
20608    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20609    // The low 32 bits of the fist result will have the correct uint32 result.
20610    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20611    DstTy = MVT::i64;
20612  }
20613
20614  assert(DstTy.getSimpleVT() <= MVT::i64 &&
20615         DstTy.getSimpleVT() >= MVT::i16 &&
20616         "Unknown FP_TO_INT to lower!");
20617
20618  // We lower FP->int64 into FISTP64 followed by a load from a temporary
20619  // stack slot.
20620  MachineFunction &MF = DAG.getMachineFunction();
20621  unsigned MemSize = DstTy.getStoreSize();
20622  int SSFI =
20623      MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20624  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20625
20626  Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20627
20628  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20629
20630  if (UnsignedFixup) {
20631    //
20632    // Conversion to unsigned i64 is implemented with a select,
20633    // depending on whether the source value fits in the range
20634    // of a signed i64.  Let Thresh be the FP equivalent of
20635    // 0x8000000000000000ULL.
20636    //
    //  Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
    //  FltOfs = (Value >= Thresh) ? Thresh : 0;
    //  FistSrc = (Value - FltOfs);
    //  Fist-to-mem64 FistSrc
    //  Add Adjust (0 or 0x800...0ULL) back to the 64-bit result, which is
    //  equivalent to XOR'ing the result with Adjust.
20643    //
20644    // Being a power of 2, Thresh is exactly representable in all FP formats.
20645    // For X87 we'd like to use the smallest FP type for this constant, but
20646    // for DAG type consistency we have to match the FP operand type.
20647
20648    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20649    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
20650    bool LosesInfo = false;
20651    if (TheVT == MVT::f64)
20652      // The rounding mode is irrelevant as the conversion should be exact.
20653      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20654                              &LosesInfo);
20655    else if (TheVT == MVT::f80)
20656      Status = Thresh.convert(APFloat::x87DoubleExtended(),
20657                              APFloat::rmNearestTiesToEven, &LosesInfo);
20658
20659    assert(Status == APFloat::opOK && !LosesInfo &&
20660           "FP conversion should have been exact");
20661
20662    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20663
20664    EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20665                                   *DAG.getContext(), TheVT);
20666    SDValue Cmp;
20667    if (IsStrict) {
20668      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20669                         /*IsSignaling*/ true);
20670      Chain = Cmp.getValue(1);
20671    } else {
20672      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20673    }
20674
20675    // Our preferred lowering of
20676    //
20677    // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20678    //
20679    // is
20680    //
20681    // (Value >= Thresh) << 63
20682    //
20683    // but since we can get here after LegalOperations, DAGCombine might do the
20684    // wrong thing if we create a select. So, directly create the preferred
20685    // version.
20686    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20687    SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20688    Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20689
20690    SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20691                                   DAG.getConstantFP(0.0, DL, TheVT));
20692
20693    if (IsStrict) {
20694      Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20695                          { Chain, Value, FltOfs });
20696      Chain = Value.getValue(1);
20697    } else
20698      Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20699  }
20700
20701  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20702
  // FIXME: This causes a redundant load/store if the SSE-class value is
  // already in memory, such as if it is on the call stack.
20705  if (isScalarFPTypeInSSEReg(TheVT)) {
20706    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20707    Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20708    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20709    SDValue Ops[] = { Chain, StackSlot };
20710
20711    unsigned FLDSize = TheVT.getStoreSize();
20712    assert(FLDSize <= MemSize && "Stack slot not big enough");
20713    MachineMemOperand *MMO = MF.getMachineMemOperand(
20714        MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20715    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20716    Chain = Value.getValue(1);
20717  }
20718
20719  // Build the FP_TO_INT*_IN_MEM
20720  MachineMemOperand *MMO = MF.getMachineMemOperand(
20721      MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20722  SDValue Ops[] = { Chain, Value, StackSlot };
20723  SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20724                                         DAG.getVTList(MVT::Other),
20725                                         Ops, DstTy, MMO);
20726
20727  SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
20728  Chain = Res.getValue(1);
20729
20730  // If we need an unsigned fixup, XOR the result with adjust.
20731  if (UnsignedFixup)
20732    Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20733
20734  return Res;
20735}
20736
20737static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
20738                              const X86Subtarget &Subtarget) {
20739  MVT VT = Op.getSimpleValueType();
20740  SDValue In = Op.getOperand(0);
20741  MVT InVT = In.getSimpleValueType();
20742  SDLoc dl(Op);
20743  unsigned Opc = Op.getOpcode();
20744
20745  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20746  assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20747         "Unexpected extension opcode");
20748  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20749         "Expected same number of elements");
20750  assert((VT.getVectorElementType() == MVT::i16 ||
20751          VT.getVectorElementType() == MVT::i32 ||
20752          VT.getVectorElementType() == MVT::i64) &&
20753         "Unexpected element type");
20754  assert((InVT.getVectorElementType() == MVT::i8 ||
20755          InVT.getVectorElementType() == MVT::i16 ||
20756          InVT.getVectorElementType() == MVT::i32) &&
20757         "Unexpected element type");
20758
20759  unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
20760
20761  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20762    assert(InVT == MVT::v32i8 && "Unexpected VT!");
20763    return splitVectorIntUnary(Op, DAG);
20764  }
20765
20766  if (Subtarget.hasInt256())
20767    return Op;
20768
20769  // Optimize vectors in AVX mode:
20770  //
20771  //   v8i16 -> v8i32
20772  //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
20773  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
20774  //   Concat upper and lower parts.
20775  //
20776  //   v4i32 -> v4i64
20777  //   Use vpmovzdq for 4 lower elements  v4i32 -> v2i64.
20778  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
20779  //   Concat upper and lower parts.
20780  //
20781  MVT HalfVT = VT.getHalfNumVectorElementsVT();
20782  SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20783
20784  // Short-circuit if we can determine that each 128-bit half is the same value.
20785  // Otherwise, this is difficult to match and optimize.
20786  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20787    if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20788      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20789
20790  SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20791  SDValue Undef = DAG.getUNDEF(InVT);
20792  bool NeedZero = Opc == ISD::ZERO_EXTEND;
20793  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20794  OpHi = DAG.getBitcast(HalfVT, OpHi);
20795
20796  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20797}
20798
20799// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20800static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20801                                   const SDLoc &dl, SelectionDAG &DAG) {
20802  assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20803  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20804                           DAG.getIntPtrConstant(0, dl));
20805  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20806                           DAG.getIntPtrConstant(8, dl));
20807  Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20808  Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20809  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20810  return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20811}
20812
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
20816  MVT VT = Op->getSimpleValueType(0);
20817  SDValue In = Op->getOperand(0);
20818  MVT InVT = In.getSimpleValueType();
20819  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20820  SDLoc DL(Op);
20821  unsigned NumElts = VT.getVectorNumElements();
20822
  // For all vectors but vXi8 we can just emit a sign_extend and a shift. This
  // avoids a constant pool load.
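  // For example, zext v8i1 -> v8i32: sign_extend produces 0 or -1 in each
  // lane, and a logical shift right by 31 turns -1 into 1.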
20825  if (VT.getVectorElementType() != MVT::i8) {
20826    SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20827    return DAG.getNode(ISD::SRL, DL, VT, Extend,
20828                       DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20829  }
20830
20831  // Extend VT if BWI is not supported.
20832  MVT ExtVT = VT;
20833  if (!Subtarget.hasBWI()) {
20834    // If v16i32 is to be avoided, we'll need to split and concatenate.
20835    if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20836      return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20837
20838    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20839  }
20840
20841  // Widen to 512-bits if VLX is not supported.
20842  MVT WideVT = ExtVT;
20843  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20844    NumElts *= 512 / ExtVT.getSizeInBits();
20845    InVT = MVT::getVectorVT(MVT::i1, NumElts);
20846    In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20847                     In, DAG.getIntPtrConstant(0, DL));
20848    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20849                              NumElts);
20850  }
20851
20852  SDValue One = DAG.getConstant(1, DL, WideVT);
20853  SDValue Zero = DAG.getConstant(0, DL, WideVT);
20854
20855  SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20856
20857  // Truncate if we had to extend above.
20858  if (VT != ExtVT) {
20859    WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20860    SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20861  }
20862
20863  // Extract back to 128/256-bit if we widened.
20864  if (WideVT != VT)
20865    SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20866                              DAG.getIntPtrConstant(0, DL));
20867
20868  return SelectedVal;
20869}
20870
20871static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20872                                SelectionDAG &DAG) {
20873  SDValue In = Op.getOperand(0);
20874  MVT SVT = In.getSimpleValueType();
20875
20876  if (SVT.getVectorElementType() == MVT::i1)
20877    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20878
20879  assert(Subtarget.hasAVX() && "Expected AVX support");
20880  return LowerAVXExtend(Op, DAG, Subtarget);
20881}
20882
20883/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20884/// It makes use of the fact that vectors with enough leading sign/zero bits
20885/// prevent the PACKSS/PACKUS from saturating the results.
20886/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20887/// within each 128-bit lane.
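/// For example, a v8i32 -> v8i16 truncate where every source element has
/// enough leading sign/zero bits becomes a single PACKSSDW/PACKUSDW of the
/// two 128-bit halves of the source.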
20888static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20889                                      const SDLoc &DL, SelectionDAG &DAG,
20890                                      const X86Subtarget &Subtarget) {
20891  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20892         "Unexpected PACK opcode");
20893  assert(DstVT.isVector() && "VT not a vector?");
20894
20895  // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20896  if (!Subtarget.hasSSE2())
20897    return SDValue();
20898
20899  EVT SrcVT = In.getValueType();
20900
20901  // No truncation required, we might get here due to recursive calls.
20902  if (SrcVT == DstVT)
20903    return In;
20904
  // We only support vector truncation to 64 bits or greater from a source of
  // 128 bits or greater.
20907  unsigned DstSizeInBits = DstVT.getSizeInBits();
20908  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20909  if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
20910    return SDValue();
20911
20912  unsigned NumElems = SrcVT.getVectorNumElements();
20913  if (!isPowerOf2_32(NumElems))
20914    return SDValue();
20915
20916  LLVMContext &Ctx = *DAG.getContext();
20917  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20918  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20919
20920  EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20921
20922  // Pack to the largest type possible:
20923  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20924  EVT InVT = MVT::i16, OutVT = MVT::i8;
20925  if (SrcVT.getScalarSizeInBits() > 16 &&
20926      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20927    InVT = MVT::i32;
20928    OutVT = MVT::i16;
20929  }
20930
20931  // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
20932  if (SrcVT.is128BitVector()) {
20933    InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20934    OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20935    In = DAG.getBitcast(InVT, In);
20936    SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
20937    Res = extractSubVector(Res, 0, DAG, DL, 64);
20938    return DAG.getBitcast(DstVT, Res);
20939  }
20940
20941  // Split lower/upper subvectors.
20942  SDValue Lo, Hi;
20943  std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20944
20945  unsigned SubSizeInBits = SrcSizeInBits / 2;
20946  InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20947  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20948
20949  // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20950  if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20951    Lo = DAG.getBitcast(InVT, Lo);
20952    Hi = DAG.getBitcast(InVT, Hi);
20953    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20954    return DAG.getBitcast(DstVT, Res);
20955  }
20956
20957  // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20958  // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20959  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20960    Lo = DAG.getBitcast(InVT, Lo);
20961    Hi = DAG.getBitcast(InVT, Hi);
20962    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20963
20964    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20965    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20966    // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20967    SmallVector<int, 64> Mask;
20968    int Scale = 64 / OutVT.getScalarSizeInBits();
20969    narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20970    Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20971
20972    if (DstVT.is256BitVector())
20973      return DAG.getBitcast(DstVT, Res);
20974
20975    // If 512bit -> 128bit truncate another stage.
20976    EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20977    Res = DAG.getBitcast(PackedVT, Res);
20978    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20979  }
20980
20981  // Recursively pack lower/upper subvectors, concat result and pack again.
20982  assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20983  EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20984  Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
20985  Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
20986
20987  PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20988  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20989  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20990}
20991
20992static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
20996  MVT VT = Op.getSimpleValueType();
20997  SDValue In = Op.getOperand(0);
20998  MVT InVT = In.getSimpleValueType();
20999
21000  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21001
21002  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21003  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21004  if (InVT.getScalarSizeInBits() <= 16) {
21005    if (Subtarget.hasBWI()) {
21006      // legal, will go to VPMOVB2M, VPMOVW2M
21007      if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21008        // We need to shift to get the lsb into sign position.
        // Shifting packed bytes isn't supported natively, so bitcast to words.
21010        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21011        In = DAG.getNode(ISD::SHL, DL, ExtVT,
21012                         DAG.getBitcast(ExtVT, In),
21013                         DAG.getConstant(ShiftInx, DL, ExtVT));
21014        In = DAG.getBitcast(InVT, In);
21015      }
21016      return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21017                          In, ISD::SETGT);
21018    }
21019    // Use TESTD/Q, extended vector to packed dword/qword.
21020    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21021           "Unexpected vector type.");
21022    unsigned NumElts = InVT.getVectorNumElements();
21023    assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21024    // We need to change to a wider element type that we have support for.
21025    // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21026    // For 16 element vectors we extend to v16i32 unless we are explicitly
21027    // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21028    // we need to split into two 8 element vectors which we can extend to v8i32,
21029    // truncate and concat the results. There's an additional complication if
21030    // the original type is v16i8. In that case we can't split the v16i8
21031    // directly, so we need to shuffle high elements to low and use
21032    // sign_extend_vector_inreg.
21033    if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21034      SDValue Lo, Hi;
21035      if (InVT == MVT::v16i8) {
21036        Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21037        Hi = DAG.getVectorShuffle(
21038            InVT, DL, In, In,
21039            {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21040        Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21041      } else {
21042        assert(InVT == MVT::v16i16 && "Unexpected VT!");
21043        Lo = extract128BitVector(In, 0, DAG, DL);
21044        Hi = extract128BitVector(In, 8, DAG, DL);
21045      }
21046      // We're split now, just emit two truncates and a concat. The two
21047      // truncates will trigger legalization to come back to this function.
21048      Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21049      Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21050      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21051    }
21052    // We either have 8 elements or we're allowed to use 512-bit vectors.
21053    // If we have VLX, we want to use the narrowest vector that can get the
21054    // job done so we use vXi32.
21055    MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21056    MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21057    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21058    InVT = ExtVT;
21059    ShiftInx = InVT.getScalarSizeInBits() - 1;
21060  }
21061
21062  if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21063    // We need to shift to get the lsb into sign position.
21064    In = DAG.getNode(ISD::SHL, DL, InVT, In,
21065                     DAG.getConstant(ShiftInx, DL, InVT));
21066  }
21067  // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21068  if (Subtarget.hasDQI())
21069    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21070  return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21071}
21072
21073SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21074  SDLoc DL(Op);
21075  MVT VT = Op.getSimpleValueType();
21076  SDValue In = Op.getOperand(0);
21077  MVT InVT = In.getSimpleValueType();
21078  unsigned InNumEltBits = InVT.getScalarSizeInBits();
21079
21080  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21081         "Invalid TRUNCATE operation");
21082
21083  // If we're called by the type legalizer, handle a few cases.
21084  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21085  if (!TLI.isTypeLegal(InVT)) {
21086    if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21087        VT.is128BitVector()) {
21088      assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21089             "Unexpected subtarget!");
21090      // The default behavior is to truncate one step, concatenate, and then
21091      // truncate the remainder. We'd rather produce two 64-bit results and
21092      // concatenate those.
21093      SDValue Lo, Hi;
21094      std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21095
21096      EVT LoVT, HiVT;
21097      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21098
21099      Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21100      Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21101      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21102    }
21103
21104    // Otherwise let default legalization handle it.
21105    return SDValue();
21106  }
21107
21108  if (VT.getVectorElementType() == MVT::i1)
21109    return LowerTruncateVecI1(Op, DAG, Subtarget);
21110
21111  // vpmovqb/w/d, vpmovdb/w, vpmovwb
21112  if (Subtarget.hasAVX512()) {
21113    if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21114      assert(VT == MVT::v32i8 && "Unexpected VT!");
21115      return splitVectorIntUnary(Op, DAG);
21116    }
21117
    // word to byte only under BWI. Otherwise we have to promote to v16i32
21119    // and then truncate that. But we should only do that if we haven't been
21120    // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21121    // handled by isel patterns.
21122    if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21123        Subtarget.canExtendTo512DQ())
21124      return Op;
21125  }
21126
21127  unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21128  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
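  // For example, for v8i32 -> v8i16 the PACK*DW instructions saturate each
  // i32 lane to 16 bits; if the discarded upper bits are known sign copies
  // (PACKSS) or known zero (PACKUS), saturation never fires and the pack is
  // an exact truncation.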
21129
21130  // Truncate with PACKUS if we are truncating a vector with leading zero bits
21131  // that extend all the way to the packed/truncated value.
21132  // Pre-SSE41 we can only use PACKUSWB.
21133  KnownBits Known = DAG.computeKnownBits(In);
21134  if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21135    if (SDValue V =
21136            truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21137      return V;
21138
21139  // Truncate with PACKSS if we are truncating a vector with sign-bits that
21140  // extend all the way to the packed/truncated value.
21141  if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21142    if (SDValue V =
21143            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21144      return V;
21145
21146  // Handle truncation of V256 to V128 using shuffles.
21147  assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21148
21149  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21150    In = DAG.getBitcast(MVT::v8i32, In);
21151
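    // After the bitcast each i64 element occupies two i32 lanes; keeping the
    // even lanes {0,2,4,6} selects the low 32 bits of every element.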
21152    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21153    if (Subtarget.hasInt256()) {
21154      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21155      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21156      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21157                         DAG.getIntPtrConstant(0, DL));
21158    }
21159
21160    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21161                               DAG.getIntPtrConstant(0, DL));
21162    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21163                               DAG.getIntPtrConstant(4, DL));
21164    static const int ShufMask[] = {0, 2, 4, 6};
21165    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21166  }
21167
21168  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21169    In = DAG.getBitcast(MVT::v32i8, In);
21170
21171    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21172    if (Subtarget.hasInt256()) {
      // The PSHUFB mask: keep the low 2 bytes of each i32, per 128-bit lane.
21174      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
21175                                      -1, -1, -1, -1, -1, -1, -1, -1,
21176                                      16, 17, 20, 21, 24, 25, 28, 29,
21177                                      -1, -1, -1, -1, -1, -1, -1, -1 };
21178      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21179      In = DAG.getBitcast(MVT::v4i64, In);
21180
21181      static const int ShufMask2[] = {0, 2, -1, -1};
21182      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21183      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21184                         DAG.getBitcast(MVT::v16i16, In),
21185                         DAG.getIntPtrConstant(0, DL));
21186    }
21187
21188    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21189                               DAG.getIntPtrConstant(0, DL));
21190    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21191                               DAG.getIntPtrConstant(16, DL));
21192
    // The PSHUFB mask: keep the low 2 bytes of each i32 element.
21194    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
21195                                   -1, -1, -1, -1, -1, -1, -1, -1};
21196
21197    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21198    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21199
21200    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21201    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21202
21203    // The MOVLHPS Mask:
21204    static const int ShufMask2[] = {0, 1, 4, 5};
21205    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21206    return DAG.getBitcast(MVT::v8i16, res);
21207  }
21208
21209  if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
    // Use an AND to zero the upper bits for PACKUS.
21211    In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21212
21213    SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21214                               DAG.getIntPtrConstant(0, DL));
21215    SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21216                               DAG.getIntPtrConstant(8, DL));
21217    return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21218  }
21219
21220  llvm_unreachable("All 256->128 cases should have been handled above!");
21221}
21222
21223SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21224  bool IsStrict = Op->isStrictFPOpcode();
21225  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21226                  Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21227  MVT VT = Op->getSimpleValueType(0);
21228  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21229  MVT SrcVT = Src.getSimpleValueType();
21230  SDLoc dl(Op);
21231
21232  if (VT.isVector()) {
21233    if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21234      MVT ResVT = MVT::v4i32;
21235      MVT TruncVT = MVT::v4i1;
21236      unsigned Opc;
21237      if (IsStrict)
21238        Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21239      else
21240        Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21241
21242      if (!IsSigned && !Subtarget.hasVLX()) {
21243        assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21244        // Widen to 512-bits.
21245        ResVT = MVT::v8i32;
21246        TruncVT = MVT::v8i1;
21247        Opc = Op.getOpcode();
21248        // Need to concat with zero vector for strict fp to avoid spurious
21249        // exceptions.
21250        // TODO: Should we just do this for non-strict as well?
21251        SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21252                               : DAG.getUNDEF(MVT::v8f64);
21253        Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21254                          DAG.getIntPtrConstant(0, dl));
21255      }
21256      SDValue Res, Chain;
21257      if (IsStrict) {
21258        Res =
21259            DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
21260        Chain = Res.getValue(1);
21261      } else {
21262        Res = DAG.getNode(Opc, dl, ResVT, Src);
21263      }
21264
21265      Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21266      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21267                        DAG.getIntPtrConstant(0, dl));
21268      if (IsStrict)
21269        return DAG.getMergeValues({Res, Chain}, dl);
21270      return Res;
21271    }
21272
21273    // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21274    if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21275      assert(!IsSigned && "Expected unsigned conversion!");
21276      assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21277      return Op;
21278    }
21279
21280    // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21281    if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21282        (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
21283      assert(!IsSigned && "Expected unsigned conversion!");
21284      assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
21285             "Unexpected features!");
21286      MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21287      MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21288      // Need to concat with zero vector for strict fp to avoid spurious
21289      // exceptions.
21290      // TODO: Should we just do this for non-strict as well?
21291      SDValue Tmp =
21292          IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21293      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21294                        DAG.getIntPtrConstant(0, dl));
21295
21296      SDValue Res, Chain;
21297      if (IsStrict) {
21298        Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21299                          {Op->getOperand(0), Src});
21300        Chain = Res.getValue(1);
21301      } else {
21302        Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21303      }
21304
21305      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21306                        DAG.getIntPtrConstant(0, dl));
21307
21308      if (IsStrict)
21309        return DAG.getMergeValues({Res, Chain}, dl);
21310      return Res;
21311    }
21312
21313    // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21314    if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21315        (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
21316      assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
21317             !Subtarget.hasVLX() && "Unexpected features!");
21318      MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21319      // Need to concat with zero vector for strict fp to avoid spurious
21320      // exceptions.
21321      // TODO: Should we just do this for non-strict as well?
21322      SDValue Tmp =
21323          IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21324      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21325                        DAG.getIntPtrConstant(0, dl));
21326
21327      SDValue Res, Chain;
21328      if (IsStrict) {
21329        Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21330                          {Op->getOperand(0), Src});
21331        Chain = Res.getValue(1);
21332      } else {
21333        Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21334      }
21335
21336      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21337                        DAG.getIntPtrConstant(0, dl));
21338
21339      if (IsStrict)
21340        return DAG.getMergeValues({Res, Chain}, dl);
21341      return Res;
21342    }
21343
    if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21345      if (!Subtarget.hasVLX()) {
        // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the
        // type legalizer and then widened again by vector op legalization.
21348        if (!IsStrict)
21349          return SDValue();
21350
21351        SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21352        SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21353                                  {Src, Zero, Zero, Zero});
21354        Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21355                          {Op->getOperand(0), Tmp});
21356        SDValue Chain = Tmp.getValue(1);
21357        Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21358                          DAG.getIntPtrConstant(0, dl));
21359        if (IsStrict)
21360          return DAG.getMergeValues({Tmp, Chain}, dl);
21361        return Tmp;
21362      }
21363
21364      assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21365      SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21366                                DAG.getUNDEF(MVT::v2f32));
21367      if (IsStrict) {
21368        unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21369                                : X86ISD::STRICT_CVTTP2UI;
21370        return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21371      }
21372      unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21373      return DAG.getNode(Opc, dl, VT, Tmp);
21374    }
21375
21376    return SDValue();
21377  }
21378
21379  assert(!VT.isVector());
21380
21381  bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21382
21383  if (!IsSigned && UseSSEReg) {
21384    // Conversions from f32/f64 with AVX512 should be legal.
21385    if (Subtarget.hasAVX512())
21386      return Op;
21387
21388    // Use default expansion for i64.
21389    if (VT == MVT::i64)
21390      return SDValue();
21391
21392    assert(VT == MVT::i32 && "Unexpected VT!");
21393
21394    // Promote i32 to i64 and use a signed operation on 64-bit targets.
21395    // FIXME: This does not generate an invalid exception if the input does not
21396    // fit in i32. PR44019
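    // Every u32 value is non-negative as an i64, so a signed 64-bit
    // conversion followed by truncation gives the correct unsigned result.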
21397    if (Subtarget.is64Bit()) {
21398      SDValue Res, Chain;
21399      if (IsStrict) {
21400        Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
21401                          { Op.getOperand(0), Src });
21402        Chain = Res.getValue(1);
21403      } else
21404        Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21405
21406      Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21407      if (IsStrict)
21408        return DAG.getMergeValues({ Res, Chain }, dl);
21409      return Res;
21410    }
21411
21412    // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21413    // use fisttp which will be handled later.
21414    if (!Subtarget.hasSSE3())
21415      return SDValue();
21416  }
21417
  // Promote i16 to i32 if we can use an SSE operation or the type is f128.
21419  // FIXME: This does not generate an invalid exception if the input does not
21420  // fit in i16. PR44019
21421  if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21422    assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21423    SDValue Res, Chain;
21424    if (IsStrict) {
21425      Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
21426                        { Op.getOperand(0), Src });
21427      Chain = Res.getValue(1);
21428    } else
21429      Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21430
21431    Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21432    if (IsStrict)
21433      return DAG.getMergeValues({ Res, Chain }, dl);
21434    return Res;
21435  }
21436
  // If this is an FP_TO_SINT using an SSE register, we're done.
21438  if (UseSSEReg && IsSigned)
21439    return Op;
21440
21441  // fp128 needs to use a libcall.
21442  if (SrcVT == MVT::f128) {
21443    RTLIB::Libcall LC;
21444    if (IsSigned)
21445      LC = RTLIB::getFPTOSINT(SrcVT, VT);
21446    else
21447      LC = RTLIB::getFPTOUINT(SrcVT, VT);
21448
21449    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21450    MakeLibCallOptions CallOptions;
21451    std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
21452                                                  SDLoc(Op), Chain);
21453
21454    if (IsStrict)
21455      return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21456
21457    return Tmp.first;
21458  }
21459
21460  // Fall back to X87.
21461  SDValue Chain;
21462  if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21463    if (IsStrict)
21464      return DAG.getMergeValues({V, Chain}, dl);
21465    return V;
21466  }
21467
21468  llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21469}
21470
21471SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21472                                             SelectionDAG &DAG) const {
21473  SDValue Src = Op.getOperand(0);
21474  MVT SrcVT = Src.getSimpleValueType();
21475
21476  // If the source is in an SSE register, the node is Legal.
21477  if (isScalarFPTypeInSSEReg(SrcVT))
21478    return Op;
21479
21480  return LRINT_LLRINTHelper(Op.getNode(), DAG);
21481}
21482
21483SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21484                                              SelectionDAG &DAG) const {
21485  EVT DstVT = N->getValueType(0);
21486  SDValue Src = N->getOperand(0);
21487  EVT SrcVT = Src.getValueType();
21488
21489  if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21490    // f16 must be promoted before using the lowering in this routine.
21491    // fp128 does not use this lowering.
21492    return SDValue();
21493  }
21494
21495  SDLoc DL(N);
21496  SDValue Chain = DAG.getEntryNode();
21497
21498  bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21499
21500  // If we're converting from SSE, the stack slot needs to hold both types.
21501  // Otherwise it only needs to hold the DstVT.
21502  EVT OtherVT = UseSSE ? SrcVT : DstVT;
21503  SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21504  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21505  MachinePointerInfo MPI =
21506      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21507
21508  if (UseSSE) {
21509    assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21510    Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21511    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21512    SDValue Ops[] = { Chain, StackPtr };
21513
21514    Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21515                                  /*Align*/ None, MachineMemOperand::MOLoad);
21516    Chain = Src.getValue(1);
21517  }
21518
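  // X86ISD::FIST selects to an x87 FIST/FISTP store, which converts using the
  // rounding mode in the FPU control word, matching the "round using the
  // current rounding mode" semantics of lrint/llrint.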
21519  SDValue StoreOps[] = { Chain, Src, StackPtr };
21520  Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21521                                  StoreOps, DstVT, MPI, /*Align*/ None,
21522                                  MachineMemOperand::MOStore);
21523
21524  return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21525}
21526
21527SDValue
21528X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21529  // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21530  // but making use of X86 specifics to produce better instruction sequences.
21531  SDNode *Node = Op.getNode();
21532  bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21533  unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21534  SDLoc dl(SDValue(Node, 0));
21535  SDValue Src = Node->getOperand(0);
21536
21537  // There are three types involved here: SrcVT is the source floating point
21538  // type, DstVT is the type of the result, and TmpVT is the result of the
21539  // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21540  // DstVT).
21541  EVT SrcVT = Src.getValueType();
21542  EVT DstVT = Node->getValueType(0);
21543  EVT TmpVT = DstVT;
21544
21545  // This code is only for floats and doubles. Fall back to generic code for
21546  // anything else.
21547  if (!isScalarFPTypeInSSEReg(SrcVT))
21548    return SDValue();
21549
21550  EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21551  unsigned SatWidth = SatVT.getScalarSizeInBits();
21552  unsigned DstWidth = DstVT.getScalarSizeInBits();
21553  unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21554  assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21555         "Expected saturation width smaller than result width");
21556
21557  // Promote result of FP_TO_*INT to at least 32 bits.
21558  if (TmpWidth < 32) {
21559    TmpVT = MVT::i32;
21560    TmpWidth = 32;
21561  }
21562
21563  // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21564  // us to use a native signed conversion instead.
21565  if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21566    TmpVT = MVT::i64;
21567    TmpWidth = 64;
21568  }
21569
21570  // If the saturation width is smaller than the size of the temporary result,
21571  // we can always use signed conversion, which is native.
21572  if (SatWidth < TmpWidth)
21573    FpToIntOpcode = ISD::FP_TO_SINT;
21574
21575  // Determine minimum and maximum integer values and their corresponding
21576  // floating-point values.
21577  APInt MinInt, MaxInt;
21578  if (IsSigned) {
21579    MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
21580    MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
21581  } else {
21582    MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
21583    MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
21584  }
21585
21586  APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21587  APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21588
21589  APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21590    MinInt, IsSigned, APFloat::rmTowardZero);
21591  APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21592    MaxInt, IsSigned, APFloat::rmTowardZero);
21593  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21594                          && !(MaxStatus & APFloat::opStatus::opInexact);
21595
21596  SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21597  SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21598
21599  // If the integer bounds are exactly representable as floats, emit a
21600  // min+max+fptoi sequence. Otherwise use comparisons and selects.
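  // (For example, with an f32 source and 32-bit saturation the maximum value,
  // 2^31-1 or 2^32-1, is not exactly representable in f32, so the
  // compare/select path is used; with an f64 source both 32-bit bounds are
  // exact.)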
21601  if (AreExactFloatBounds) {
21602    if (DstVT != TmpVT) {
21603      // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21604      SDValue MinClamped = DAG.getNode(
21605        X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21606      // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21607      SDValue BothClamped = DAG.getNode(
21608        X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21609      // Convert clamped value to integer.
21610      SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21611
21612      // NaN will become INDVAL, with the top bit set and the rest zero.
21613      // Truncation will discard the top bit, resulting in zero.
21614      return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21615    }
21616
21617    // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21618    SDValue MinClamped = DAG.getNode(
21619      X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21620    // Clamp by MaxFloat from above. NaN cannot occur.
21621    SDValue BothClamped = DAG.getNode(
21622      X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21623    // Convert clamped value to integer.
21624    SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21625
21626    if (!IsSigned) {
21627      // In the unsigned case we're done, because we mapped NaN to MinFloat,
21628      // which is zero.
21629      return FpToInt;
21630    }
21631
21632    // Otherwise, select zero if Src is NaN.
21633    SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21634    return DAG.getSelectCC(
21635      dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21636  }
21637
21638  SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21639  SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21640
21641  // Result of direct conversion, which may be selected away.
21642  SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21643
21644  if (DstVT != TmpVT) {
21645    // NaN will become INDVAL, with the top bit set and the rest zero.
21646    // Truncation will discard the top bit, resulting in zero.
21647    FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21648  }
21649
21650  SDValue Select = FpToInt;
21651  // For signed conversions where we saturate to the same size as the
21652  // result type of the fptoi instructions, INDVAL coincides with integer
21653  // minimum, so we don't need to explicitly check it.
21654  if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21655    // If Src ULT MinFloat, select MinInt. In particular, this also selects
21656    // MinInt if Src is NaN.
21657    Select = DAG.getSelectCC(
21658      dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21659  }
21660
21661  // If Src OGT MaxFloat, select MaxInt.
21662  Select = DAG.getSelectCC(
21663    dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21664
21665  // In the unsigned case we are done, because we mapped NaN to MinInt, which
21666  // is already zero. The promoted case was already handled above.
21667  if (!IsSigned || DstVT != TmpVT) {
21668    return Select;
21669  }
21670
21671  // Otherwise, select 0 if Src is NaN.
21672  SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21673  return DAG.getSelectCC(
21674    dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21675}
21676
21677SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21678  bool IsStrict = Op->isStrictFPOpcode();
21679
21680  SDLoc DL(Op);
21681  MVT VT = Op.getSimpleValueType();
21682  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21683  MVT SVT = In.getSimpleValueType();
21684
21685  if (VT == MVT::f128)
21686    return SDValue();
21687
21688  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21689
21690  SDValue Res =
21691      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21692  if (IsStrict)
21693    return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21694                       {Op->getOperand(0), Res});
21695  return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21696}
21697
21698SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21699  bool IsStrict = Op->isStrictFPOpcode();
21700  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
  // It's legal except when f128 is involved.
21702  if (In.getSimpleValueType() != MVT::f128)
21703    return Op;
21704
21705  return SDValue();
21706}
21707
21708static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21709  bool IsStrict = Op->isStrictFPOpcode();
21710  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21711  assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21712         "Unexpected VT!");
21713
21714  SDLoc dl(Op);
21715  SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21716                            DAG.getConstant(0, dl, MVT::v8i16), Src,
21717                            DAG.getIntPtrConstant(0, dl));
21718
21719  SDValue Chain;
21720  if (IsStrict) {
21721    Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21722                      {Op.getOperand(0), Res});
21723    Chain = Res.getValue(1);
21724  } else {
21725    Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21726  }
21727
21728  Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21729                    DAG.getIntPtrConstant(0, dl));
21730
21731  if (IsStrict)
21732    return DAG.getMergeValues({Res, Chain}, dl);
21733
21734  return Res;
21735}
21736
21737static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21738  bool IsStrict = Op->isStrictFPOpcode();
21739  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21740  assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21741         "Unexpected VT!");
21742
21743  SDLoc dl(Op);
21744  SDValue Res, Chain;
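  // The CVTPS2PH immediate of 4 (bit 2 set) selects rounding under MXCSR
  // control rather than a fixed rounding mode.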
21745  if (IsStrict) {
21746    Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21747                      DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21748                      DAG.getIntPtrConstant(0, dl));
21749    Res = DAG.getNode(
21750        X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21751        {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21752    Chain = Res.getValue(1);
21753  } else {
21754    // FIXME: Should we use zeros for upper elements for non-strict?
21755    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21756    Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21757                      DAG.getTargetConstant(4, dl, MVT::i32));
21758  }
21759
21760  Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21761                    DAG.getIntPtrConstant(0, dl));
21762
21763  if (IsStrict)
21764    return DAG.getMergeValues({Res, Chain}, dl);
21765
21766  return Res;
21767}
21768
21769/// Depending on uarch and/or optimizing for size, we might prefer to use a
21770/// vector operation in place of the typical scalar operation.
21771static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
21772                                         const X86Subtarget &Subtarget) {
21773  // If both operands have other uses, this is probably not profitable.
21774  SDValue LHS = Op.getOperand(0);
21775  SDValue RHS = Op.getOperand(1);
21776  if (!LHS.hasOneUse() && !RHS.hasOneUse())
21777    return Op;
21778
21779  // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21780  bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21781  if (IsFP && !Subtarget.hasSSE3())
21782    return Op;
21783  if (!IsFP && !Subtarget.hasSSSE3())
21784    return Op;
21785
21786  // Extract from a common vector.
21787  if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21788      RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21789      LHS.getOperand(0) != RHS.getOperand(0) ||
21790      !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21791      !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21792      !shouldUseHorizontalOp(true, DAG, Subtarget))
21793    return Op;
21794
21795  // Allow commuted 'hadd' ops.
21796  // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21797  unsigned HOpcode;
21798  switch (Op.getOpcode()) {
21799    case ISD::ADD: HOpcode = X86ISD::HADD; break;
21800    case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21801    case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21802    case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21803    default:
21804      llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21805  }
21806  unsigned LExtIndex = LHS.getConstantOperandVal(1);
21807  unsigned RExtIndex = RHS.getConstantOperandVal(1);
21808  if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21809      (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21810    std::swap(LExtIndex, RExtIndex);
21811
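  // (F)HADD/(F)HSUB combine adjacent element pairs (2*i, 2*i+1) within each
  // 128-bit lane, so this only maps onto a horizontal op when the two extract
  // indices form such a pair.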
21812  if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21813    return Op;
21814
21815  SDValue X = LHS.getOperand(0);
21816  EVT VecVT = X.getValueType();
21817  unsigned BitWidth = VecVT.getSizeInBits();
21818  unsigned NumLanes = BitWidth / 128;
21819  unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21820  assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21821         "Not expecting illegal vector widths here");
21822
21823  // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21824  // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21825  SDLoc DL(Op);
21826  if (BitWidth == 256 || BitWidth == 512) {
21827    unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21828    X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21829    LExtIndex %= NumEltsPerLane;
21830  }
21831
21832  // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21833  // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21834  // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21835  // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21836  SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21837  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21838                     DAG.getIntPtrConstant(LExtIndex / 2, DL));
21839}
21840
21841/// Depending on uarch and/or optimizing for size, we might prefer to use a
21842/// vector operation in place of the typical scalar operation.
21843SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21844  assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21845         "Only expecting float/double");
21846  return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
21847}
21848
21849/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21850/// This mode isn't supported in hardware on X86. But as long as we aren't
21851/// compiling with trapping math, we can emulate this with
/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
21853static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21854  SDValue N0 = Op.getOperand(0);
21855  SDLoc dl(Op);
21856  MVT VT = Op.getSimpleValueType();
21857
21858  // N0 += copysign(nextafter(0.5, 0.0), N0)
21859  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21860  bool Ignored;
21861  APFloat Point5Pred = APFloat(0.5f);
21862  Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
21863  Point5Pred.next(/*nextDown*/true);
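  // Using the largest value strictly below 0.5, rather than 0.5 itself,
  // keeps inputs just under 0.5 from being rounded up to 1.0 by the FADD
  // below.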
21864
21865  SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
21866                              DAG.getConstantFP(Point5Pred, dl, VT), N0);
21867  N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
21868
21869  // Truncate the result to remove fraction.
21870  return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
21871}
21872
21873/// The only differences between FABS and FNEG are the mask and the logic op.
21874/// FNEG also has a folding opportunity for FNEG(FABS(x)).
21875static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
21876  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
21877         "Wrong opcode for lowering FABS or FNEG.");
21878
21879  bool IsFABS = (Op.getOpcode() == ISD::FABS);
21880
21881  // If this is a FABS and it has an FNEG user, bail out to fold the combination
21882  // into an FNABS. We'll lower the FABS after that if it is still in use.
21883  if (IsFABS)
21884    for (SDNode *User : Op->uses())
21885      if (User->getOpcode() == ISD::FNEG)
21886        return Op;
21887
21888  SDLoc dl(Op);
21889  MVT VT = Op.getSimpleValueType();
21890
21891  bool IsF128 = (VT == MVT::f128);
21892  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
21893          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
21894          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
21895         "Unexpected type in LowerFABSorFNEG");
21896
21897  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
21898  // decide if we should generate a 16-byte constant mask when we only need 4 or
21899  // 8 bytes for the scalar case.
21900
21901  // There are no scalar bitwise logical SSE/AVX instructions, so we
21902  // generate a 16-byte vector constant and logic op even for the scalar case.
  // Using a 16-byte mask allows folding the load of the mask with the logic
  // op, which can save ~4 bytes of code size.
21905  bool IsFakeVector = !VT.isVector() && !IsF128;
21906  MVT LogicVT = VT;
21907  if (IsFakeVector)
21908    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
21909
21910  unsigned EltBits = VT.getScalarSizeInBits();
21911  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
21912  APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
21913                           APInt::getSignMask(EltBits);
21914  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21915  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
21916
21917  SDValue Op0 = Op.getOperand(0);
21918  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
21919  unsigned LogicOp = IsFABS  ? X86ISD::FAND :
21920                     IsFNABS ? X86ISD::FOR  :
21921                               X86ISD::FXOR;
21922  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
21923
21924  if (VT.isVector() || IsF128)
21925    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21926
21927  // For the scalar case extend to a 128-bit vector, perform the logic op,
21928  // and extract the scalar result back out.
21929  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
21930  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21931  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
21932                     DAG.getIntPtrConstant(0, dl));
21933}
21934
21935static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
21936  SDValue Mag = Op.getOperand(0);
21937  SDValue Sign = Op.getOperand(1);
21938  SDLoc dl(Op);
21939
21940  // If the sign operand is smaller, extend it first.
21941  MVT VT = Op.getSimpleValueType();
21942  if (Sign.getSimpleValueType().bitsLT(VT))
21943    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
21944
21945  // And if it is bigger, shrink it first.
21946  if (Sign.getSimpleValueType().bitsGT(VT))
21947    Sign =
21948        DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
21949
21950  // At this point the operands and the result should have the same
21951  // type, and that won't be f80 since that is not custom lowered.
21952  bool IsF128 = (VT == MVT::f128);
21953  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
21954          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
21955          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
21956         "Unexpected type in LowerFCOPYSIGN");
21957
21958  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21959
21960  // Perform all scalar logic operations as 16-byte vectors because there are no
21961  // scalar FP logic instructions in SSE.
21962  // TODO: This isn't necessary. If we used scalar types, we might avoid some
21963  // unnecessary splats, but we might miss load folding opportunities. Should
21964  // this decision be based on OptimizeForSize?
21965  bool IsFakeVector = !VT.isVector() && !IsF128;
21966  MVT LogicVT = VT;
21967  if (IsFakeVector)
21968    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
21969
21970  // The mask constants are automatically splatted for vector types.
21971  unsigned EltSizeInBits = VT.getScalarSizeInBits();
21972  SDValue SignMask = DAG.getConstantFP(
21973      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
21974  SDValue MagMask = DAG.getConstantFP(
21975      APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
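  // copysign(Mag, Sign) == (Mag & MagMask) | (Sign & SignMask).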
21976
21977  // First, clear all bits but the sign bit from the second operand (sign).
21978  if (IsFakeVector)
21979    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
21980  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
21981
21982  // Next, clear the sign bit from the first operand (magnitude).
21983  // TODO: If we had general constant folding for FP logic ops, this check
21984  // wouldn't be necessary.
21985  SDValue MagBits;
21986  if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
21987    APFloat APF = Op0CN->getValueAPF();
21988    APF.clearSign();
21989    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
21990  } else {
21991    // If the magnitude operand wasn't a constant, we need to AND out the sign.
21992    if (IsFakeVector)
21993      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
21994    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
21995  }
21996
21997  // OR the magnitude value with the sign bit.
21998  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
21999  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22000                                          DAG.getIntPtrConstant(0, dl));
22001}
22002
22003static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22004  SDValue N0 = Op.getOperand(0);
22005  SDLoc dl(Op);
22006  MVT VT = Op.getSimpleValueType();
22007
22008  MVT OpVT = N0.getSimpleValueType();
22009  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22010         "Unexpected type for FGETSIGN");
22011
22012  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
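  // MOVMSK copies the sign bit of each vector element into the low bits of a
  // GPR, so bit 0 of the result is the sign of the scalar input.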
22013  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22014  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22015  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22016  Res = DAG.getZExtOrTrunc(Res, dl, VT);
22017  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22018  return Res;
22019}
22020
/// Helper for creating an X86ISD::SETCC node.
22022static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22023                        SelectionDAG &DAG) {
22024  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22025                     DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22026}
22027
22028/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22029/// style scalarized (associative) reduction patterns. Partial reductions
22030/// are supported when the pointer SrcMask is non-null.
22031/// TODO - move this to SelectionDAG?
22032static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22033                                 SmallVectorImpl<SDValue> &SrcOps,
22034                                 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22035  SmallVector<SDValue, 8> Opnds;
22036  DenseMap<SDValue, APInt> SrcOpMap;
22037  EVT VT = MVT::Other;
22038
  // Recognize a special case where a vector is cast into a wide integer to
  // test all 0s.
22041  assert(Op.getOpcode() == unsigned(BinOp) &&
22042         "Unexpected bit reduction opcode");
22043  Opnds.push_back(Op.getOperand(0));
22044  Opnds.push_back(Op.getOperand(1));
22045
22046  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22047    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22048    // BFS traverse all BinOp operands.
22049    if (I->getOpcode() == unsigned(BinOp)) {
22050      Opnds.push_back(I->getOperand(0));
22051      Opnds.push_back(I->getOperand(1));
22052      // Re-evaluate the number of nodes to be traversed.
22053      e += 2; // 2 more nodes (LHS and RHS) are pushed.
22054      continue;
22055    }
22056
    // Quit if this is not an EXTRACT_VECTOR_ELT.
22058    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22059      return false;
22060
    // Quit if the index is not a constant.
22062    auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22063    if (!Idx)
22064      return false;
22065
22066    SDValue Src = I->getOperand(0);
22067    DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22068    if (M == SrcOpMap.end()) {
22069      VT = Src.getValueType();
22070      // Quit if not the same type.
22071      if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22072        return false;
22073      unsigned NumElts = VT.getVectorNumElements();
22074      APInt EltCount = APInt::getNullValue(NumElts);
22075      M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22076      SrcOps.push_back(Src);
22077    }
22078
22079    // Quit if element already used.
22080    unsigned CIdx = Idx->getZExtValue();
22081    if (M->second[CIdx])
22082      return false;
22083    M->second.setBit(CIdx);
22084  }
22085
22086  if (SrcMask) {
22087    // Collect the source partial masks.
22088    for (SDValue &SrcOp : SrcOps)
22089      SrcMask->push_back(SrcOpMap[SrcOp]);
22090  } else {
22091    // Quit if not all elements are used.
22092    for (const auto &I : SrcOpMap)
22093      if (!I.second.isAllOnesValue())
22094        return false;
22095  }
22096
22097  return true;
22098}
22099
22100// Helper function for comparing all bits of a vector against zero.
22101static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
22102                                  const APInt &Mask,
22103                                  const X86Subtarget &Subtarget,
22104                                  SelectionDAG &DAG, X86::CondCode &X86CC) {
22105  EVT VT = V.getValueType();
22106  unsigned ScalarSize = VT.getScalarSizeInBits();
22107  if (Mask.getBitWidth() != ScalarSize) {
22108    assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22109    return SDValue();
22110  }
22111
22112  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22113  X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22114
22115  auto MaskBits = [&](SDValue Src) {
22116    if (Mask.isAllOnesValue())
22117      return Src;
22118    EVT SrcVT = Src.getValueType();
22119    SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22120    return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22121  };
22122
22123  // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22124  if (VT.getSizeInBits() < 128) {
22125    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22126    if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
22127      return SDValue();
22128    return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22129                       DAG.getBitcast(IntVT, MaskBits(V)),
22130                       DAG.getConstant(0, DL, IntVT));
22131  }
22132
22133  // Quit if not splittable to 128/256-bit vector.
22134  if (!isPowerOf2_32(VT.getSizeInBits()))
22135    return SDValue();
22136
22137  // Split down to 128/256-bit vector.
22138  unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
22139  while (VT.getSizeInBits() > TestSize) {
22140    auto Split = DAG.SplitVector(V, DL);
22141    VT = Split.first.getValueType();
22142    V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22143  }
22144
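  // PTEST sets ZF when the AND of its operands is zero; with both operands
  // equal to V this tests whether V is all zeros, and the E/NE condition
  // chosen above reads ZF.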
22145  bool UsePTEST = Subtarget.hasSSE41();
22146  if (UsePTEST) {
22147    MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
22148    V = DAG.getBitcast(TestVT, MaskBits(V));
22149    return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22150  }
22151
22152  // Without PTEST, a masked v2i64 or-reduction is not faster than
22153  // scalarization.
22154  if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
22155      return SDValue();
22156
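  // PCMPEQB against zero yields 0xFF for every zero byte; MOVMSK collects the
  // 16 byte sign bits, which equal 0xFFFF iff all bytes of V were zero.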
22157  V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
22158  V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
22159                  getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
22160  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22161  return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22162                     DAG.getConstant(0xFFFF, DL, MVT::i32));
22163}
22164
// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back
// to CMP(MOVMSK(PCMPEQB(X,0))).
22167static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
22168                                      const SDLoc &DL,
22169                                      const X86Subtarget &Subtarget,
22170                                      SelectionDAG &DAG, SDValue &X86CC) {
22171  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22172
22173  if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22174    return SDValue();
22175
22176  // Check whether we're masking/truncating an OR-reduction result, in which
22177  // case track the masked bits.
22178  APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
22179  switch (Op.getOpcode()) {
22180  case ISD::TRUNCATE: {
22181    SDValue Src = Op.getOperand(0);
22182    Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22183                                Op.getScalarValueSizeInBits());
22184    Op = Src;
22185    break;
22186  }
22187  case ISD::AND: {
22188    if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22189      Mask = Cst->getAPIntValue();
22190      Op = Op.getOperand(0);
22191    }
22192    break;
22193  }
22194  }
22195
22196  SmallVector<SDValue, 8> VecIns;
22197  if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
22198    EVT VT = VecIns[0].getValueType();
22199    assert(llvm::all_of(VecIns,
22200                        [VT](SDValue V) { return VT == V.getValueType(); }) &&
22201           "Reduction source vector mismatch");
22202
    // Quit if less than 128 bits or not splittable to a 128/256-bit vector.
22204    if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
22205      return SDValue();
22206
22207    // If more than one full vector is evaluated, OR them first before PTEST.
22208    for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22209         Slot += 2, e += 1) {
22210      // Each iteration will OR 2 nodes and append the result until there is
22211      // only 1 node left, i.e. the final OR'd value of all vectors.
22212      SDValue LHS = VecIns[Slot];
22213      SDValue RHS = VecIns[Slot + 1];
22214      VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
22215    }
22216
22217    X86::CondCode CCode;
22218    if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
22219                                       DAG, CCode)) {
22220      X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22221      return V;
22222    }
22223  }
22224
22225  if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22226    ISD::NodeType BinOp;
22227    if (SDValue Match =
22228            DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
22229      X86::CondCode CCode;
22230      if (SDValue V =
22231              LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
22232        X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22233        return V;
22234      }
22235    }
22236  }
22237
22238  return SDValue();
22239}
22240
/// Return true if \c Op has a use that doesn't just read flags.
22242static bool hasNonFlagsUse(SDValue Op) {
22243  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22244       ++UI) {
22245    SDNode *User = *UI;
22246    unsigned UOpNo = UI.getOperandNo();
22247    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past the truncate.
22249      UOpNo = User->use_begin().getOperandNo();
22250      User = *User->use_begin();
22251    }
22252
22253    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22254        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22255      return true;
22256  }
22257  return false;
22258}
22259
22260// Transform to an x86-specific ALU node with flags if there is a chance of
22261// using an RMW op or only the flags are used. Otherwise, leave
22262// the node alone and emit a 'cmp' or 'test' instruction.
22263static bool isProfitableToUseFlagOp(SDValue Op) {
22264  for (SDNode *U : Op->uses())
22265    if (U->getOpcode() != ISD::CopyToReg &&
22266        U->getOpcode() != ISD::SETCC &&
22267        U->getOpcode() != ISD::STORE)
22268      return false;
22269
22270  return true;
22271}
22272
22273/// Emit nodes that will be selected as "test Op0,Op0", or something
22274/// equivalent.
22275static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22276                        SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22277  // CF and OF aren't always set the way we want. Determine which
22278  // of these we need.
22279  bool NeedCF = false;
22280  bool NeedOF = false;
22281  switch (X86CC) {
22282  default: break;
22283  case X86::COND_A: case X86::COND_AE:
22284  case X86::COND_B: case X86::COND_BE:
22285    NeedCF = true;
22286    break;
22287  case X86::COND_G: case X86::COND_GE:
22288  case X86::COND_L: case X86::COND_LE:
22289  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the Overflow flag. If NoSignedWrap is
    // present, it is not actually needed.
22293    switch (Op->getOpcode()) {
22294    case ISD::ADD:
22295    case ISD::SUB:
22296    case ISD::MUL:
22297    case ISD::SHL:
22298      if (Op.getNode()->getFlags().hasNoSignedWrap())
22299        break;
22300      LLVM_FALLTHROUGH;
22301    default:
22302      NeedOF = true;
22303      break;
22304    }
22305    break;
22306  }
22307  }
22308  // See if we can use the EFLAGS value from the operand instead of
22309  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22310  // we prove that the arithmetic won't overflow, we can't use OF or CF.
22311  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22312    // Emit a CMP with 0, which is the TEST pattern.
22313    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22314                       DAG.getConstant(0, dl, Op.getValueType()));
22315  }
22316  unsigned Opcode = 0;
22317  unsigned NumOperands = 0;
22318
22319  SDValue ArithOp = Op;
22320
  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
  // which may be the result of a CAST. We use the variable 'Op', the non-cast
  // value, when we check for possible users.
22324  switch (ArithOp.getOpcode()) {
22325  case ISD::AND:
22326    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22327    // because a TEST instruction will be better.
22328    if (!hasNonFlagsUse(Op))
22329      break;
22330
22331    LLVM_FALLTHROUGH;
22332  case ISD::ADD:
22333  case ISD::SUB:
22334  case ISD::OR:
22335  case ISD::XOR:
22336    if (!isProfitableToUseFlagOp(Op))
22337      break;
22338
22339    // Otherwise use a regular EFLAGS-setting instruction.
22340    switch (ArithOp.getOpcode()) {
22341    default: llvm_unreachable("unexpected operator!");
22342    case ISD::ADD: Opcode = X86ISD::ADD; break;
22343    case ISD::SUB: Opcode = X86ISD::SUB; break;
22344    case ISD::XOR: Opcode = X86ISD::XOR; break;
22345    case ISD::AND: Opcode = X86ISD::AND; break;
22346    case ISD::OR:  Opcode = X86ISD::OR;  break;
22347    }
22348
22349    NumOperands = 2;
22350    break;
22351  case X86ISD::ADD:
22352  case X86ISD::SUB:
22353  case X86ISD::OR:
22354  case X86ISD::XOR:
22355  case X86ISD::AND:
22356    return SDValue(Op.getNode(), 1);
22357  case ISD::SSUBO:
22358  case ISD::USUBO: {
    // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22360    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22361    return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22362                       Op->getOperand(1)).getValue(1);
22363  }
22364  default:
22365    break;
22366  }
22367
22368  if (Opcode == 0) {
22369    // Emit a CMP with 0, which is the TEST pattern.
22370    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22371                       DAG.getConstant(0, dl, Op.getValueType()));
22372  }
22373  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22374  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22375
22376  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22377  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22378  return SDValue(New.getNode(), 1);
22379}
22380
22381/// Emit nodes that will be selected as "cmp Op0,Op1", or something
22382/// equivalent.
22383static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22384                       const SDLoc &dl, SelectionDAG &DAG,
22385                       const X86Subtarget &Subtarget) {
22386  if (isNullConstant(Op1))
22387    return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22388
22389  EVT CmpVT = Op0.getValueType();
22390
22391  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22392          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22393
  // Only promote the compare up to i32 if it is a 16-bit operation
  // with an immediate; 16-bit immediates are to be avoided.
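  // A 16-bit immediate forces an operand-size (0x66) prefix on an instruction
  // with an immediate, a length-changing prefix that stalls the decoders on
  // many x86 microarchitectures.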
22396  if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22397      !DAG.getMachineFunction().getFunction().hasMinSize()) {
22398    ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22399    ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
    // Don't do this if the immediate can fit in 8 bits.
22401    if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22402        (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22403      unsigned ExtendOp =
22404          isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22405      if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22406        // For equality comparisons try to use SIGN_EXTEND if the input was
        // truncated from something with enough sign bits.
22408        if (Op0.getOpcode() == ISD::TRUNCATE) {
22409          SDValue In = Op0.getOperand(0);
22410          unsigned EffBits =
22411              In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22412          if (EffBits <= 16)
22413            ExtendOp = ISD::SIGN_EXTEND;
22414        } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22415          SDValue In = Op1.getOperand(0);
22416          unsigned EffBits =
22417              In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22418          if (EffBits <= 16)
22419            ExtendOp = ISD::SIGN_EXTEND;
22420        }
22421      }
22422
22423      CmpVT = MVT::i32;
22424      Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22425      Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22426    }
22427  }
22428
22429  // Try to shrink i64 compares if the input has enough zero bits.
22430  // FIXME: Do this for non-constant compares for constant on LHS?
22431  if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
22432      Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22433      cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
22434      DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22435    CmpVT = MVT::i32;
22436    Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22437    Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22438  }
22439
22440  // 0-x == y --> x+y == 0
22441  // 0-x != y --> x+y != 0
22442  if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22443      Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22444    SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22445    SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22446    return Add.getValue(1);
22447  }
22448
22449  // x == 0-y --> x+y == 0
22450  // x != 0-y --> x+y != 0
22451  if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22452      Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22453    SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22454    SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22455    return Add.getValue(1);
22456  }
22457
22458  // Use SUB instead of CMP to enable CSE between SUB and CMP.
22459  SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22460  SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22461  return Sub.getValue(1);
22462}
22463
22464/// Check if replacement of SQRT with RSQRT should be disabled.
22465bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22466  EVT VT = Op.getValueType();
22467
22468  // We never want to use both SQRT and RSQRT instructions for the same input.
22469  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22470    return false;
22471
22472  if (VT.isVector())
22473    return Subtarget.hasFastVectorFSQRT();
22474  return Subtarget.hasFastScalarFSQRT();
22475}
22476
22477/// The minimum architected relative accuracy is 2^-12. We need one
22478/// Newton-Raphson step to have a good float result (24 bits of precision).
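/// (As a reference, one Newton-Raphson step for 1/sqrt(a) is typically
/// computed as x1 = x0 * (1.5 - 0.5 * a * x0 * x0).)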
22479SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22480                                           SelectionDAG &DAG, int Enabled,
22481                                           int &RefinementSteps,
22482                                           bool &UseOneConstNR,
22483                                           bool Reciprocal) const {
22484  EVT VT = Op.getValueType();
22485
22486  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22487  // It is likely not profitable to do this for f64 because a double-precision
22488  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22489  // instructions: convert to single, rsqrtss, convert back to double, refine
22490  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22491  // along with FMA, this could be a throughput win.
22492  // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22493  // after legalize types.
22494  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22495      (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22496      (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22497      (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22498      (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22499    if (RefinementSteps == ReciprocalEstimate::Unspecified)
22500      RefinementSteps = 1;
22501
22502    UseOneConstNR = false;
22503    // There is no FRSQRT for 512-bit vectors, but there is RSQRT14.
22504    unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22505    return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22506  }
22507  return SDValue();
22508}
22509
22510/// The minimum architected relative accuracy is 2^-12. We need one
22511/// Newton-Raphson step to have a good float result (24 bits of precision).
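/// (As a reference, one Newton-Raphson step for 1/a is typically computed as
/// x1 = x0 * (2.0 - a * x0).)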
22512SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22513                                            int Enabled,
22514                                            int &RefinementSteps) const {
22515  EVT VT = Op.getValueType();
22516
22517  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22518  // It is likely not profitable to do this for f64 because a double-precision
22519  // reciprocal estimate with refinement on x86 prior to FMA requires
22520  // 15 instructions: convert to single, rcpss, convert back to double, refine
22521  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22522  // along with FMA, this could be a throughput win.
22523
22524  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22525      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22526      (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22527      (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22528    // Enable estimate codegen with 1 refinement step for vector division.
22529    // Scalar division estimates are disabled because they break too much
22530    // real-world code. These defaults are intended to match GCC behavior.
22531    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22532      return SDValue();
22533
22534    if (RefinementSteps == ReciprocalEstimate::Unspecified)
22535      RefinementSteps = 1;
22536
22537    // There is no FRCP for 512-bit vectors, but there is RCP14.
22538    unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22539    return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22540  }
22541  return SDValue();
22542}
22543
22544/// If we have at least two divisions that use the same divisor, convert to
22545/// multiplication by a reciprocal. This may need to be adjusted for a given
22546/// CPU if a division's cost is not at least twice the cost of a multiplication.
22547/// This is because we still need one division to calculate the reciprocal and
22548/// then we need two multiplies by that reciprocal as replacements for the
22549/// original divisions.
22550unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22551  return 2;
22552}
22553
22554SDValue
22555X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22556                                 SelectionDAG &DAG,
22557                                 SmallVectorImpl<SDNode *> &Created) const {
22558  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22559  if (isIntDivCheap(N->getValueType(0), Attr))
22560    return SDValue(N, 0); // Lower SDIV as SDIV
22561
22562  assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
22563         "Unexpected divisor!");
22564
22565  // Only perform this transform if CMOV is supported otherwise the select
22566  // below will become a branch.
22567  if (!Subtarget.hasCMov())
22568    return SDValue();
22569
22570  // fold (sdiv X, pow2)
22571  EVT VT = N->getValueType(0);
22572  // FIXME: Support i8.
22573  if (VT != MVT::i16 && VT != MVT::i32 &&
22574      !(Subtarget.is64Bit() && VT == MVT::i64))
22575    return SDValue();
22576
22577  unsigned Lg2 = Divisor.countTrailingZeros();
22578
22579  // If the divisor is 2 or -2, the default expansion is better.
22580  if (Lg2 == 1)
22581    return SDValue();
22582
22583  SDLoc DL(N);
22584  SDValue N0 = N->getOperand(0);
22585  SDValue Zero = DAG.getConstant(0, DL, VT);
22586  APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
22587  SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
22588
22589  // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
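  // For example (sketch): for X sdiv 8 (Lg2 == 3) this emits the equivalent
  // of ((X < 0) ? X + 7 : X) >> 3, negating the result afterwards when the
  // divisor is negative (e.g. -8).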
22590  SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
22591  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
22592  SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
22593
22594  Created.push_back(Cmp.getNode());
22595  Created.push_back(Add.getNode());
22596  Created.push_back(CMov.getNode());
22597
22598  // Divide by pow2.
22599  SDValue SRA =
22600      DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
22601
22602  // If we're dividing by a positive value, we're done.  Otherwise, we must
22603  // negate the result.
22604  if (Divisor.isNonNegative())
22605    return SRA;
22606
22607  Created.push_back(SRA.getNode());
22608  return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
22609}
22610
22611/// Result of 'and' is compared against zero. Change to a BT node if possible.
22612/// Returns the BT node and the condition code needed to use it.
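/// For example, (and X, (shl 1, N)) and (and (srl X, N), 1) both test bit N
/// of X and can be lowered to BT X, N.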
22613static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
22614                            const SDLoc &dl, SelectionDAG &DAG,
22615                            SDValue &X86CC) {
22616  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22617  SDValue Op0 = And.getOperand(0);
22618  SDValue Op1 = And.getOperand(1);
22619  if (Op0.getOpcode() == ISD::TRUNCATE)
22620    Op0 = Op0.getOperand(0);
22621  if (Op1.getOpcode() == ISD::TRUNCATE)
22622    Op1 = Op1.getOperand(0);
22623
22624  SDValue Src, BitNo;
22625  if (Op1.getOpcode() == ISD::SHL)
22626    std::swap(Op0, Op1);
22627  if (Op0.getOpcode() == ISD::SHL) {
22628    if (isOneConstant(Op0.getOperand(0))) {
22629      // If we looked past a truncate, check that it's only truncating away
22630      // known zeros.
22631      unsigned BitWidth = Op0.getValueSizeInBits();
22632      unsigned AndBitWidth = And.getValueSizeInBits();
22633      if (BitWidth > AndBitWidth) {
22634        KnownBits Known = DAG.computeKnownBits(Op0);
22635        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22636          return SDValue();
22637      }
22638      Src = Op1;
22639      BitNo = Op0.getOperand(1);
22640    }
22641  } else if (Op1.getOpcode() == ISD::Constant) {
22642    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22643    uint64_t AndRHSVal = AndRHS->getZExtValue();
22644    SDValue AndLHS = Op0;
22645
22646    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22647      Src = AndLHS.getOperand(0);
22648      BitNo = AndLHS.getOperand(1);
22649    } else {
22650      // Use BT if the immediate can't be encoded in a TEST instruction or we
22651      // are optimizing for size and the immediate won't fit in a byte.
22652      bool OptForSize = DAG.shouldOptForSize();
22653      if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22654          isPowerOf2_64(AndRHSVal)) {
22655        Src = AndLHS;
22656        BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22657                                Src.getValueType());
22658      }
22659    }
22660  }
22661
22662  // No patterns found, give up.
22663  if (!Src.getNode())
22664    return SDValue();
22665
22666  // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
22667  // instruction.  Since the shift amount is in-range-or-undefined, we know
22668  // that doing a bittest on the i32 value is ok.  We extend to i32 because
22669  // the encoding for the i16 version is larger than the i32 version.
22670  // Also promote i16 to i32 for performance / code size reasons.
22671  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
22672    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
22673
22674  // See if we can use the 32-bit instruction instead of the 64-bit one for a
22675  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22676  // latter takes the modulo 64, this is only valid if bit 5 of BitNo is known
22677  // to be zero.
22678  if (Src.getValueType() == MVT::i64 &&
22679      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22680    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
22681
22682  // If the operand types disagree, extend the shift amount to match.  Since
22683  // BT ignores high bits (like shifts) we can use anyextend.
22684  if (Src.getValueType() != BitNo.getValueType())
22685    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
22686
22687  X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
22688                                dl, MVT::i8);
22689  return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
22690}
22691
22692/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
22693/// CMPs.
22694static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
22695                                   SDValue &Op1, bool &IsAlwaysSignaling) {
22696  unsigned SSECC;
22697  bool Swap = false;
22698
22699  // SSE Condition code mapping:
22700  //  0 - EQ
22701  //  1 - LT
22702  //  2 - LE
22703  //  3 - UNORD
22704  //  4 - NEQ
22705  //  5 - NLT
22706  //  6 - NLE
22707  //  7 - ORD
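  //  8 - EQ_UQ  (AVX-only immediate; pre-AVX callers handle this specially)
  // 12 - NEQ_OQ (AVX-only immediate; pre-AVX callers handle this specially)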
22708  switch (SetCCOpcode) {
22709  default: llvm_unreachable("Unexpected SETCC condition");
22710  case ISD::SETOEQ:
22711  case ISD::SETEQ:  SSECC = 0; break;
22712  case ISD::SETOGT:
22713  case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
22714  case ISD::SETLT:
22715  case ISD::SETOLT: SSECC = 1; break;
22716  case ISD::SETOGE:
22717  case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
22718  case ISD::SETLE:
22719  case ISD::SETOLE: SSECC = 2; break;
22720  case ISD::SETUO:  SSECC = 3; break;
22721  case ISD::SETUNE:
22722  case ISD::SETNE:  SSECC = 4; break;
22723  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
22724  case ISD::SETUGE: SSECC = 5; break;
22725  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
22726  case ISD::SETUGT: SSECC = 6; break;
22727  case ISD::SETO:   SSECC = 7; break;
22728  case ISD::SETUEQ: SSECC = 8; break;
22729  case ISD::SETONE: SSECC = 12; break;
22730  }
22731  if (Swap)
22732    std::swap(Op0, Op1);
22733
22734  switch (SetCCOpcode) {
22735  default:
22736    IsAlwaysSignaling = true;
22737    break;
22738  case ISD::SETEQ:
22739  case ISD::SETOEQ:
22740  case ISD::SETUEQ:
22741  case ISD::SETNE:
22742  case ISD::SETONE:
22743  case ISD::SETUNE:
22744  case ISD::SETO:
22745  case ISD::SETUO:
22746    IsAlwaysSignaling = false;
22747    break;
22748  }
22749
22750  return SSECC;
22751}
22752
22753/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
22754/// concatenate the result back.
22755static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
22756                              ISD::CondCode Cond, SelectionDAG &DAG,
22757                              const SDLoc &dl) {
22758  assert(VT.isInteger() && VT == LHS.getValueType() &&
22759         VT == RHS.getValueType() && "Unsupported VTs!");
22760
22761  SDValue CC = DAG.getCondCode(Cond);
22762
22763  // Extract the LHS Lo/Hi vectors
22764  SDValue LHS1, LHS2;
22765  std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
22766
22767  // Extract the RHS Lo/Hi vectors
22768  SDValue RHS1, RHS2;
22769  std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
22770
22771  // Issue the operation on the smaller types and concatenate the result back
22772  EVT LoVT, HiVT;
22773  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22774  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22775                     DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
22776                     DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
22777}
22778
22779static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
22780
22781  SDValue Op0 = Op.getOperand(0);
22782  SDValue Op1 = Op.getOperand(1);
22783  SDValue CC = Op.getOperand(2);
22784  MVT VT = Op.getSimpleValueType();
22785  SDLoc dl(Op);
22786
22787  assert(VT.getVectorElementType() == MVT::i1 &&
22788         "Cannot set masked compare for this operation");
22789
22790  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
22791
22792  // Prefer SETGT over SETLT.
22793  if (SetCCOpcode == ISD::SETLT) {
22794    SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
22795    std::swap(Op0, Op1);
22796  }
22797
22798  return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
22799}
22800
22801/// Given a buildvector constant, return a new vector constant with each element
22802/// incremented or decremented. If incrementing or decrementing would result in
22803/// unsigned overflow or underflow or this is not a simple vector constant,
22804/// return an empty value.
22805static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
22806  auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
22807  if (!BV)
22808    return SDValue();
22809
22810  MVT VT = V.getSimpleValueType();
22811  MVT EltVT = VT.getVectorElementType();
22812  unsigned NumElts = VT.getVectorNumElements();
22813  SmallVector<SDValue, 8> NewVecC;
22814  SDLoc DL(V);
22815  for (unsigned i = 0; i < NumElts; ++i) {
22816    auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
22817    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
22818      return SDValue();
22819
22820    // Avoid overflow/underflow.
22821    const APInt &EltC = Elt->getAPIntValue();
22822    if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
22823      return SDValue();
22824
22825    NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
22826  }
22827
22828  return DAG.getBuildVector(VT, DL, NewVecC);
22829}
22830
22831/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
22832/// Op0 u<= Op1:
22833///   t = psubus Op0, Op1
22834///   pcmpeq t, <0..0>
22835static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
22836                                    ISD::CondCode Cond, const SDLoc &dl,
22837                                    const X86Subtarget &Subtarget,
22838                                    SelectionDAG &DAG) {
22839  if (!Subtarget.hasSSE2())
22840    return SDValue();
22841
22842  MVT VET = VT.getVectorElementType();
22843  if (VET != MVT::i8 && VET != MVT::i16)
22844    return SDValue();
22845
22846  switch (Cond) {
22847  default:
22848    return SDValue();
22849  case ISD::SETULT: {
22850    // If the comparison is against a constant we can turn this into a
22851    // setule.  With psubus, setule does not require a swap.  This is
22852    // beneficial because the constant in the register is no longer
22853    // clobbered as the destination, so it can be hoisted out of a loop.
22854    // Only do this pre-AVX since vpcmp* is no longer destructive.
22855    if (Subtarget.hasAVX())
22856      return SDValue();
22857    SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
22858    if (!ULEOp1)
22859      return SDValue();
22860    Op1 = ULEOp1;
22861    break;
22862  }
22863  case ISD::SETUGT: {
22864    // If the comparison is against a constant, we can turn this into a setuge.
22865    // This is beneficial because materializing a constant 0 for the PCMPEQ is
22866    // probably cheaper than XOR+PCMPGT using 2 different vector constants:
22867    // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
22868    SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
22869    if (!UGEOp1)
22870      return SDValue();
22871    Op1 = Op0;
22872    Op0 = UGEOp1;
22873    break;
22874  }
22875  // Psubus is better than flip-sign because it requires no inversion.
22876  case ISD::SETUGE:
22877    std::swap(Op0, Op1);
22878    break;
22879  case ISD::SETULE:
22880    break;
22881  }
22882
22883  SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
22884  return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
22885                     DAG.getConstant(0, dl, VT));
22886}
22887
22888static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
22889                           SelectionDAG &DAG) {
22890  bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
22891                  Op.getOpcode() == ISD::STRICT_FSETCCS;
22892  SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
22893  SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
22894  SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
22895  MVT VT = Op->getSimpleValueType(0);
22896  ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
22897  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
22898  SDLoc dl(Op);
22899
22900  if (isFP) {
22901#ifndef NDEBUG
22902    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
22903    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
22904#endif
22905
22906    bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
22907    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22908
22909    // If we have a strict compare with a vXi1 result and the input is 128/256
22910    // bits we can't use a masked compare unless we have VLX. If we use a wider
22911    // compare like we do for non-strict, we might trigger spurious exceptions
22912    // from the upper elements. Instead emit an AVX compare and convert to mask.
22913    unsigned Opc;
22914    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
22915        (!IsStrict || Subtarget.hasVLX() ||
22916         Op0.getSimpleValueType().is512BitVector())) {
22917      assert(VT.getVectorNumElements() <= 16);
22918      Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
22919    } else {
22920      Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
22921      // The SSE/AVX packed FP comparison nodes are defined with a
22922      // floating-point vector result that matches the operand type. This allows
22923      // them to work with an SSE1 target (integer vector types are not legal).
22924      VT = Op0.getSimpleValueType();
22925    }
22926
22927    SDValue Cmp;
22928    bool IsAlwaysSignaling;
22929    unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
22930    if (!Subtarget.hasAVX()) {
22931      // TODO: We could use following steps to handle a quiet compare with
22932      // signaling encodings.
22933      // 1. Get ordered masks from a quiet ISD::SETO
22934      // 2. Use the masks to mask potential unordered elements in operand A, B
22935      // 3. Get the compare results of masked A, B
22936      // 4. Calculate the final result using the mask and the result from 3
22937      // But currently, we just fall back to scalar operations.
22938      if (IsStrict && IsAlwaysSignaling && !IsSignaling)
22939        return SDValue();
22940
22941      // Insert an extra signaling instruction to raise exception.
22942      if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
22943        SDValue SignalCmp = DAG.getNode(
22944            Opc, dl, {VT, MVT::Other},
22945            {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
22946        // FIXME: It seems we need to update the flags of all new strict nodes.
22947        // Otherwise, mayRaiseFPException in MI will return false due to
22948        // NoFPExcept = false by default. However, this does not appear to
22949        // be done in other patches.
22950        SignalCmp->setFlags(Op->getFlags());
22951        Chain = SignalCmp.getValue(1);
22952      }
22953
22954      // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
22955      // emit two comparisons and a logic op to tie them together.
22956      if (SSECC >= 8) {
22957        // LLVM predicate is SETUEQ or SETONE.
22958        unsigned CC0, CC1;
22959        unsigned CombineOpc;
22960        if (Cond == ISD::SETUEQ) {
22961          CC0 = 3; // UNORD
22962          CC1 = 0; // EQ
22963          CombineOpc = X86ISD::FOR;
22964        } else {
22965          assert(Cond == ISD::SETONE);
22966          CC0 = 7; // ORD
22967          CC1 = 4; // NEQ
22968          CombineOpc = X86ISD::FAND;
22969        }
22970
22971        SDValue Cmp0, Cmp1;
22972        if (IsStrict) {
22973          Cmp0 = DAG.getNode(
22974              Opc, dl, {VT, MVT::Other},
22975              {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
22976          Cmp1 = DAG.getNode(
22977              Opc, dl, {VT, MVT::Other},
22978              {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
22979          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
22980                              Cmp1.getValue(1));
22981        } else {
22982          Cmp0 = DAG.getNode(
22983              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
22984          Cmp1 = DAG.getNode(
22985              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
22986        }
22987        Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
22988      } else {
22989        if (IsStrict) {
22990          Cmp = DAG.getNode(
22991              Opc, dl, {VT, MVT::Other},
22992              {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
22993          Chain = Cmp.getValue(1);
22994        } else
22995          Cmp = DAG.getNode(
22996              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
22997      }
22998    } else {
22999      // Handle all other FP comparisons here.
23000      if (IsStrict) {
23001        // Bit 4 of the AVX CC toggles quiet vs. signaling; flip it if needed.
23002        SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23003        Cmp = DAG.getNode(
23004            Opc, dl, {VT, MVT::Other},
23005            {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23006        Chain = Cmp.getValue(1);
23007      } else
23008        Cmp = DAG.getNode(
23009            Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23010    }
23011
23012    if (VT.getFixedSizeInBits() >
23013        Op.getSimpleValueType().getFixedSizeInBits()) {
23014      // We emitted a compare with an XMM/YMM result. Finish converting to a
23015      // mask register using a vptestm.
23016      EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23017      Cmp = DAG.getBitcast(CastVT, Cmp);
23018      Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23019                         DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23020    } else {
23021      // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23022      // the result type of SETCC. The bitcast is expected to be optimized
23023      // away during combining/isel.
23024      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23025    }
23026
23027    if (IsStrict)
23028      return DAG.getMergeValues({Cmp, Chain}, dl);
23029
23030    return Cmp;
23031  }
23032
23033  assert(!IsStrict && "Strict SETCC only handles FP operands.");
23034
23035  MVT VTOp0 = Op0.getSimpleValueType();
23036  (void)VTOp0;
23037  assert(VTOp0 == Op1.getSimpleValueType() &&
23038         "Expected operands with same type!");
23039  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23040         "Invalid number of packed elements for source and destination!");
23041
23042  // The non-AVX512 code below works under the assumption that source and
23043  // destination types are the same.
23044  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23045         "Value types for source and destination must be the same!");
23046
23047  // The result is boolean, but operands are int/float
23048  if (VT.getVectorElementType() == MVT::i1) {
23049    // In AVX-512 architecture setcc returns mask with i1 elements,
23050    // But there is no compare instruction for i8 and i16 elements in KNL.
23051    assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23052           "Unexpected operand type");
23053    return LowerIntVSETCC_AVX512(Op, DAG);
23054  }
23055
23056  // Lower using XOP integer comparisons.
23057  if (VT.is128BitVector() && Subtarget.hasXOP()) {
23058    // Translate compare code to XOP PCOM compare mode.
23059    unsigned CmpMode = 0;
23060    switch (Cond) {
23061    default: llvm_unreachable("Unexpected SETCC condition");
23062    case ISD::SETULT:
23063    case ISD::SETLT: CmpMode = 0x00; break;
23064    case ISD::SETULE:
23065    case ISD::SETLE: CmpMode = 0x01; break;
23066    case ISD::SETUGT:
23067    case ISD::SETGT: CmpMode = 0x02; break;
23068    case ISD::SETUGE:
23069    case ISD::SETGE: CmpMode = 0x03; break;
23070    case ISD::SETEQ: CmpMode = 0x04; break;
23071    case ISD::SETNE: CmpMode = 0x05; break;
23072    }
23073
23074    // Are we comparing unsigned or signed integers?
23075    unsigned Opc =
23076        ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23077
23078    return DAG.getNode(Opc, dl, VT, Op0, Op1,
23079                       DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23080  }
23081
23082  // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23083  // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23084  if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23085    SDValue BC0 = peekThroughBitcasts(Op0);
23086    if (BC0.getOpcode() == ISD::AND) {
23087      APInt UndefElts;
23088      SmallVector<APInt, 64> EltBits;
23089      if (getTargetConstantBitsFromNode(BC0.getOperand(1),
23090                                        VT.getScalarSizeInBits(), UndefElts,
23091                                        EltBits, false, false)) {
23092        if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23093          Cond = ISD::SETEQ;
23094          Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23095        }
23096      }
23097    }
23098  }
23099
23100  // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23101  if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23102      Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23103    ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23104    if (C1 && C1->getAPIntValue().isPowerOf2()) {
23105      unsigned BitWidth = VT.getScalarSizeInBits();
23106      unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23107
23108      SDValue Result = Op0.getOperand(0);
23109      Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23110                           DAG.getConstant(ShiftAmt, dl, VT));
23111      Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23112                           DAG.getConstant(BitWidth - 1, dl, VT));
23113      return Result;
23114    }
23115  }
23116
23117  // Break 256-bit integer vector compare into smaller ones.
23118  if (VT.is256BitVector() && !Subtarget.hasInt256())
23119    return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23120
23121  if (VT == MVT::v32i16 || VT == MVT::v64i8) {
23122    assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
23123    return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23124  }
23125
23126  // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23127  // not-of-PCMPEQ:
23128  // X != INT_MIN --> X >s INT_MIN
23129  // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23130  // +X != 0 --> +X >s 0
23131  APInt ConstValue;
23132  if (Cond == ISD::SETNE &&
23133      ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23134    if (ConstValue.isMinSignedValue())
23135      Cond = ISD::SETGT;
23136    else if (ConstValue.isMaxSignedValue())
23137      Cond = ISD::SETLT;
23138    else if (ConstValue.isNullValue() && DAG.SignBitIsZero(Op0))
23139      Cond = ISD::SETGT;
23140  }
23141
23142  // If both operands are known non-negative, then an unsigned compare is the
23143  // same as a signed compare and there's no need to flip signbits.
23144  // TODO: We could check for more general simplifications here since we're
23145  // computing known bits.
23146  bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23147                   !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23148
23149  // Special case: Use min/max operations for unsigned compares.
23150  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23151  if (ISD::isUnsignedIntSetCC(Cond) &&
23152      (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23153      TLI.isOperationLegal(ISD::UMIN, VT)) {
23154    // If we have a constant operand, increment/decrement it and change the
23155    // condition to avoid an invert.
23156    if (Cond == ISD::SETUGT) {
23157      // X > C --> X >= (C+1) --> X == umax(X, C+1)
23158      if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
23159        Op1 = UGTOp1;
23160        Cond = ISD::SETUGE;
23161      }
23162    }
23163    if (Cond == ISD::SETULT) {
23164      // X < C --> X <= (C-1) --> X == umin(X, C-1)
23165      if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
23166        Op1 = ULTOp1;
23167        Cond = ISD::SETULE;
23168      }
23169    }
23170    bool Invert = false;
23171    unsigned Opc;
23172    switch (Cond) {
23173    default: llvm_unreachable("Unexpected condition code");
23174    case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
23175    case ISD::SETULE: Opc = ISD::UMIN; break;
23176    case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
23177    case ISD::SETUGE: Opc = ISD::UMAX; break;
23178    }
23179
23180    SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23181    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23182
23183    // If the logical-not of the result is required, perform that now.
23184    if (Invert)
23185      Result = DAG.getNOT(dl, Result, VT);
23186
23187    return Result;
23188  }
23189
23190  // Try to use SUBUS and PCMPEQ.
23191  if (FlipSigns)
23192    if (SDValue V =
23193            LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23194      return V;
23195
23196  // We are handling one of the integer comparisons here. Since SSE only has
23197  // GT and EQ comparisons for integer, swapping operands and multiple
23198  // operations may be required for some comparisons.
23199  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23200                                                            : X86ISD::PCMPGT;
23201  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23202              Cond == ISD::SETGE || Cond == ISD::SETUGE;
23203  bool Invert = Cond == ISD::SETNE ||
23204                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23205
23206  if (Swap)
23207    std::swap(Op0, Op1);
23208
23209  // Check that the operation in question is available (most are plain SSE2,
23210  // but PCMPGTQ and PCMPEQQ have different requirements).
23211  if (VT == MVT::v2i64) {
23212    if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23213      assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23214
23215      // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23216      // the odd elements over the even elements.
23217      if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23218        Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23219        Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23220
23221        SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23222        static const int MaskHi[] = { 1, 1, 3, 3 };
23223        SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23224
23225        return DAG.getBitcast(VT, Result);
23226      }
23227
23228      if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23229        Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23230        Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23231
23232        SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23233        static const int MaskHi[] = { 1, 1, 3, 3 };
23234        SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23235
23236        return DAG.getBitcast(VT, Result);
23237      }
23238
23239      // Since SSE has no unsigned integer comparisons, we need to flip the sign
23240      // bits of the inputs before performing those operations. The lower
23241      // compare is always unsigned.
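      // Hence the sign bit of the low 32-bit halves is always flipped, while
      // the high halves are flipped only for an unsigned 64-bit compare.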
23242      SDValue SB;
23243      if (FlipSigns) {
23244        SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
23245      } else {
23246        SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
23247      }
23248      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23249      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23250
23251      // Cast everything to the right type.
23252      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23253      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23254
23255      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23256      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23257      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23258
23259      // Create masks for only the low parts/high parts of the 64 bit integers.
23260      static const int MaskHi[] = { 1, 1, 3, 3 };
23261      static const int MaskLo[] = { 0, 0, 2, 2 };
23262      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23263      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23264      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23265
23266      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23267      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23268
23269      if (Invert)
23270        Result = DAG.getNOT(dl, Result, MVT::v4i32);
23271
23272      return DAG.getBitcast(VT, Result);
23273    }
23274
23275    if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23276      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23277      // pcmpeqd + pshufd + pand.
23278      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23279
23280      // First cast everything to the right type.
23281      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23282      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23283
23284      // Do the compare.
23285      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23286
23287      // Make sure the lower and upper halves are both all-ones.
23288      static const int Mask[] = { 1, 0, 3, 2 };
23289      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23290      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23291
23292      if (Invert)
23293        Result = DAG.getNOT(dl, Result, MVT::v4i32);
23294
23295      return DAG.getBitcast(VT, Result);
23296    }
23297  }
23298
23299  // Since SSE has no unsigned integer comparisons, we need to flip the sign
23300  // bits of the inputs before performing those operations.
23301  if (FlipSigns) {
23302    MVT EltVT = VT.getVectorElementType();
23303    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23304                                 VT);
23305    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23306    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23307  }
23308
23309  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23310
23311  // If the logical-not of the result is required, perform that now.
23312  if (Invert)
23313    Result = DAG.getNOT(dl, Result, VT);
23314
23315  return Result;
23316}
23317
23318// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23319static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23320                              const SDLoc &dl, SelectionDAG &DAG,
23321                              const X86Subtarget &Subtarget,
23322                              SDValue &X86CC) {
23323  // Only support equality comparisons.
23324  if (CC != ISD::SETEQ && CC != ISD::SETNE)
23325    return SDValue();
23326
23327  // Must be a bitcast from vXi1.
23328  if (Op0.getOpcode() != ISD::BITCAST)
23329    return SDValue();
23330
23331  Op0 = Op0.getOperand(0);
23332  MVT VT = Op0.getSimpleValueType();
23333  if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23334      !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23335      !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23336    return SDValue();
23337
23338  X86::CondCode X86Cond;
23339  if (isNullConstant(Op1)) {
23340    X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23341  } else if (isAllOnesConstant(Op1)) {
23342    // C flag is set for all ones.
23343    X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23344  } else
23345    return SDValue();
23346
23347  // If the input is an AND, we can combine its operands into the KTEST.
23348  bool KTestable = false;
23349  if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23350    KTestable = true;
23351  if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23352    KTestable = true;
23353  if (!isNullConstant(Op1))
23354    KTestable = false;
23355  if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23356    SDValue LHS = Op0.getOperand(0);
23357    SDValue RHS = Op0.getOperand(1);
23358    X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23359    return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23360  }
23361
23362  // If the input is an OR, we can combine its operands into the KORTEST.
23363  SDValue LHS = Op0;
23364  SDValue RHS = Op0;
23365  if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23366    LHS = Op0.getOperand(0);
23367    RHS = Op0.getOperand(1);
23368  }
23369
23370  X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23371  return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23372}
23373
23374/// Emit flags for the given setcc condition and operands. Also returns the
23375/// corresponding X86 condition code constant in X86CC.
23376SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23377                                             ISD::CondCode CC, const SDLoc &dl,
23378                                             SelectionDAG &DAG,
23379                                             SDValue &X86CC) const {
23380  // Optimize to BT if possible.
23381  // Lower (X & (1 << N)) == 0 to BT(X, N).
23382  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23383  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23384  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
23385      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23386    if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
23387      return BT;
23388  }
23389
23390  // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
23391  // TODO: We could do AND tree with all 1s as well by using the C flag.
23392  if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
23393    if (SDValue CmpZ =
23394            MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
23395      return CmpZ;
23396
23397  // Try to lower using KORTEST or KTEST.
23398  if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23399    return Test;
23400
23401  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
23402  // these.
23403  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
23404      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23405    // If the input is a setcc, then reuse the input setcc or use a new one with
23406    // the inverted condition.
23407    if (Op0.getOpcode() == X86ISD::SETCC) {
23408      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23409
23410      X86CC = Op0.getOperand(0);
23411      if (Invert) {
23412        X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23413        CCode = X86::GetOppositeBranchCondition(CCode);
23414        X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23415      }
23416
23417      return Op0.getOperand(1);
23418    }
23419  }
23420
23421  // Try to use the carry flag from the add in place of a separate CMP for:
23422  // (seteq (add X, -1), -1). Similar for setne.
23423  if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23424      Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23425    if (isProfitableToUseFlagOp(Op0)) {
23426      SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23427
23428      SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23429                                Op0.getOperand(1));
23430      DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23431      X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23432      X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23433      return SDValue(New.getNode(), 1);
23434    }
23435  }
23436
23437  X86::CondCode CondCode =
23438      TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23439  assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23440
23441  SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23442  X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23443  return EFLAGS;
23444}
23445
23446SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23447
23448  bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23449                  Op.getOpcode() == ISD::STRICT_FSETCCS;
23450  MVT VT = Op->getSimpleValueType(0);
23451
23452  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23453
23454  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23455  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23456  SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23457  SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23458  SDLoc dl(Op);
23459  ISD::CondCode CC =
23460      cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23461
23462  // Handle f128 first, since one possible outcome is a normal integer
23463  // comparison which gets handled by emitFlagsForSetcc.
23464  if (Op0.getValueType() == MVT::f128) {
23465    softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23466                        Op.getOpcode() == ISD::STRICT_FSETCCS);
23467
23468    // If softenSetCCOperands returned a scalar, use it.
23469    if (!Op1.getNode()) {
23470      assert(Op0.getValueType() == Op.getValueType() &&
23471             "Unexpected setcc expansion!");
23472      if (IsStrict)
23473        return DAG.getMergeValues({Op0, Chain}, dl);
23474      return Op0;
23475    }
23476  }
23477
23478  if (Op0.getSimpleValueType().isInteger()) {
23479    SDValue X86CC;
23480    SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23481    SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23482    return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23483  }
23484
23485  // Handle floating point.
23486  X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23487  if (CondCode == X86::COND_INVALID)
23488    return SDValue();
23489
23490  SDValue EFLAGS;
23491  if (IsStrict) {
23492    bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23493    EFLAGS =
23494        DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23495                    dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23496    Chain = EFLAGS.getValue(1);
23497  } else {
23498    EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23499  }
23500
23501  SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23502  SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23503  return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23504}
23505
23506SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23507  SDValue LHS = Op.getOperand(0);
23508  SDValue RHS = Op.getOperand(1);
23509  SDValue Carry = Op.getOperand(2);
23510  SDValue Cond = Op.getOperand(3);
23511  SDLoc DL(Op);
23512
23513  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23514  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23515
23516  // Recreate the carry if needed.
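  // Adding all-ones (-1) to the incoming carry value sets CF exactly when that
  // value is nonzero, re-materializing the carry in EFLAGS for the SBB below.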
23517  EVT CarryVT = Carry.getValueType();
23518  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23519                      Carry, DAG.getAllOnesConstant(DL, CarryVT));
23520
23521  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23522  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23523  return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23524}
23525
23526// This function produces three things: the arithmetic computation itself
23527// (Value), an EFLAGS result (Overflow), and a condition code (Cond, returned
23528// via the out-parameter). The flag and the condition code define the case in
23529// which the arithmetic computation overflows.
23530static std::pair<SDValue, SDValue>
23531getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23532  assert(Op.getResNo() == 0 && "Unexpected result number!");
23533  SDValue Value, Overflow;
23534  SDValue LHS = Op.getOperand(0);
23535  SDValue RHS = Op.getOperand(1);
23536  unsigned BaseOp = 0;
23537  SDLoc DL(Op);
23538  switch (Op.getOpcode()) {
23539  default: llvm_unreachable("Unknown ovf instruction!");
23540  case ISD::SADDO:
23541    BaseOp = X86ISD::ADD;
23542    Cond = X86::COND_O;
23543    break;
23544  case ISD::UADDO:
23545    BaseOp = X86ISD::ADD;
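    // An unsigned add of 1 overflows exactly when the result wraps to zero, so
    // ZF can be checked instead of CF (which likely allows selecting INC).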
23546    Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23547    break;
23548  case ISD::SSUBO:
23549    BaseOp = X86ISD::SUB;
23550    Cond = X86::COND_O;
23551    break;
23552  case ISD::USUBO:
23553    BaseOp = X86ISD::SUB;
23554    Cond = X86::COND_B;
23555    break;
23556  case ISD::SMULO:
23557    BaseOp = X86ISD::SMUL;
23558    Cond = X86::COND_O;
23559    break;
23560  case ISD::UMULO:
23561    BaseOp = X86ISD::UMUL;
23562    Cond = X86::COND_O;
23563    break;
23564  }
23565
23566  if (BaseOp) {
23567    // Also sets EFLAGS.
23568    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23569    Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23570    Overflow = Value.getValue(1);
23571  }
23572
23573  return std::make_pair(Value, Overflow);
23574}
23575
23576static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23577  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23578  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23579  // looks for this combo and may remove the "setcc" instruction if the "setcc"
23580  // has only one use.
23581  SDLoc DL(Op);
23582  X86::CondCode Cond;
23583  SDValue Value, Overflow;
23584  std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23585
23586  SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23587  assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23588  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23589}
23590
23591/// Return true if opcode is an X86 logical comparison.
23592static bool isX86LogicalCmp(SDValue Op) {
23593  unsigned Opc = Op.getOpcode();
23594  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23595      Opc == X86ISD::FCMP)
23596    return true;
23597  if (Op.getResNo() == 1 &&
23598      (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23599       Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23600       Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23601    return true;
23602
23603  return false;
23604}
23605
23606static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23607  if (V.getOpcode() != ISD::TRUNCATE)
23608    return false;
23609
23610  SDValue VOp0 = V.getOperand(0);
23611  unsigned InBits = VOp0.getValueSizeInBits();
23612  unsigned Bits = V.getValueSizeInBits();
23613  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23614}
23615
23616SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23617  bool AddTest = true;
23618  SDValue Cond  = Op.getOperand(0);
23619  SDValue Op1 = Op.getOperand(1);
23620  SDValue Op2 = Op.getOperand(2);
23621  SDLoc DL(Op);
23622  MVT VT = Op1.getSimpleValueType();
23623  SDValue CC;
23624
23625  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
23626  // are available, or into VBLENDV if AVX is available.
23627  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
23628  if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
23629      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
23630    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
23631    bool IsAlwaysSignaling;
23632    unsigned SSECC =
23633        translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
23634                           CondOp0, CondOp1, IsAlwaysSignaling);
23635
23636    if (Subtarget.hasAVX512()) {
23637      SDValue Cmp =
23638          DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
23639                      DAG.getTargetConstant(SSECC, DL, MVT::i8));
23640      assert(!VT.isVector() && "Not a scalar type?");
23641      return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23642    }
23643
23644    if (SSECC < 8 || Subtarget.hasAVX()) {
23645      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
23646                                DAG.getTargetConstant(SSECC, DL, MVT::i8));
23647
23648      // If we have AVX, we can use a variable vector select (VBLENDV) instead
23649      // of 3 logic instructions for size savings and potentially speed.
23650      // Unfortunately, there is no scalar form of VBLENDV.
23651
23652      // If either operand is a +0.0 constant, don't try this. We can expect to
23653      // optimize away at least one of the logic instructions later in that
23654      // case, so that sequence would be faster than a variable blend.
23655
23656      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
23657      // uses XMM0 as the selection register. That may need just as many
23658      // instructions as the AND/ANDN/OR sequence due to register moves, so
23659      // don't bother.
23660      if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
23661          !isNullFPConstant(Op2)) {
23662        // Convert to vectors, do a VSELECT, and convert back to scalar.
23663        // All of the conversions should be optimized away.
23664        MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
23665        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
23666        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
23667        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
23668
23669        MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
23670        VCmp = DAG.getBitcast(VCmpVT, VCmp);
23671
23672        SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
23673
23674        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
23675                           VSel, DAG.getIntPtrConstant(0, DL));
23676      }
23677      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
23678      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
23679      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
23680    }
23681  }
23682
23683  // AVX512 fallback is to lower selects of scalar floats to masked moves.
23684  if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
23685    SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
23686    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23687  }
23688
23689  if (Cond.getOpcode() == ISD::SETCC) {
23690    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
23691      Cond = NewCond;
23692      // If the condition was updated, it's possible that the operands of the
23693      // select were also updated (for example, EmitTest has a RAUW). Refresh
23694      // the local references to the select operands in case they got stale.
23695      Op1 = Op.getOperand(1);
23696      Op2 = Op.getOperand(2);
23697    }
23698  }
23699
23700  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
23701  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
23702  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
23703  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
23704  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
23705  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
23706  if (Cond.getOpcode() == X86ISD::SETCC &&
23707      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
23708      isNullConstant(Cond.getOperand(1).getOperand(1))) {
23709    SDValue Cmp = Cond.getOperand(1);
23710    SDValue CmpOp0 = Cmp.getOperand(0);
23711    unsigned CondCode = Cond.getConstantOperandVal(0);
23712
23713    // Special handling for __builtin_ffs(X) - 1 pattern which looks like
23714    // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
23715    // handling to keep the CMP with 0. This should be removed by
23716    // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
23717    // cttz_zero_undef.
23718    auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
23719      return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
23720              Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
23721    };
23722    if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
23723        ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
23724         (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
23725      // Keep Cmp.
23726    } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23727        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
23728      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
23729
23730      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23731      SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
23732
23733      // Apply further optimizations for special cases
23734      // (select (x != 0), -1, 0) -> neg & sbb
23735      // (select (x == 0), 0, -1) -> neg & sbb
23736      if (isNullConstant(Y) &&
23737          (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
23738        SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
23739        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
23740        Zero = DAG.getConstant(0, DL, Op.getValueType());
23741        return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
23742      }
23743
23744      Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
23745                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
23746
23747      SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
23748      SDValue Res =   // Res = 0 or -1.
23749        DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
23750
23751      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
23752        Res = DAG.getNOT(DL, Res, Res.getValueType());
23753
23754      return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
23755    } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
23756               Cmp.getOperand(0).getOpcode() == ISD::AND &&
23757               isOneConstant(Cmp.getOperand(0).getOperand(1))) {
23758      SDValue Src1, Src2;
23759      // Returns true if Op2 is an XOR or OR operation and one of its
23760      // operands equals Op1, i.e. the operand pair has the form
23761      // (a, a op b) or (b, a op b).
23762      auto isOrXorPattern = [&]() {
23763        if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
23764            (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
23765          Src1 =
23766              Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
23767          Src2 = Op1;
23768          return true;
23769        }
23770        return false;
23771      };
23772
23773      if (isOrXorPattern()) {
23774        SDValue Neg;
23775        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
23776        // We need a mask of all zeros or all ones with the same size as
23777        // the other operands.
23778        if (CmpSz > VT.getSizeInBits())
23779          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
23780        else if (CmpSz < VT.getSizeInBits())
23781          Neg = DAG.getNode(ISD::AND, DL, VT,
23782              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
23783              DAG.getConstant(1, DL, VT));
23784        else
23785          Neg = CmpOp0;
23786        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
23787                                   Neg); // -(and (x, 0x1))
23788        SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
23789        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
23790      }
23791    }
23792  }
23793
23794  // Look past (and (setcc_carry (cmp ...)), 1).
23795  if (Cond.getOpcode() == ISD::AND &&
23796      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
23797      isOneConstant(Cond.getOperand(1)))
23798    Cond = Cond.getOperand(0);
23799
23800  // If the condition flag is set by an X86ISD::CMP, then use it as the
23801  // condition-setting operand in place of the X86ISD::SETCC.
23802  unsigned CondOpcode = Cond.getOpcode();
23803  if (CondOpcode == X86ISD::SETCC ||
23804      CondOpcode == X86ISD::SETCC_CARRY) {
23805    CC = Cond.getOperand(0);
23806
23807    SDValue Cmp = Cond.getOperand(1);
23808    bool IllegalFPCMov = false;
23809    if (VT.isFloatingPoint() && !VT.isVector() &&
23810        !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov())  // FPStack?
23811      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
23812
23813    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
23814        Cmp.getOpcode() == X86ISD::BT) { // FIXME
23815      Cond = Cmp;
23816      AddTest = false;
23817    }
23818  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
23819             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
23820             CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
23821    SDValue Value;
23822    X86::CondCode X86Cond;
23823    std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
23824
23825    CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
23826    AddTest = false;
23827  }
23828
23829  if (AddTest) {
23830    // Look past the truncate if the high bits are known zero.
23831    if (isTruncWithZeroHighBitsInput(Cond, DAG))
23832      Cond = Cond.getOperand(0);
23833
23834    // We know the result of AND is compared against zero. Try to match
23835    // it to BT.
23836    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
23837      SDValue BTCC;
23838      if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
23839        CC = BTCC;
23840        Cond = BT;
23841        AddTest = false;
23842      }
23843    }
23844  }
23845
23846  if (AddTest) {
23847    CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
23848    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
23849  }
23850
23851  // a <  b ? -1 :  0 -> RES = ~setcc_carry
23852  // a <  b ?  0 : -1 -> RES = setcc_carry
23853  // a >= b ? -1 :  0 -> RES = setcc_carry
23854  // a >= b ?  0 : -1 -> RES = ~setcc_carry
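  // The SETCC_CARRY below behaves like an 'sbb reg, reg', e.g. (illustrative):
  //   cmp  eax, ebx    ; CF is set iff a < b (unsigned)
  //   sbb  ecx, ecx    ; ecx = CF ? -1 : 0
  // with an optional NOT to produce the inverted form.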
23855  if (Cond.getOpcode() == X86ISD::SUB) {
23856    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
23857
23858    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
23859        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23860        (isNullConstant(Op1) || isNullConstant(Op2))) {
23861      SDValue Res =
23862          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
23863                      DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
23864      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
23865        return DAG.getNOT(DL, Res, Res.getValueType());
23866      return Res;
23867    }
23868  }
23869
23870  // X86 doesn't have an i8 cmov. If both operands are the result of a
23871  // truncate, widen the cmov and push the truncate through. This avoids
23872  // introducing a new branch during isel and doesn't add any extensions.
23873  if (Op.getValueType() == MVT::i8 &&
23874      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
23875    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
23876    if (T1.getValueType() == T2.getValueType() &&
23877        // Exclude CopyFromReg to avoid partial register stalls.
23878        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
23879      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
23880                                 CC, Cond);
23881      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
23882    }
23883  }
23884
23885  // Or finally, promote i8 cmovs if we have CMOV,
23886  //                 or i16 cmovs if it won't prevent folding a load.
23887  // FIXME: we should not limit promotion of the i8 case to only when the
23888  //        CMOV is legal, but EmitLoweredSelect() cannot deal with these
23889  //        extensions being inserted between two CMOVs (the i16 case too).
23890  //        https://bugs.llvm.org/show_bug.cgi?id=40974
23891  if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
23892      (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
23893       !MayFoldLoad(Op2))) {
23894    Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
23895    Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
23896    SDValue Ops[] = { Op2, Op1, CC, Cond };
23897    SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
23898    return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
23899  }
23900
23901  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
23902  // the condition is true.
23903  SDValue Ops[] = { Op2, Op1, CC, Cond };
23904  return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
23905}
23906
23907static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
23908                                     const X86Subtarget &Subtarget,
23909                                     SelectionDAG &DAG) {
23910  MVT VT = Op->getSimpleValueType(0);
23911  SDValue In = Op->getOperand(0);
23912  MVT InVT = In.getSimpleValueType();
23913  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
23914  MVT VTElt = VT.getVectorElementType();
23915  SDLoc dl(Op);
23916
23917  unsigned NumElts = VT.getVectorNumElements();
23918
23919  // Extend VT if the scalar type is i8/i16 and BWI is not supported.
23920  MVT ExtVT = VT;
23921  if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
23922    // If v16i32 is to be avoided, we'll need to split and concatenate.
23923    if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
23924      return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
23925
23926    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
23927  }
23928
23929  // Widen to 512-bits if VLX is not supported.
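  // E.g. a v8i1 -> v8i32 extend without VLX is widened to v16i1 -> v16i32 here
  // and the low v8i32 subvector is extracted again at the end.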
23930  MVT WideVT = ExtVT;
23931  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
23932    NumElts *= 512 / ExtVT.getSizeInBits();
23933    InVT = MVT::getVectorVT(MVT::i1, NumElts);
23934    In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
23935                     In, DAG.getIntPtrConstant(0, dl));
23936    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
23937  }
23938
23939  SDValue V;
23940  MVT WideEltVT = WideVT.getVectorElementType();
23941  if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
23942      (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
23943    V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
23944  } else {
23945    SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
23946    SDValue Zero = DAG.getConstant(0, dl, WideVT);
23947    V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
23948  }
23949
23950  // Truncate if we had to extend i16/i8 above.
23951  if (VT != ExtVT) {
23952    WideVT = MVT::getVectorVT(VTElt, NumElts);
23953    V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
23954  }
23955
23956  // Extract back to 128/256-bit if we widened.
23957  if (WideVT != VT)
23958    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
23959                    DAG.getIntPtrConstant(0, dl));
23960
23961  return V;
23962}
23963
23964static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
23965                               SelectionDAG &DAG) {
23966  SDValue In = Op->getOperand(0);
23967  MVT InVT = In.getSimpleValueType();
23968
23969  if (InVT.getVectorElementType() == MVT::i1)
23970    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
23971
23972  assert(Subtarget.hasAVX() && "Expected AVX support");
23973  return LowerAVXExtend(Op, DAG, Subtarget);
23974}
23975
23976// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
23977// For sign extend this needs to handle all vector sizes and SSE4.1 and
23978// non-SSE4.1 targets. For zero extend this should only handle inputs of
23979// MVT::v64i8 when BWI is not supported, but AVX512 is.
23980static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
23981                                        const X86Subtarget &Subtarget,
23982                                        SelectionDAG &DAG) {
23983  SDValue In = Op->getOperand(0);
23984  MVT VT = Op->getSimpleValueType(0);
23985  MVT InVT = In.getSimpleValueType();
23986
23987  MVT SVT = VT.getVectorElementType();
23988  MVT InSVT = InVT.getVectorElementType();
23989  assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
23990
23991  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
23992    return SDValue();
23993  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
23994    return SDValue();
23995  if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
23996      !(VT.is256BitVector() && Subtarget.hasAVX()) &&
23997      !(VT.is512BitVector() && Subtarget.hasAVX512()))
23998    return SDValue();
23999
24000  SDLoc dl(Op);
24001  unsigned Opc = Op.getOpcode();
24002  unsigned NumElts = VT.getVectorNumElements();
24003
24004  // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24005  // For 512-bit vectors, we need 128-bits or 256-bits.
24006  if (InVT.getSizeInBits() > 128) {
24007    // The input needs to have at least as many elements as the output, and
24008    // be at least 128 bits wide.
24009    int InSize = InSVT.getSizeInBits() * NumElts;
24010    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24011    InVT = In.getSimpleValueType();
24012  }
24013
24014  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
24015  // results, so those are legal and shouldn't occur here. AVX2/AVX512
24016  // pmovsx* instructions still need to be handled here for 256/512-bit results.
24017  if (Subtarget.hasInt256()) {
24018    assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24019
24020    if (InVT.getVectorNumElements() != NumElts)
24021      return DAG.getNode(Op.getOpcode(), dl, VT, In);
24022
24023    // FIXME: Apparently we create inreg operations that could be regular
24024    // extends.
24025    unsigned ExtOpc =
24026        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24027                                             : ISD::ZERO_EXTEND;
24028    return DAG.getNode(ExtOpc, dl, VT, In);
24029  }
24030
24031  // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24032  if (Subtarget.hasAVX()) {
24033    assert(VT.is256BitVector() && "256-bit vector expected");
24034    MVT HalfVT = VT.getHalfNumVectorElementsVT();
24035    int HalfNumElts = HalfVT.getVectorNumElements();
24036
24037    unsigned NumSrcElts = InVT.getVectorNumElements();
24038    SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24039    for (int i = 0; i != HalfNumElts; ++i)
24040      HiMask[i] = HalfNumElts + i;
24041
24042    SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24043    SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24044    Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24045    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24046  }
24047
24048  // We should only get here for sign extend.
24049  assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24050  assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24051
24052  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24053  SDValue Curr = In;
24054  SDValue SignExt = Curr;
24055
24056  // As SRAI is only available on i16/i32 types, we expand only up to i32
24057  // and handle i64 separately.
24058  if (InVT != MVT::v4i32) {
24059    MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24060
24061    unsigned DestWidth = DestVT.getScalarSizeInBits();
24062    unsigned Scale = DestWidth / InSVT.getSizeInBits();
24063
24064    unsigned InNumElts = InVT.getVectorNumElements();
24065    unsigned DestElts = DestVT.getVectorNumElements();
24066
24067    // Build a shuffle mask that takes each input element and places it in the
24068    // MSBs of the new element size.
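    // E.g. for a v16i8 -> v4i32 sign extend, Scale is 4 and the mask becomes
    // {-1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3}.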
24069    SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24070    for (unsigned i = 0; i != DestElts; ++i)
24071      Mask[i * Scale + (Scale - 1)] = i;
24072
24073    Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24074    Curr = DAG.getBitcast(DestVT, Curr);
24075
24076    unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24077    SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24078                          DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24079  }
24080
24081  if (VT == MVT::v2i64) {
24082    assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24083    SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24084    SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24085    SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24086    SignExt = DAG.getBitcast(VT, SignExt);
24087  }
24088
24089  return SignExt;
24090}
24091
24092static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24093                                SelectionDAG &DAG) {
24094  MVT VT = Op->getSimpleValueType(0);
24095  SDValue In = Op->getOperand(0);
24096  MVT InVT = In.getSimpleValueType();
24097  SDLoc dl(Op);
24098
24099  if (InVT.getVectorElementType() == MVT::i1)
24100    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24101
24102  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24103  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24104         "Expected same number of elements");
24105  assert((VT.getVectorElementType() == MVT::i16 ||
24106          VT.getVectorElementType() == MVT::i32 ||
24107          VT.getVectorElementType() == MVT::i64) &&
24108         "Unexpected element type");
24109  assert((InVT.getVectorElementType() == MVT::i8 ||
24110          InVT.getVectorElementType() == MVT::i16 ||
24111          InVT.getVectorElementType() == MVT::i32) &&
24112         "Unexpected element type");
24113
24114  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24115    assert(InVT == MVT::v32i8 && "Unexpected VT!");
24116    return splitVectorIntUnary(Op, DAG);
24117  }
24118
24119  if (Subtarget.hasInt256())
24120    return Op;
24121
24122  // Optimize vectors in AVX mode:
24123  // sign extend v8i16 to v8i32 and
24124  //             v4i32 to v4i64.
24125  //
24126  // Divide the input vector into two parts;
24127  // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
24128  // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
24129  // then concat the vectors back to the original VT.
24130  MVT HalfVT = VT.getHalfNumVectorElementsVT();
24131  SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24132
24133  unsigned NumElems = InVT.getVectorNumElements();
24134  SmallVector<int,8> ShufMask(NumElems, -1);
24135  for (unsigned i = 0; i != NumElems/2; ++i)
24136    ShufMask[i] = i + NumElems/2;
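  // (For v8i16 the mask is now {4, 5, 6, 7, -1, -1, -1, -1}.)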
24137
24138  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24139  OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24140
24141  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24142}
24143
24144/// Change a vector store into a pair of half-size vector stores.
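/// For example, a 256-bit v8f32 store becomes a v4f32 store at the original
/// address and a second v4f32 store 16 bytes past it.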
24145static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24146  SDValue StoredVal = Store->getValue();
24147  assert((StoredVal.getValueType().is256BitVector() ||
24148          StoredVal.getValueType().is512BitVector()) &&
24149         "Expecting 256/512-bit op");
24150
24151  // Splitting volatile memory ops is not allowed unless the operation was not
24152  // legal to begin with. Assume the input store is legal (this transform is
24153  // only used for targets with AVX). Note: It is possible that we have an
24154  // illegal type like v2i128, and so we could allow splitting a volatile store
24155  // in that case if that is important.
24156  if (!Store->isSimple())
24157    return SDValue();
24158
24159  SDLoc DL(Store);
24160  SDValue Value0, Value1;
24161  std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24162  unsigned HalfOffset = Value0.getValueType().getStoreSize();
24163  SDValue Ptr0 = Store->getBasePtr();
24164  SDValue Ptr1 =
24165      DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
24166  SDValue Ch0 =
24167      DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24168                   Store->getOriginalAlign(),
24169                   Store->getMemOperand()->getFlags());
24170  SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24171                             Store->getPointerInfo().getWithOffset(HalfOffset),
24172                             Store->getOriginalAlign(),
24173                             Store->getMemOperand()->getFlags());
24174  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24175}
24176
24177/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24178/// type.
24179static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24180                                    SelectionDAG &DAG) {
24181  SDValue StoredVal = Store->getValue();
24182  assert(StoreVT.is128BitVector() &&
24183         StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24184  StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24185
24186  // Splitting volatile memory ops is not allowed unless the operation was not
24187  // legal to begin with. We are assuming the input op is legal (this transform
24188  // is only used for targets with AVX).
24189  if (!Store->isSimple())
24190    return SDValue();
24191
24192  MVT StoreSVT = StoreVT.getScalarType();
24193  unsigned NumElems = StoreVT.getVectorNumElements();
24194  unsigned ScalarSize = StoreSVT.getStoreSize();
24195
24196  SDLoc DL(Store);
24197  SmallVector<SDValue, 4> Stores;
24198  for (unsigned i = 0; i != NumElems; ++i) {
24199    unsigned Offset = i * ScalarSize;
24200    SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24201                                           TypeSize::Fixed(Offset), DL);
24202    SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24203                              DAG.getIntPtrConstant(i, DL));
24204    SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24205                              Store->getPointerInfo().getWithOffset(Offset),
24206                              Store->getOriginalAlign(),
24207                              Store->getMemOperand()->getFlags());
24208    Stores.push_back(Ch);
24209  }
24210  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24211}
24212
24213static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24214                          SelectionDAG &DAG) {
24215  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24216  SDLoc dl(St);
24217  SDValue StoredVal = St->getValue();
24218
24219  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24220  if (StoredVal.getValueType().isVector() &&
24221      StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24222    unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24223    assert(NumElts <= 8 && "Unexpected VT");
24224    assert(!St->isTruncatingStore() && "Expected non-truncating store");
24225    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24226           "Expected AVX512F without AVX512DQI");
24227
24228    // We must pad with zeros to ensure we store zeros to any unused bits.
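    // E.g. a v4i1 value is widened to v16i1, bitcast to i16 and truncated to
    // i8; the zero-extend-in-reg below then clears the top 4 bits of that byte
    // before it is stored.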
24229    StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24230                            DAG.getUNDEF(MVT::v16i1), StoredVal,
24231                            DAG.getIntPtrConstant(0, dl));
24232    StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24233    StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24234    // Make sure we store zeros in the extra bits.
24235    if (NumElts < 8)
24236      StoredVal = DAG.getZeroExtendInReg(
24237          StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24238
24239    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24240                        St->getPointerInfo(), St->getOriginalAlign(),
24241                        St->getMemOperand()->getFlags());
24242  }
24243
24244  if (St->isTruncatingStore())
24245    return SDValue();
24246
24247  // If this is a 256-bit store of concatenated ops, we are better off splitting
24248  // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24249  // and each half can execute independently. Some cores would split the op into
24250  // halves anyway, so the concat (vinsertf128) is purely an extra op.
24251  MVT StoreVT = StoredVal.getSimpleValueType();
24252  if (StoreVT.is256BitVector() ||
24253      ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24254       !Subtarget.hasBWI())) {
24255    SmallVector<SDValue, 4> CatOps;
24256    if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
24257      return splitVectorStore(St, DAG);
24258    return SDValue();
24259  }
24260
24261  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24262  assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
24263         "Unexpected VT");
24264  assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24265             TargetLowering::TypeWidenVector && "Unexpected type action!");
24266
24267  EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24268  StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24269                          DAG.getUNDEF(StoreVT));
24270
24271  if (Subtarget.hasSSE2()) {
24272    // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24273    // and store it.
24274    MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24275    MVT CastVT = MVT::getVectorVT(StVT, 2);
24276    StoredVal = DAG.getBitcast(CastVT, StoredVal);
24277    StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24278                            DAG.getIntPtrConstant(0, dl));
24279
24280    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24281                        St->getPointerInfo(), St->getOriginalAlign(),
24282                        St->getMemOperand()->getFlags());
24283  }
24284  assert(Subtarget.hasSSE1() && "Expected SSE");
24285  SDVTList Tys = DAG.getVTList(MVT::Other);
24286  SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24287  return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24288                                 St->getMemOperand());
24289}
24290
24291// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24292// may emit an illegal shuffle but the expansion is still better than scalar
24293// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
24294// we'll emit a shuffle and an arithmetic shift.
24295// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24296// TODO: It is possible to support ZExt by zeroing the undef values during
24297// the shuffle phase or after the shuffle.
24298static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24299                         SelectionDAG &DAG) {
24300  MVT RegVT = Op.getSimpleValueType();
24301  assert(RegVT.isVector() && "We only custom lower vector loads.");
24302  assert(RegVT.isInteger() &&
24303         "We only custom lower integer vector loads.");
24304
24305  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24306  SDLoc dl(Ld);
24307
24308  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24309  if (RegVT.getVectorElementType() == MVT::i1) {
24310    assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24311    assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24312    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24313           "Expected AVX512F without AVX512DQI");
24314
24315    SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24316                                Ld->getPointerInfo(), Ld->getOriginalAlign(),
24317                                Ld->getMemOperand()->getFlags());
24318
24319    // Replace chain users with the new chain.
24320    assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24321
24322    SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24323    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24324                      DAG.getBitcast(MVT::v16i1, Val),
24325                      DAG.getIntPtrConstant(0, dl));
24326    return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24327  }
24328
24329  return SDValue();
24330}
24331
24332/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24333/// each of which has no other use apart from the AND / OR.
24334static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24335  Opc = Op.getOpcode();
24336  if (Opc != ISD::OR && Opc != ISD::AND)
24337    return false;
24338  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24339          Op.getOperand(0).hasOneUse() &&
24340          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24341          Op.getOperand(1).hasOneUse());
24342}
24343
24344SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24345  SDValue Chain = Op.getOperand(0);
24346  SDValue Cond  = Op.getOperand(1);
24347  SDValue Dest  = Op.getOperand(2);
24348  SDLoc dl(Op);
24349
24350  if (Cond.getOpcode() == ISD::SETCC &&
24351      Cond.getOperand(0).getValueType() != MVT::f128) {
24352    SDValue LHS = Cond.getOperand(0);
24353    SDValue RHS = Cond.getOperand(1);
24354    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24355
24356    // Special case for
24357    // setcc([su]{add,sub,mul}o == 0)
24358    // setcc([su]{add,sub,mul}o != 1)
24359    if (ISD::isOverflowIntrOpRes(LHS) &&
24360        (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24361        (isNullConstant(RHS) || isOneConstant(RHS))) {
24362      SDValue Value, Overflow;
24363      X86::CondCode X86Cond;
24364      std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24365
24366      if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24367        X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24368
24369      SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24370      return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24371                         Overflow);
24372    }
24373
24374    if (LHS.getSimpleValueType().isInteger()) {
24375      SDValue CCVal;
24376      SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24377      return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24378                         EFLAGS);
24379    }
24380
24381    if (CC == ISD::SETOEQ) {
24382      // For FCMP_OEQ, we can emit
24383      // two branches instead of an explicit AND instruction with a
24384      // separate test. However, we only do this if this block doesn't
24385      // have a fall-through edge, because this requires an explicit
24386      // jmp when the condition is false.
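      // Illustratively, for f32 operands the final sequence is roughly
      // 'ucomiss; jne false_bb; jp false_bb', falling through (or jumping) to
      // the original destination.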
24387      if (Op.getNode()->hasOneUse()) {
24388        SDNode *User = *Op.getNode()->use_begin();
24389        // Look for an unconditional branch following this conditional branch.
24390        // We need this because we need to reverse the successors in order
24391        // to implement FCMP_OEQ.
24392        if (User->getOpcode() == ISD::BR) {
24393          SDValue FalseBB = User->getOperand(1);
24394          SDNode *NewBR =
24395            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24396          assert(NewBR == User);
24397          (void)NewBR;
24398          Dest = FalseBB;
24399
24400          SDValue Cmp =
24401              DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24402          SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24403          Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24404                              CCVal, Cmp);
24405          CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24406          return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24407                             Cmp);
24408        }
24409      }
24410    } else if (CC == ISD::SETUNE) {
24411      // For FCMP_UNE, we can emit
24412      // two branches instead of an explicit OR instruction with a
24413      // separate test.
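      // Illustratively: 'ucomiss; jne dest; jp dest', since UNE is true when
      // the operands compare not-equal or unordered.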
24414      SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24415      SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24416      Chain =
24417          DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24418      CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24419      return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24420                         Cmp);
24421    } else {
24422      X86::CondCode X86Cond =
24423          TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24424      SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24425      SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24426      return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24427                         Cmp);
24428    }
24429  }
24430
24431  if (ISD::isOverflowIntrOpRes(Cond)) {
24432    SDValue Value, Overflow;
24433    X86::CondCode X86Cond;
24434    std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24435
24436    SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24437    return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24438                       Overflow);
24439  }
24440
24441  // Look past the truncate if the high bits are known zero.
24442  if (isTruncWithZeroHighBitsInput(Cond, DAG))
24443    Cond = Cond.getOperand(0);
24444
24445  EVT CondVT = Cond.getValueType();
24446
24447  // Add an AND with 1 if we don't already have one.
24448  if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24449    Cond =
24450        DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24451
24452  SDValue LHS = Cond;
24453  SDValue RHS = DAG.getConstant(0, dl, CondVT);
24454
24455  SDValue CCVal;
24456  SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24457  return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24458                     EFLAGS);
24459}
24460
24461// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24462// Calls to _alloca are needed to probe the stack when allocating more than 4k
24463// bytes in one go. Touching the stack at 4K increments is necessary to ensure
24464// that the guard pages used by the OS virtual memory manager are allocated in
24465// correct sequence.
24466SDValue
24467X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24468                                           SelectionDAG &DAG) const {
24469  MachineFunction &MF = DAG.getMachineFunction();
24470  bool SplitStack = MF.shouldSplitStack();
24471  bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24472  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24473               SplitStack || EmitStackProbeCall;
24474  SDLoc dl(Op);
24475
24476  // Get the inputs.
24477  SDNode *Node = Op.getNode();
24478  SDValue Chain = Op.getOperand(0);
24479  SDValue Size  = Op.getOperand(1);
24480  MaybeAlign Alignment(Op.getConstantOperandVal(2));
24481  EVT VT = Node->getValueType(0);
24482
24483  // Chain the dynamic stack allocation so that it doesn't modify the stack
24484  // pointer when other instructions are using the stack.
24485  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24486
24487  bool Is64Bit = Subtarget.is64Bit();
24488  MVT SPTy = getPointerTy(DAG.getDataLayout());
24489
24490  SDValue Result;
24491  if (!Lower) {
24492    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24493    Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24494    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24495                    " not tell us which reg is the stack pointer!");
24496
24497    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24498    const Align StackAlign = TFI.getStackAlign();
24499    if (hasInlineStackProbe(MF)) {
24500      MachineRegisterInfo &MRI = MF.getRegInfo();
24501
24502      const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24503      Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24504      Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24505      Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24506                           DAG.getRegister(Vreg, SPTy));
24507    } else {
24508      SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24509      Chain = SP.getValue(1);
24510      Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24511    }
24512    if (Alignment && *Alignment > StackAlign)
24513      Result =
24514          DAG.getNode(ISD::AND, dl, VT, Result,
24515                      DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24516    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24517  } else if (SplitStack) {
24518    MachineRegisterInfo &MRI = MF.getRegInfo();
24519
24520    if (Is64Bit) {
24521      // The 64-bit implementation of segmented stacks needs to clobber both r10
24522      // and r11. This makes it impossible to use it along with nested parameters.
24523      const Function &F = MF.getFunction();
24524      for (const auto &A : F.args()) {
24525        if (A.hasNestAttr())
24526          report_fatal_error("Cannot use segmented stacks with functions that "
24527                             "have nested arguments.");
24528      }
24529    }
24530
24531    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24532    Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24533    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24534    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24535                                DAG.getRegister(Vreg, SPTy));
24536  } else {
24537    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24538    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
24539    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
24540
24541    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24542    Register SPReg = RegInfo->getStackRegister();
24543    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24544    Chain = SP.getValue(1);
24545
24546    if (Alignment) {
24547      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24548                       DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24549      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24550    }
24551
24552    Result = SP;
24553  }
24554
24555  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
24556                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
24557
24558  SDValue Ops[2] = {Result, Chain};
24559  return DAG.getMergeValues(Ops, dl);
24560}
24561
24562SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24563  MachineFunction &MF = DAG.getMachineFunction();
24564  auto PtrVT = getPointerTy(MF.getDataLayout());
24565  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24566
24567  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24568  SDLoc DL(Op);
24569
24570  if (!Subtarget.is64Bit() ||
24571      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24572    // vastart just stores the address of the VarArgsFrameIndex slot into the
24573    // memory location argument.
24574    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24575    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24576                        MachinePointerInfo(SV));
24577  }
24578
24579  // __va_list_tag:
24580  //   gp_offset         (0 .. 6 * 8)
24581  //   fp_offset         (48 .. 48 + 8 * 16)
24582  //   overflow_arg_area (points to parameters passed in memory)
24583  //   reg_save_area
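  // Roughly the following C layout (shown for reference):
  //   struct __va_list_tag {
  //     unsigned int gp_offset;
  //     unsigned int fp_offset;
  //     void *overflow_arg_area;
  //     void *reg_save_area;
  //   };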
24584  SmallVector<SDValue, 8> MemOps;
24585  SDValue FIN = Op.getOperand(1);
24586  // Store gp_offset
24587  SDValue Store = DAG.getStore(
24588      Op.getOperand(0), DL,
24589      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24590      MachinePointerInfo(SV));
24591  MemOps.push_back(Store);
24592
24593  // Store fp_offset
24594  FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
24595  Store = DAG.getStore(
24596      Op.getOperand(0), DL,
24597      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
24598      MachinePointerInfo(SV, 4));
24599  MemOps.push_back(Store);
24600
24601  // Store ptr to overflow_arg_area
24602  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
24603  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24604  Store =
24605      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
24606  MemOps.push_back(Store);
24607
24608  // Store ptr to reg_save_area.
24609  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
24610      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
24611  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
24612  Store = DAG.getStore(
24613      Op.getOperand(0), DL, RSFIN, FIN,
24614      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
24615  MemOps.push_back(Store);
24616  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
24617}
24618
24619SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
24620  assert(Subtarget.is64Bit() &&
24621         "LowerVAARG only handles 64-bit va_arg!");
24622  assert(Op.getNumOperands() == 4);
24623
24624  MachineFunction &MF = DAG.getMachineFunction();
24625  if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
24626    // The Win64 ABI uses char* instead of a structure.
24627    return DAG.expandVAArg(Op.getNode());
24628
24629  SDValue Chain = Op.getOperand(0);
24630  SDValue SrcPtr = Op.getOperand(1);
24631  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24632  unsigned Align = Op.getConstantOperandVal(3);
24633  SDLoc dl(Op);
24634
24635  EVT ArgVT = Op.getNode()->getValueType(0);
24636  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24637  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
24638  uint8_t ArgMode;
24639
24640  // Decide which area this value should be read from.
24641  // TODO: Implement the AMD64 ABI in its entirety. This simple
24642  // selection mechanism works only for the basic types.
24643  assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
24644  if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
24645    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
24646  } else {
24647    assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
24648           "Unhandled argument type in LowerVAARG");
24649    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
24650  }
24651
24652  if (ArgMode == 2) {
24653    // Sanity Check: Make sure using fp_offset makes sense.
24654    assert(!Subtarget.useSoftFloat() &&
24655           !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
24656           Subtarget.hasSSE1());
24657  }
24658
24659  // Insert a VAARG node into the DAG.
24660  // VAARG returns two values: the variable argument address and the chain.
24661  SDValue InstOps[] = {Chain, SrcPtr,
24662                       DAG.getTargetConstant(ArgSize, dl, MVT::i32),
24663                       DAG.getTargetConstant(ArgMode, dl, MVT::i8),
24664                       DAG.getTargetConstant(Align, dl, MVT::i32)};
24665  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
24666  SDValue VAARG = DAG.getMemIntrinsicNode(
24667      Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
24668      VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
24669      /*Alignment=*/None,
24670      MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
24671  Chain = VAARG.getValue(1);
24672
24673  // Load the next argument and return it
24674  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
24675}
24676
24677static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
24678                           SelectionDAG &DAG) {
24679  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
24680  // where a va_list is still an i8*.
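  // With that layout the copy below is 4 + 4 + 8 + 8 = 24 bytes for LP64 and
  // 4 + 4 + 4 + 4 = 16 bytes for the x32 ABI.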
24681  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
24682  if (Subtarget.isCallingConvWin64(
24683        DAG.getMachineFunction().getFunction().getCallingConv()))
24684    // Probably a Win64 va_copy.
24685    return DAG.expandVACopy(Op.getNode());
24686
24687  SDValue Chain = Op.getOperand(0);
24688  SDValue DstPtr = Op.getOperand(1);
24689  SDValue SrcPtr = Op.getOperand(2);
24690  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
24691  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
24692  SDLoc DL(Op);
24693
24694  return DAG.getMemcpy(
24695      Chain, DL, DstPtr, SrcPtr,
24696      DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
24697      Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
24698      false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
24699}
24700
24701// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
24702static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
24703  switch (Opc) {
24704  case ISD::SHL:
24705  case X86ISD::VSHL:
24706  case X86ISD::VSHLI:
24707    return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
24708  case ISD::SRL:
24709  case X86ISD::VSRL:
24710  case X86ISD::VSRLI:
24711    return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
24712  case ISD::SRA:
24713  case X86ISD::VSRA:
24714  case X86ISD::VSRAI:
24715    return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
24716  }
24717  llvm_unreachable("Unknown target vector shift node");
24718}
24719
24720/// Handle vector element shifts where the shift amount is a constant.
24721/// Takes immediate version of shift as input.
24722static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
24723                                          SDValue SrcOp, uint64_t ShiftAmt,
24724                                          SelectionDAG &DAG) {
24725  MVT ElementType = VT.getVectorElementType();
24726
24727  // Bitcast the source vector to the output type; this is mainly necessary
24728  // for vXi8/vXi64 shifts.
24729  if (VT != SrcOp.getSimpleValueType())
24730    SrcOp = DAG.getBitcast(VT, SrcOp);
24731
24732  // Fold this packed shift into its first operand if ShiftAmt is 0.
24733  if (ShiftAmt == 0)
24734    return SrcOp;
24735
24736  // Check for ShiftAmt >= element width
24737  if (ShiftAmt >= ElementType.getSizeInBits()) {
24738    if (Opc == X86ISD::VSRAI)
24739      ShiftAmt = ElementType.getSizeInBits() - 1;
24740    else
24741      return DAG.getConstant(0, dl, VT);
24742  }
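  // E.g. shifting v4i32 lanes right by 40: a logical shift folds to zero,
  // while an arithmetic shift is clamped to 31 and still yields the sign splat.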
24743
24744  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
24745         && "Unknown target vector shift-by-constant node");
24746
24747  // Fold this packed vector shift into a build vector if SrcOp is a
24748  // vector of Constants or UNDEFs.
24749  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
24750    SmallVector<SDValue, 8> Elts;
24751    unsigned NumElts = SrcOp->getNumOperands();
24752
24753    switch (Opc) {
24754    default: llvm_unreachable("Unknown opcode!");
24755    case X86ISD::VSHLI:
24756      for (unsigned i = 0; i != NumElts; ++i) {
24757        SDValue CurrentOp = SrcOp->getOperand(i);
24758        if (CurrentOp->isUndef()) {
24759          // Must produce 0s in the correct bits.
24760          Elts.push_back(DAG.getConstant(0, dl, ElementType));
24761          continue;
24762        }
24763        auto *ND = cast<ConstantSDNode>(CurrentOp);
24764        const APInt &C = ND->getAPIntValue();
24765        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
24766      }
24767      break;
24768    case X86ISD::VSRLI:
24769      for (unsigned i = 0; i != NumElts; ++i) {
24770        SDValue CurrentOp = SrcOp->getOperand(i);
24771        if (CurrentOp->isUndef()) {
24772          // Must produce 0s in the correct bits.
24773          Elts.push_back(DAG.getConstant(0, dl, ElementType));
24774          continue;
24775        }
24776        auto *ND = cast<ConstantSDNode>(CurrentOp);
24777        const APInt &C = ND->getAPIntValue();
24778        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
24779      }
24780      break;
24781    case X86ISD::VSRAI:
24782      for (unsigned i = 0; i != NumElts; ++i) {
24783        SDValue CurrentOp = SrcOp->getOperand(i);
24784        if (CurrentOp->isUndef()) {
24785          // All shifted in bits must be the same so use 0.
24786          Elts.push_back(DAG.getConstant(0, dl, ElementType));
24787          continue;
24788        }
24789        auto *ND = cast<ConstantSDNode>(CurrentOp);
24790        const APInt &C = ND->getAPIntValue();
24791        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
24792      }
24793      break;
24794    }
24795
24796    return DAG.getBuildVector(VT, dl, Elts);
24797  }
24798
24799  return DAG.getNode(Opc, dl, VT, SrcOp,
24800                     DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
24801}
24802
24803/// Handle vector element shifts where the shift amount may or may not be a
24804/// constant. Takes immediate version of shift as input.
24805static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
24806                                   SDValue SrcOp, SDValue ShAmt,
24807                                   const X86Subtarget &Subtarget,
24808                                   SelectionDAG &DAG) {
24809  MVT SVT = ShAmt.getSimpleValueType();
24810  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
24811
24812  // Catch shift-by-constant.
24813  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
24814    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
24815                                      CShAmt->getZExtValue(), DAG);
24816
24817  // Change opcode to non-immediate version.
24818  Opc = getTargetVShiftUniformOpcode(Opc, true);
24819
24820  // Need to build a vector containing shift amount.
24821  // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
24822  // +====================+============+=======================================+
24823  // | ShAmt is           | HasSSE4.1? | Construct ShAmt vector as             |
24824  // +====================+============+=======================================+
24825  // | i64                | Yes, No    | Use ShAmt as lowest elt               |
24826  // | i32                | Yes        | zero-extend in-reg                    |
24827  // | (i32 zext(i16/i8)) | Yes        | zero-extend in-reg                    |
24828  // | (i32 zext(i16/i8)) | No         | byte-shift-in-reg                     |
24829  // | i16/i32            | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
24830  // +====================+============+=======================================+
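  //
  // The byte-shift-in-reg case below moves the i8/i16 amount to the top of the
  // 128-bit register with VSHLDQ and back down with VSRLDQ, leaving it
  // zero-extended in the low 64 bits as the packed shift expects.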
24831
24832  if (SVT == MVT::i64)
24833    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
24834  else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
24835           ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24836           (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
24837            ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
24838    ShAmt = ShAmt.getOperand(0);
24839    MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
24840    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
24841    if (Subtarget.hasSSE41())
24842      ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
24843                          MVT::v2i64, ShAmt);
24844    else {
24845      SDValue ByteShift = DAG.getTargetConstant(
24846          (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
24847      ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
24848      ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
24849                          ByteShift);
24850      ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
24851                          ByteShift);
24852    }
24853  } else if (Subtarget.hasSSE41() &&
24854             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24855    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
24856    ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
24857                        MVT::v2i64, ShAmt);
24858  } else {
24859    SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
24860                        DAG.getUNDEF(SVT)};
24861    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
24862  }
24863
24864  // The return type has to be a 128-bit type with the same element
24865  // type as the input type.
24866  MVT EltVT = VT.getVectorElementType();
24867  MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
24868
24869  ShAmt = DAG.getBitcast(ShVT, ShAmt);
24870  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
24871}
24872
24873/// Return Mask with the necessary casting or extending
24874/// for \p Mask according to \p MaskVT when lowering masking intrinsics
24875static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
24876                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
24877                           const SDLoc &dl) {
24878
24879  if (isAllOnesConstant(Mask))
24880    return DAG.getConstant(1, dl, MaskVT);
24881  if (X86::isZeroNode(Mask))
24882    return DAG.getConstant(0, dl, MaskVT);
24883
24884  assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
24885
24886  if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
24887    assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
24888    assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
24889    // In 32-bit mode a bitcast of i64 is illegal, so split it into i32 halves.
24890    SDValue Lo, Hi;
24891    Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
24892                        DAG.getConstant(0, dl, MVT::i32));
24893    Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
24894                        DAG.getConstant(1, dl, MVT::i32));
24895
24896    Lo = DAG.getBitcast(MVT::v32i1, Lo);
24897    Hi = DAG.getBitcast(MVT::v32i1, Hi);
24898
24899    return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
24900  } else {
24901    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
24902                                     Mask.getSimpleValueType().getSizeInBits());
24903    // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are extracted
24904    // by EXTRACT_SUBVECTOR.
24905    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
24906                       DAG.getBitcast(BitcastVT, Mask),
24907                       DAG.getIntPtrConstant(0, dl));
24908  }
24909}
24910
24911/// Return (and \p Op, \p Mask) for compare instructions or
24912/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
24913/// necessary casting or extending for \p Mask when lowering masking intrinsics
24914static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
24915                  SDValue PreservedSrc,
24916                  const X86Subtarget &Subtarget,
24917                  SelectionDAG &DAG) {
24918  MVT VT = Op.getSimpleValueType();
24919  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
24920  unsigned OpcodeSelect = ISD::VSELECT;
24921  SDLoc dl(Op);
24922
24923  if (isAllOnesConstant(Mask))
24924    return Op;
24925
24926  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24927
24928  if (PreservedSrc.isUndef())
24929    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
24930  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
24931}
24932
24933/// Creates an SDNode for a predicated scalar operation.
24934/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
24935/// The mask comes in as MVT::i8 and should be transformed
24936/// to MVT::v1i1 while lowering masking intrinsics.
24937/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
24938/// "X86select" instead of "vselect". We just can't create the "vselect" node
24939/// for a scalar instruction.
24940static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
24941                                    SDValue PreservedSrc,
24942                                    const X86Subtarget &Subtarget,
24943                                    SelectionDAG &DAG) {
24944
24945  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
24946    if (MaskConst->getZExtValue() & 0x1)
24947      return Op;
24948
24949  MVT VT = Op.getSimpleValueType();
24950  SDLoc dl(Op);
24951
  assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
24953  SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
24954                              DAG.getBitcast(MVT::v8i1, Mask),
24955                              DAG.getIntPtrConstant(0, dl));
24956  if (Op.getOpcode() == X86ISD::FSETCCM ||
24957      Op.getOpcode() == X86ISD::FSETCCM_SAE ||
24958      Op.getOpcode() == X86ISD::VFPCLASSS)
24959    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
24960
24961  if (PreservedSrc.isUndef())
24962    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
24963  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
24964}
24965
24966static int getSEHRegistrationNodeSize(const Function *Fn) {
24967  if (!Fn->hasPersonalityFn())
24968    report_fatal_error(
24969        "querying registration node size for function without personality");
24970  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
24971  // WinEHStatePass for the full struct definition.
24972  switch (classifyEHPersonality(Fn->getPersonalityFn())) {
24973  case EHPersonality::MSVC_X86SEH: return 24;
24974  case EHPersonality::MSVC_CXX: return 16;
24975  default: break;
24976  }
24977  report_fatal_error(
24978      "can only recover FP for 32-bit MSVC EH personality functions");
24979}
24980
24981/// When the MSVC runtime transfers control to us, either to an outlined
24982/// function or when returning to a parent frame after catching an exception, we
24983/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
24984/// Here's the math:
24985///   RegNodeBase = EntryEBP - RegNodeSize
24986///   ParentFP = RegNodeBase - ParentFrameOffset
24987/// Subtracting RegNodeSize takes us to the offset of the registration node, and
24988/// subtracting the offset (negative on x86) takes us back to the parent FP.
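///
/// Worked example (hypothetical offset): for a 32-bit MSVC SEH personality,
/// RegNodeSize is 24, so with a ParentFrameOffset of -8 this computes
///   ParentFP = (EntryEBP - 24) - (-8) = EntryEBP - 16.
/// On x64 no registration node is involved; we simply return
/// EntryEBP + ParentFrameOffset.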
24989static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
24990                                   SDValue EntryEBP) {
24991  MachineFunction &MF = DAG.getMachineFunction();
24992  SDLoc dl;
24993
24994  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24995  MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24996
24997  // It's possible that the parent function no longer has a personality function
24998  // if the exceptional code was optimized away, in which case we just return
24999  // the incoming EBP.
25000  if (!Fn->hasPersonalityFn())
25001    return EntryEBP;
25002
25003  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25004  // registration, or the .set_setframe offset.
25005  MCSymbol *OffsetSym =
25006      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25007          GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25008  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25009  SDValue ParentFrameOffset =
25010      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25011
25012  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25013  // prologue to RBP in the parent function.
25014  const X86Subtarget &Subtarget =
25015      static_cast<const X86Subtarget &>(DAG.getSubtarget());
25016  if (Subtarget.is64Bit())
25017    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25018
25019  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25020  // RegNodeBase = EntryEBP - RegNodeSize
25021  // ParentFP = RegNodeBase - ParentFrameOffset
25022  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25023                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
25024  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25025}
25026
25027SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25028                                                   SelectionDAG &DAG) const {
25029  // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25030  auto isRoundModeCurDirection = [](SDValue Rnd) {
25031    if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25032      return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25033
25034    return false;
25035  };
25036  auto isRoundModeSAE = [](SDValue Rnd) {
25037    if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25038      unsigned RC = C->getZExtValue();
25039      if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25040        // Clear the NO_EXC bit and check remaining bits.
25041        RC ^= X86::STATIC_ROUNDING::NO_EXC;
        // As a convenience we allow either no other bits set or exactly the
        // current-direction value.
25044        return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25045      }
25046    }
25047
25048    return false;
25049  };
25050  auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25051    if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25052      RC = C->getZExtValue();
25053      if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25054        // Clear the NO_EXC bit and check remaining bits.
25055        RC ^= X86::STATIC_ROUNDING::NO_EXC;
25056        return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25057               RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25058               RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25059               RC == X86::STATIC_ROUNDING::TO_ZERO;
25060      }
25061    }
25062
25063    return false;
25064  };
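  // Illustrative examples, assuming the X86::STATIC_ROUNDING encoding: an
  // immediate of (NO_EXC | TO_ZERO) satisfies isRoundModeSAEToX with
  // RC == TO_ZERO, plain NO_EXC satisfies isRoundModeSAE, and CUR_DIRECTION
  // satisfies only isRoundModeCurDirection.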
25065
25066  SDLoc dl(Op);
25067  unsigned IntNo = Op.getConstantOperandVal(0);
25068  MVT VT = Op.getSimpleValueType();
25069  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25070
25071  // Propagate flags from original node to transformed node(s).
25072  SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25073
25074  if (IntrData) {
25075    switch(IntrData->Type) {
25076    case INTR_TYPE_1OP: {
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check whether the intrinsic may have a non-default rounding
      // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
25080      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25081      if (IntrWithRoundingModeOpcode != 0) {
25082        SDValue Rnd = Op.getOperand(2);
25083        unsigned RC = 0;
25084        if (isRoundModeSAEToX(Rnd, RC))
25085          return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25086                             Op.getOperand(1),
25087                             DAG.getTargetConstant(RC, dl, MVT::i32));
25088        if (!isRoundModeCurDirection(Rnd))
25089          return SDValue();
25090      }
25091      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25092                         Op.getOperand(1));
25093    }
25094    case INTR_TYPE_1OP_SAE: {
25095      SDValue Sae = Op.getOperand(2);
25096
25097      unsigned Opc;
25098      if (isRoundModeCurDirection(Sae))
25099        Opc = IntrData->Opc0;
25100      else if (isRoundModeSAE(Sae))
25101        Opc = IntrData->Opc1;
25102      else
25103        return SDValue();
25104
25105      return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25106    }
25107    case INTR_TYPE_2OP: {
25108      SDValue Src2 = Op.getOperand(2);
25109
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check whether the intrinsic may have a non-default rounding
      // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
25113      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25114      if (IntrWithRoundingModeOpcode != 0) {
25115        SDValue Rnd = Op.getOperand(3);
25116        unsigned RC = 0;
25117        if (isRoundModeSAEToX(Rnd, RC))
25118          return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25119                             Op.getOperand(1), Src2,
25120                             DAG.getTargetConstant(RC, dl, MVT::i32));
25121        if (!isRoundModeCurDirection(Rnd))
25122          return SDValue();
25123      }
25124
25125      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25126                         Op.getOperand(1), Src2);
25127    }
25128    case INTR_TYPE_2OP_SAE: {
25129      SDValue Sae = Op.getOperand(3);
25130
25131      unsigned Opc;
25132      if (isRoundModeCurDirection(Sae))
25133        Opc = IntrData->Opc0;
25134      else if (isRoundModeSAE(Sae))
25135        Opc = IntrData->Opc1;
25136      else
25137        return SDValue();
25138
25139      return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25140                         Op.getOperand(2));
25141    }
25142    case INTR_TYPE_3OP:
25143    case INTR_TYPE_3OP_IMM8: {
25144      SDValue Src1 = Op.getOperand(1);
25145      SDValue Src2 = Op.getOperand(2);
25146      SDValue Src3 = Op.getOperand(3);
25147
25148      if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25149          Src3.getValueType() != MVT::i8) {
25150        Src3 = DAG.getTargetConstant(
25151            cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
25152      }
25153
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check whether the intrinsic may have a non-default rounding
      // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
25157      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25158      if (IntrWithRoundingModeOpcode != 0) {
25159        SDValue Rnd = Op.getOperand(4);
25160        unsigned RC = 0;
25161        if (isRoundModeSAEToX(Rnd, RC))
25162          return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25163                             Src1, Src2, Src3,
25164                             DAG.getTargetConstant(RC, dl, MVT::i32));
25165        if (!isRoundModeCurDirection(Rnd))
25166          return SDValue();
25167      }
25168
25169      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25170                         {Src1, Src2, Src3});
25171    }
25172    case INTR_TYPE_4OP_IMM8: {
25173      assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25174      SDValue Src4 = Op.getOperand(4);
25175      if (Src4.getValueType() != MVT::i8) {
25176        Src4 = DAG.getTargetConstant(
25177            cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
25178      }
25179
25180      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25181                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25182                         Src4);
25183    }
25184    case INTR_TYPE_1OP_MASK: {
25185      SDValue Src = Op.getOperand(1);
25186      SDValue PassThru = Op.getOperand(2);
25187      SDValue Mask = Op.getOperand(3);
      // We add the rounding mode to the node when
      //   - an RC opcode is specified (IntrData->Opc1 != 0) and
      //   - RC is not "current direction".
25191      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25192      if (IntrWithRoundingModeOpcode != 0) {
25193        SDValue Rnd = Op.getOperand(4);
25194        unsigned RC = 0;
25195        if (isRoundModeSAEToX(Rnd, RC))
25196          return getVectorMaskingNode(
25197              DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25198                          Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25199              Mask, PassThru, Subtarget, DAG);
25200        if (!isRoundModeCurDirection(Rnd))
25201          return SDValue();
25202      }
25203      return getVectorMaskingNode(
25204          DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25205          Subtarget, DAG);
25206    }
25207    case INTR_TYPE_1OP_MASK_SAE: {
25208      SDValue Src = Op.getOperand(1);
25209      SDValue PassThru = Op.getOperand(2);
25210      SDValue Mask = Op.getOperand(3);
25211      SDValue Rnd = Op.getOperand(4);
25212
25213      unsigned Opc;
25214      if (isRoundModeCurDirection(Rnd))
25215        Opc = IntrData->Opc0;
25216      else if (isRoundModeSAE(Rnd))
25217        Opc = IntrData->Opc1;
25218      else
25219        return SDValue();
25220
25221      return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25222                                  Subtarget, DAG);
25223    }
25224    case INTR_TYPE_SCALAR_MASK: {
25225      SDValue Src1 = Op.getOperand(1);
25226      SDValue Src2 = Op.getOperand(2);
25227      SDValue passThru = Op.getOperand(3);
25228      SDValue Mask = Op.getOperand(4);
25229      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      // There are 2 kinds of intrinsics in this group:
      // (1) With suppress-all-exceptions (SAE) or a rounding mode - 6 operands
      // (2) With both a rounding mode and SAE - 7 operands.
25233      bool HasRounding = IntrWithRoundingModeOpcode != 0;
25234      if (Op.getNumOperands() == (5U + HasRounding)) {
25235        if (HasRounding) {
25236          SDValue Rnd = Op.getOperand(5);
25237          unsigned RC = 0;
25238          if (isRoundModeSAEToX(Rnd, RC))
25239            return getScalarMaskingNode(
25240                DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25241                            DAG.getTargetConstant(RC, dl, MVT::i32)),
25242                Mask, passThru, Subtarget, DAG);
25243          if (!isRoundModeCurDirection(Rnd))
25244            return SDValue();
25245        }
25246        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25247                                                Src2),
25248                                    Mask, passThru, Subtarget, DAG);
25249      }
25250
25251      assert(Op.getNumOperands() == (6U + HasRounding) &&
25252             "Unexpected intrinsic form");
25253      SDValue RoundingMode = Op.getOperand(5);
25254      unsigned Opc = IntrData->Opc0;
25255      if (HasRounding) {
25256        SDValue Sae = Op.getOperand(6);
25257        if (isRoundModeSAE(Sae))
25258          Opc = IntrWithRoundingModeOpcode;
25259        else if (!isRoundModeCurDirection(Sae))
25260          return SDValue();
25261      }
25262      return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25263                                              Src2, RoundingMode),
25264                                  Mask, passThru, Subtarget, DAG);
25265    }
25266    case INTR_TYPE_SCALAR_MASK_RND: {
25267      SDValue Src1 = Op.getOperand(1);
25268      SDValue Src2 = Op.getOperand(2);
25269      SDValue passThru = Op.getOperand(3);
25270      SDValue Mask = Op.getOperand(4);
25271      SDValue Rnd = Op.getOperand(5);
25272
25273      SDValue NewOp;
25274      unsigned RC = 0;
25275      if (isRoundModeCurDirection(Rnd))
25276        NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25277      else if (isRoundModeSAEToX(Rnd, RC))
25278        NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25279                            DAG.getTargetConstant(RC, dl, MVT::i32));
25280      else
25281        return SDValue();
25282
25283      return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25284    }
25285    case INTR_TYPE_SCALAR_MASK_SAE: {
25286      SDValue Src1 = Op.getOperand(1);
25287      SDValue Src2 = Op.getOperand(2);
25288      SDValue passThru = Op.getOperand(3);
25289      SDValue Mask = Op.getOperand(4);
25290      SDValue Sae = Op.getOperand(5);
25291      unsigned Opc;
25292      if (isRoundModeCurDirection(Sae))
25293        Opc = IntrData->Opc0;
25294      else if (isRoundModeSAE(Sae))
25295        Opc = IntrData->Opc1;
25296      else
25297        return SDValue();
25298
25299      return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25300                                  Mask, passThru, Subtarget, DAG);
25301    }
25302    case INTR_TYPE_2OP_MASK: {
25303      SDValue Src1 = Op.getOperand(1);
25304      SDValue Src2 = Op.getOperand(2);
25305      SDValue PassThru = Op.getOperand(3);
25306      SDValue Mask = Op.getOperand(4);
25307      SDValue NewOp;
25308      if (IntrData->Opc1 != 0) {
25309        SDValue Rnd = Op.getOperand(5);
25310        unsigned RC = 0;
25311        if (isRoundModeSAEToX(Rnd, RC))
25312          NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25313                              DAG.getTargetConstant(RC, dl, MVT::i32));
25314        else if (!isRoundModeCurDirection(Rnd))
25315          return SDValue();
25316      }
25317      if (!NewOp)
25318        NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25319      return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25320    }
25321    case INTR_TYPE_2OP_MASK_SAE: {
25322      SDValue Src1 = Op.getOperand(1);
25323      SDValue Src2 = Op.getOperand(2);
25324      SDValue PassThru = Op.getOperand(3);
25325      SDValue Mask = Op.getOperand(4);
25326
25327      unsigned Opc = IntrData->Opc0;
25328      if (IntrData->Opc1 != 0) {
25329        SDValue Sae = Op.getOperand(5);
25330        if (isRoundModeSAE(Sae))
25331          Opc = IntrData->Opc1;
25332        else if (!isRoundModeCurDirection(Sae))
25333          return SDValue();
25334      }
25335
25336      return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25337                                  Mask, PassThru, Subtarget, DAG);
25338    }
25339    case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25340      SDValue Src1 = Op.getOperand(1);
25341      SDValue Src2 = Op.getOperand(2);
25342      SDValue Src3 = Op.getOperand(3);
25343      SDValue PassThru = Op.getOperand(4);
25344      SDValue Mask = Op.getOperand(5);
25345      SDValue Sae = Op.getOperand(6);
25346      unsigned Opc;
25347      if (isRoundModeCurDirection(Sae))
25348        Opc = IntrData->Opc0;
25349      else if (isRoundModeSAE(Sae))
25350        Opc = IntrData->Opc1;
25351      else
25352        return SDValue();
25353
25354      return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25355                                  Mask, PassThru, Subtarget, DAG);
25356    }
25357    case INTR_TYPE_3OP_MASK_SAE: {
25358      SDValue Src1 = Op.getOperand(1);
25359      SDValue Src2 = Op.getOperand(2);
25360      SDValue Src3 = Op.getOperand(3);
25361      SDValue PassThru = Op.getOperand(4);
25362      SDValue Mask = Op.getOperand(5);
25363
25364      unsigned Opc = IntrData->Opc0;
25365      if (IntrData->Opc1 != 0) {
25366        SDValue Sae = Op.getOperand(6);
25367        if (isRoundModeSAE(Sae))
25368          Opc = IntrData->Opc1;
25369        else if (!isRoundModeCurDirection(Sae))
25370          return SDValue();
25371      }
25372      return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25373                                  Mask, PassThru, Subtarget, DAG);
25374    }
25375    case BLENDV: {
25376      SDValue Src1 = Op.getOperand(1);
25377      SDValue Src2 = Op.getOperand(2);
25378      SDValue Src3 = Op.getOperand(3);
25379
25380      EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25381      Src3 = DAG.getBitcast(MaskVT, Src3);
25382
25383      // Reverse the operands to match VSELECT order.
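      // For example (illustrative), blendvps(a, b, mask) picks from b where
      // the mask's sign bit is set, i.e. (vselect mask, b, a), hence the
      // (Src3, Src2, Src1) operand order below.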
25384      return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25385    }
    case VPERM_2OP: {
25387      SDValue Src1 = Op.getOperand(1);
25388      SDValue Src2 = Op.getOperand(2);
25389
      // Swap Src1 and Src2 in the node creation.
      return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
25392    }
25393    case IFMA_OP:
25394      // NOTE: We need to swizzle the operands to pass the multiply operands
25395      // first.
25396      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25397                         Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25398    case FPCLASSS: {
25399      SDValue Src1 = Op.getOperand(1);
25400      SDValue Imm = Op.getOperand(2);
25401      SDValue Mask = Op.getOperand(3);
25402      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25403      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25404                                                 Subtarget, DAG);
      // Need to fill with zeroes to ensure the bitcast will produce zeroes
      // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25407      SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25408                                DAG.getConstant(0, dl, MVT::v8i1),
25409                                FPclassMask, DAG.getIntPtrConstant(0, dl));
25410      return DAG.getBitcast(MVT::i8, Ins);
25411    }
25412
25413    case CMP_MASK_CC: {
25414      MVT MaskVT = Op.getSimpleValueType();
25415      SDValue CC = Op.getOperand(3);
25416      SDValue Mask = Op.getOperand(4);
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check whether the intrinsic may have a non-default rounding
      // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
25420      if (IntrData->Opc1 != 0) {
25421        SDValue Sae = Op.getOperand(5);
25422        if (isRoundModeSAE(Sae))
25423          return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25424                             Op.getOperand(2), CC, Mask, Sae);
25425        if (!isRoundModeCurDirection(Sae))
25426          return SDValue();
25427      }
      // Default rounding mode.
25429      return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25430                         {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25431    }
25432    case CMP_MASK_SCALAR_CC: {
25433      SDValue Src1 = Op.getOperand(1);
25434      SDValue Src2 = Op.getOperand(2);
25435      SDValue CC = Op.getOperand(3);
25436      SDValue Mask = Op.getOperand(4);
25437
25438      SDValue Cmp;
25439      if (IntrData->Opc1 != 0) {
25440        SDValue Sae = Op.getOperand(5);
25441        if (isRoundModeSAE(Sae))
25442          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25443        else if (!isRoundModeCurDirection(Sae))
25444          return SDValue();
25445      }
      // Default rounding mode.
25447      if (!Cmp.getNode())
25448        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25449
25450      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25451                                             Subtarget, DAG);
      // Need to fill with zeroes to ensure the bitcast will produce zeroes
      // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25454      SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25455                                DAG.getConstant(0, dl, MVT::v8i1),
25456                                CmpMask, DAG.getIntPtrConstant(0, dl));
25457      return DAG.getBitcast(MVT::i8, Ins);
25458    }
25459    case COMI: { // Comparison intrinsics
25460      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25461      SDValue LHS = Op.getOperand(1);
25462      SDValue RHS = Op.getOperand(2);
25463      // Some conditions require the operands to be swapped.
25464      if (CC == ISD::SETLT || CC == ISD::SETLE)
25465        std::swap(LHS, RHS);
25466
25467      SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25468      SDValue SetCC;
25469      switch (CC) {
25470      case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25471        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25472        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25473        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25474        break;
25475      }
25476      case ISD::SETNE: { // (ZF = 1 or PF = 1)
25477        SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25478        SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25479        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25480        break;
25481      }
25482      case ISD::SETGT: // (CF = 0 and ZF = 0)
25483      case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25484        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25485        break;
25486      }
25487      case ISD::SETGE: // CF = 0
25488      case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25489        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25490        break;
25491      default:
25492        llvm_unreachable("Unexpected illegal condition!");
25493      }
25494      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25495    }
25496    case COMI_RM: { // Comparison intrinsics with Sae
25497      SDValue LHS = Op.getOperand(1);
25498      SDValue RHS = Op.getOperand(2);
25499      unsigned CondVal = Op.getConstantOperandVal(3);
25500      SDValue Sae = Op.getOperand(4);
25501
25502      SDValue FCmp;
25503      if (isRoundModeCurDirection(Sae))
25504        FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25505                           DAG.getTargetConstant(CondVal, dl, MVT::i8));
25506      else if (isRoundModeSAE(Sae))
25507        FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25508                           DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25509      else
25510        return SDValue();
      // Need to fill with zeroes to ensure the bitcast will produce zeroes
      // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25513      SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25514                                DAG.getConstant(0, dl, MVT::v16i1),
25515                                FCmp, DAG.getIntPtrConstant(0, dl));
25516      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25517                         DAG.getBitcast(MVT::i16, Ins));
25518    }
25519    case VSHIFT:
25520      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25521                                 Op.getOperand(1), Op.getOperand(2), Subtarget,
25522                                 DAG);
25523    case COMPRESS_EXPAND_IN_REG: {
25524      SDValue Mask = Op.getOperand(3);
25525      SDValue DataToCompress = Op.getOperand(1);
25526      SDValue PassThru = Op.getOperand(2);
25527      if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25528        return Op.getOperand(1);
25529
25530      // Avoid false dependency.
25531      if (PassThru.isUndef())
25532        PassThru = DAG.getConstant(0, dl, VT);
25533
25534      return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25535                         Mask);
25536    }
25537    case FIXUPIMM:
25538    case FIXUPIMM_MASKZ: {
25539      SDValue Src1 = Op.getOperand(1);
25540      SDValue Src2 = Op.getOperand(2);
25541      SDValue Src3 = Op.getOperand(3);
25542      SDValue Imm = Op.getOperand(4);
25543      SDValue Mask = Op.getOperand(5);
25544      SDValue Passthru = (IntrData->Type == FIXUPIMM)
25545                             ? Src1
25546                             : getZeroVector(VT, Subtarget, DAG, dl);
25547
25548      unsigned Opc = IntrData->Opc0;
25549      if (IntrData->Opc1 != 0) {
25550        SDValue Sae = Op.getOperand(6);
25551        if (isRoundModeSAE(Sae))
25552          Opc = IntrData->Opc1;
25553        else if (!isRoundModeCurDirection(Sae))
25554          return SDValue();
25555      }
25556
25557      SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25558
25559      if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25560        return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25561
25562      return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25563    }
25564    case ROUNDP: {
25565      assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
25566      // Clear the upper bits of the rounding immediate so that the legacy
25567      // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25568      auto Round = cast<ConstantSDNode>(Op.getOperand(2));
25569      SDValue RoundingMode =
25570          DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25571      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25572                         Op.getOperand(1), RoundingMode);
25573    }
25574    case ROUNDS: {
25575      assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
25576      // Clear the upper bits of the rounding immediate so that the legacy
25577      // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25578      auto Round = cast<ConstantSDNode>(Op.getOperand(3));
25579      SDValue RoundingMode =
25580          DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25581      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25582                         Op.getOperand(1), Op.getOperand(2), RoundingMode);
25583    }
25584    case BEXTRI: {
25585      assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
25586
25587      uint64_t Imm = Op.getConstantOperandVal(2);
25588      SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
25589                                              Op.getValueType());
25590      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25591                         Op.getOperand(1), Control);
25592    }
25593    // ADC/ADCX/SBB
25594    case ADX: {
25595      SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
25596      SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
25597
25598      SDValue Res;
      // If the carry-in is zero, then we should just use ADD/SUB instead of
      // ADC/SBB.
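      // Illustrative example: an addcarry-style intrinsic whose carry-in is a
      // constant zero becomes a plain IntrData->Opc1 (ADD/SUB) node; otherwise
      // the i8 carry-in is converted into CF by adding -1 to it and the
      // IntrData->Opc0 (ADC/SBB) node consumes that flag.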
25601      if (isNullConstant(Op.getOperand(1))) {
25602        Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
25603                          Op.getOperand(3));
25604      } else {
25605        SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
25606                                    DAG.getConstant(-1, dl, MVT::i8));
25607        Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
25608                          Op.getOperand(3), GenCF.getValue(1));
25609      }
25610      SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
25611      SDValue Results[] = { SetCC, Res };
25612      return DAG.getMergeValues(Results, dl);
25613    }
25614    case CVTPD2PS_MASK:
25615    case CVTPD2DQ_MASK:
25616    case CVTQQ2PS_MASK:
25617    case TRUNCATE_TO_REG: {
25618      SDValue Src = Op.getOperand(1);
25619      SDValue PassThru = Op.getOperand(2);
25620      SDValue Mask = Op.getOperand(3);
25621
25622      if (isAllOnesConstant(Mask))
25623        return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25624
25625      MVT SrcVT = Src.getSimpleValueType();
25626      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25627      Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25628      return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
25629                         {Src, PassThru, Mask});
25630    }
25631    case CVTPS2PH_MASK: {
25632      SDValue Src = Op.getOperand(1);
25633      SDValue Rnd = Op.getOperand(2);
25634      SDValue PassThru = Op.getOperand(3);
25635      SDValue Mask = Op.getOperand(4);
25636
25637      if (isAllOnesConstant(Mask))
25638        return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
25639
25640      MVT SrcVT = Src.getSimpleValueType();
25641      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25642      Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25643      return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
25644                         PassThru, Mask);
25645
25646    }
25647    case CVTNEPS2BF16_MASK: {
25648      SDValue Src = Op.getOperand(1);
25649      SDValue PassThru = Op.getOperand(2);
25650      SDValue Mask = Op.getOperand(3);
25651
25652      if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25653        return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25654
25655      // Break false dependency.
25656      if (PassThru.isUndef())
25657        PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
25658
25659      return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
25660                         Mask);
25661    }
25662    default:
25663      break;
25664    }
25665  }
25666
25667  switch (IntNo) {
25668  default: return SDValue();    // Don't custom lower most intrinsics.
25669
  // ptest and testp intrinsics. The intrinsics these come from are designed to
  // return an integer value rather than just an instruction, so lower them to
  // the ptest or testp pattern and a setcc for the result.
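  // For example (illustrative), _mm_testz_si128(a, b) lowers to an
  // X86ISD::PTEST node followed by a SETCC on COND_E (ZF == 1), which is then
  // zero-extended to the i32 result.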
25673  case Intrinsic::x86_avx512_ktestc_b:
25674  case Intrinsic::x86_avx512_ktestc_w:
25675  case Intrinsic::x86_avx512_ktestc_d:
25676  case Intrinsic::x86_avx512_ktestc_q:
25677  case Intrinsic::x86_avx512_ktestz_b:
25678  case Intrinsic::x86_avx512_ktestz_w:
25679  case Intrinsic::x86_avx512_ktestz_d:
25680  case Intrinsic::x86_avx512_ktestz_q:
25681  case Intrinsic::x86_sse41_ptestz:
25682  case Intrinsic::x86_sse41_ptestc:
25683  case Intrinsic::x86_sse41_ptestnzc:
25684  case Intrinsic::x86_avx_ptestz_256:
25685  case Intrinsic::x86_avx_ptestc_256:
25686  case Intrinsic::x86_avx_ptestnzc_256:
25687  case Intrinsic::x86_avx_vtestz_ps:
25688  case Intrinsic::x86_avx_vtestc_ps:
25689  case Intrinsic::x86_avx_vtestnzc_ps:
25690  case Intrinsic::x86_avx_vtestz_pd:
25691  case Intrinsic::x86_avx_vtestc_pd:
25692  case Intrinsic::x86_avx_vtestnzc_pd:
25693  case Intrinsic::x86_avx_vtestz_ps_256:
25694  case Intrinsic::x86_avx_vtestc_ps_256:
25695  case Intrinsic::x86_avx_vtestnzc_ps_256:
25696  case Intrinsic::x86_avx_vtestz_pd_256:
25697  case Intrinsic::x86_avx_vtestc_pd_256:
25698  case Intrinsic::x86_avx_vtestnzc_pd_256: {
25699    unsigned TestOpc = X86ISD::PTEST;
25700    X86::CondCode X86CC;
25701    switch (IntNo) {
25702    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
25703    case Intrinsic::x86_avx512_ktestc_b:
25704    case Intrinsic::x86_avx512_ktestc_w:
25705    case Intrinsic::x86_avx512_ktestc_d:
25706    case Intrinsic::x86_avx512_ktestc_q:
25707      // CF = 1
25708      TestOpc = X86ISD::KTEST;
25709      X86CC = X86::COND_B;
25710      break;
25711    case Intrinsic::x86_avx512_ktestz_b:
25712    case Intrinsic::x86_avx512_ktestz_w:
25713    case Intrinsic::x86_avx512_ktestz_d:
25714    case Intrinsic::x86_avx512_ktestz_q:
25715      TestOpc = X86ISD::KTEST;
25716      X86CC = X86::COND_E;
25717      break;
25718    case Intrinsic::x86_avx_vtestz_ps:
25719    case Intrinsic::x86_avx_vtestz_pd:
25720    case Intrinsic::x86_avx_vtestz_ps_256:
25721    case Intrinsic::x86_avx_vtestz_pd_256:
25722      TestOpc = X86ISD::TESTP;
25723      LLVM_FALLTHROUGH;
25724    case Intrinsic::x86_sse41_ptestz:
25725    case Intrinsic::x86_avx_ptestz_256:
25726      // ZF = 1
25727      X86CC = X86::COND_E;
25728      break;
25729    case Intrinsic::x86_avx_vtestc_ps:
25730    case Intrinsic::x86_avx_vtestc_pd:
25731    case Intrinsic::x86_avx_vtestc_ps_256:
25732    case Intrinsic::x86_avx_vtestc_pd_256:
25733      TestOpc = X86ISD::TESTP;
25734      LLVM_FALLTHROUGH;
25735    case Intrinsic::x86_sse41_ptestc:
25736    case Intrinsic::x86_avx_ptestc_256:
25737      // CF = 1
25738      X86CC = X86::COND_B;
25739      break;
25740    case Intrinsic::x86_avx_vtestnzc_ps:
25741    case Intrinsic::x86_avx_vtestnzc_pd:
25742    case Intrinsic::x86_avx_vtestnzc_ps_256:
25743    case Intrinsic::x86_avx_vtestnzc_pd_256:
25744      TestOpc = X86ISD::TESTP;
25745      LLVM_FALLTHROUGH;
25746    case Intrinsic::x86_sse41_ptestnzc:
25747    case Intrinsic::x86_avx_ptestnzc_256:
25748      // ZF and CF = 0
25749      X86CC = X86::COND_A;
25750      break;
25751    }
25752
25753    SDValue LHS = Op.getOperand(1);
25754    SDValue RHS = Op.getOperand(2);
25755    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
25756    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
25757    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25758  }
25759
25760  case Intrinsic::x86_sse42_pcmpistria128:
25761  case Intrinsic::x86_sse42_pcmpestria128:
25762  case Intrinsic::x86_sse42_pcmpistric128:
25763  case Intrinsic::x86_sse42_pcmpestric128:
25764  case Intrinsic::x86_sse42_pcmpistrio128:
25765  case Intrinsic::x86_sse42_pcmpestrio128:
25766  case Intrinsic::x86_sse42_pcmpistris128:
25767  case Intrinsic::x86_sse42_pcmpestris128:
25768  case Intrinsic::x86_sse42_pcmpistriz128:
25769  case Intrinsic::x86_sse42_pcmpestriz128: {
25770    unsigned Opcode;
25771    X86::CondCode X86CC;
25772    switch (IntNo) {
25773    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
25774    case Intrinsic::x86_sse42_pcmpistria128:
25775      Opcode = X86ISD::PCMPISTR;
25776      X86CC = X86::COND_A;
25777      break;
25778    case Intrinsic::x86_sse42_pcmpestria128:
25779      Opcode = X86ISD::PCMPESTR;
25780      X86CC = X86::COND_A;
25781      break;
25782    case Intrinsic::x86_sse42_pcmpistric128:
25783      Opcode = X86ISD::PCMPISTR;
25784      X86CC = X86::COND_B;
25785      break;
25786    case Intrinsic::x86_sse42_pcmpestric128:
25787      Opcode = X86ISD::PCMPESTR;
25788      X86CC = X86::COND_B;
25789      break;
25790    case Intrinsic::x86_sse42_pcmpistrio128:
25791      Opcode = X86ISD::PCMPISTR;
25792      X86CC = X86::COND_O;
25793      break;
25794    case Intrinsic::x86_sse42_pcmpestrio128:
25795      Opcode = X86ISD::PCMPESTR;
25796      X86CC = X86::COND_O;
25797      break;
25798    case Intrinsic::x86_sse42_pcmpistris128:
25799      Opcode = X86ISD::PCMPISTR;
25800      X86CC = X86::COND_S;
25801      break;
25802    case Intrinsic::x86_sse42_pcmpestris128:
25803      Opcode = X86ISD::PCMPESTR;
25804      X86CC = X86::COND_S;
25805      break;
25806    case Intrinsic::x86_sse42_pcmpistriz128:
25807      Opcode = X86ISD::PCMPISTR;
25808      X86CC = X86::COND_E;
25809      break;
25810    case Intrinsic::x86_sse42_pcmpestriz128:
25811      Opcode = X86ISD::PCMPESTR;
25812      X86CC = X86::COND_E;
25813      break;
25814    }
25815    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25816    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25817    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
25818    SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
25819    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25820  }
25821
25822  case Intrinsic::x86_sse42_pcmpistri128:
25823  case Intrinsic::x86_sse42_pcmpestri128: {
25824    unsigned Opcode;
25825    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
25826      Opcode = X86ISD::PCMPISTR;
25827    else
25828      Opcode = X86ISD::PCMPESTR;
25829
25830    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25831    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25832    return DAG.getNode(Opcode, dl, VTs, NewOps);
25833  }
25834
25835  case Intrinsic::x86_sse42_pcmpistrm128:
25836  case Intrinsic::x86_sse42_pcmpestrm128: {
25837    unsigned Opcode;
25838    if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
25839      Opcode = X86ISD::PCMPISTR;
25840    else
25841      Opcode = X86ISD::PCMPESTR;
25842
25843    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25844    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25845    return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
25846  }
25847
25848  case Intrinsic::eh_sjlj_lsda: {
25849    MachineFunction &MF = DAG.getMachineFunction();
25850    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25851    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25852    auto &Context = MF.getMMI().getContext();
25853    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
25854                                            Twine(MF.getFunctionNumber()));
25855    return DAG.getNode(getGlobalWrapperKind(), dl, VT,
25856                       DAG.getMCSymbol(S, PtrVT));
25857  }
25858
25859  case Intrinsic::x86_seh_lsda: {
25860    // Compute the symbol for the LSDA. We know it'll get emitted later.
25861    MachineFunction &MF = DAG.getMachineFunction();
25862    SDValue Op1 = Op.getOperand(1);
25863    auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
25864    MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
25865        GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25866
25867    // Generate a simple absolute symbol reference. This intrinsic is only
25868    // supported on 32-bit Windows, which isn't PIC.
25869    SDValue Result = DAG.getMCSymbol(LSDASym, VT);
25870    return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
25871  }
25872
25873  case Intrinsic::eh_recoverfp: {
25874    SDValue FnOp = Op.getOperand(1);
25875    SDValue IncomingFPOp = Op.getOperand(2);
25876    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
25877    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
25878    if (!Fn)
25879      report_fatal_error(
25880          "llvm.eh.recoverfp must take a function as the first argument");
25881    return recoverFramePointer(DAG, Fn, IncomingFPOp);
25882  }
25883
25884  case Intrinsic::localaddress: {
25885    // Returns one of the stack, base, or frame pointer registers, depending on
25886    // which is used to reference local variables.
25887    MachineFunction &MF = DAG.getMachineFunction();
25888    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25889    unsigned Reg;
25890    if (RegInfo->hasBasePointer(MF))
25891      Reg = RegInfo->getBaseRegister();
25892    else { // Handles the SP or FP case.
25893      bool CantUseFP = RegInfo->hasStackRealignment(MF);
25894      if (CantUseFP)
25895        Reg = RegInfo->getPtrSizedStackRegister(MF);
25896      else
25897        Reg = RegInfo->getPtrSizedFrameRegister(MF);
25898    }
25899    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
25900  }
25901  case Intrinsic::swift_async_context_addr: {
25902    auto &MF = DAG.getMachineFunction();
25903    auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
25904    if (Subtarget.is64Bit()) {
25905      MF.getFrameInfo().setFrameAddressIsTaken(true);
25906      X86FI->setHasSwiftAsyncContext(true);
25907      return SDValue(
25908          DAG.getMachineNode(
25909              X86::SUB64ri8, dl, MVT::i64,
25910              DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
25911              DAG.getTargetConstant(8, dl, MVT::i32)),
25912          0);
25913    } else {
      // 32-bit mode has no special extended frame; create or reuse an existing
      // stack slot.
25916      if (!X86FI->getSwiftAsyncContextFrameIdx())
25917        X86FI->setSwiftAsyncContextFrameIdx(
25918            MF.getFrameInfo().CreateStackObject(4, Align(4), false));
25919      return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
25920    }
25921  }
25922  case Intrinsic::x86_avx512_vp2intersect_q_512:
25923  case Intrinsic::x86_avx512_vp2intersect_q_256:
25924  case Intrinsic::x86_avx512_vp2intersect_q_128:
25925  case Intrinsic::x86_avx512_vp2intersect_d_512:
25926  case Intrinsic::x86_avx512_vp2intersect_d_256:
25927  case Intrinsic::x86_avx512_vp2intersect_d_128: {
25928    MVT MaskVT = Op.getSimpleValueType();
25929
25930    SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
25931    SDLoc DL(Op);
25932
25933    SDValue Operation =
25934        DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
25935                    Op->getOperand(1), Op->getOperand(2));
25936
25937    SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
25938                                                 MaskVT, Operation);
25939    SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
25940                                                 MaskVT, Operation);
25941    return DAG.getMergeValues({Result0, Result1}, DL);
25942  }
25943  case Intrinsic::x86_mmx_pslli_w:
25944  case Intrinsic::x86_mmx_pslli_d:
25945  case Intrinsic::x86_mmx_pslli_q:
25946  case Intrinsic::x86_mmx_psrli_w:
25947  case Intrinsic::x86_mmx_psrli_d:
25948  case Intrinsic::x86_mmx_psrli_q:
25949  case Intrinsic::x86_mmx_psrai_w:
25950  case Intrinsic::x86_mmx_psrai_d: {
25951    SDLoc DL(Op);
25952    SDValue ShAmt = Op.getOperand(2);
25953    // If the argument is a constant, convert it to a target constant.
25954    if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
      // Clamp out-of-bounds shift amounts since they will otherwise be masked
      // to 8 bits, which may make them appear in bounds again.
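      // For example, a constant shift amount of 256 would become 0 after an
      // 8-bit truncation and suddenly look in range, whereas the clamped
      // value 255 still shifts out every element.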
25957      unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
25958      if (ShiftAmount == 0)
25959        return Op.getOperand(1);
25960
25961      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
25962                         Op.getOperand(0), Op.getOperand(1),
25963                         DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
25964    }
25965
25966    unsigned NewIntrinsic;
25967    switch (IntNo) {
25968    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
25969    case Intrinsic::x86_mmx_pslli_w:
25970      NewIntrinsic = Intrinsic::x86_mmx_psll_w;
25971      break;
25972    case Intrinsic::x86_mmx_pslli_d:
25973      NewIntrinsic = Intrinsic::x86_mmx_psll_d;
25974      break;
25975    case Intrinsic::x86_mmx_pslli_q:
25976      NewIntrinsic = Intrinsic::x86_mmx_psll_q;
25977      break;
25978    case Intrinsic::x86_mmx_psrli_w:
25979      NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
25980      break;
25981    case Intrinsic::x86_mmx_psrli_d:
25982      NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
25983      break;
25984    case Intrinsic::x86_mmx_psrli_q:
25985      NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
25986      break;
25987    case Intrinsic::x86_mmx_psrai_w:
25988      NewIntrinsic = Intrinsic::x86_mmx_psra_w;
25989      break;
25990    case Intrinsic::x86_mmx_psrai_d:
25991      NewIntrinsic = Intrinsic::x86_mmx_psra_d;
25992      break;
25993    }
25994
    // The vector shift intrinsics with scalars use 32-bit shift amounts but
    // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
    // MMX register.
25998    ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
25999    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26000                       DAG.getTargetConstant(NewIntrinsic, DL,
26001                                             getPointerTy(DAG.getDataLayout())),
26002                       Op.getOperand(1), ShAmt);
26003  }
26004  }
26005}
26006
26007static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26008                                 SDValue Src, SDValue Mask, SDValue Base,
26009                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
26010                                 const X86Subtarget &Subtarget) {
26011  SDLoc dl(Op);
26012  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26013  // Scale must be constant.
26014  if (!C)
26015    return SDValue();
26016  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26017  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26018                                        TLI.getPointerTy(DAG.getDataLayout()));
26019  EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26020  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
  // If the source is undef or we know it won't be used, use a zero vector
  // to break the register dependency.
26023  // TODO: use undef instead and let BreakFalseDeps deal with it?
26024  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26025    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26026
26027  // Cast mask to an integer type.
26028  Mask = DAG.getBitcast(MaskVT, Mask);
26029
26030  MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26031
  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26033  SDValue Res =
26034      DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26035                              MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26036  return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26037}
26038
26039static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26040                             SDValue Src, SDValue Mask, SDValue Base,
26041                             SDValue Index, SDValue ScaleOp, SDValue Chain,
26042                             const X86Subtarget &Subtarget) {
26043  MVT VT = Op.getSimpleValueType();
26044  SDLoc dl(Op);
26045  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26046  // Scale must be constant.
26047  if (!C)
26048    return SDValue();
26049  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26050  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26051                                        TLI.getPointerTy(DAG.getDataLayout()));
26052  unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26053                              VT.getVectorNumElements());
26054  MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26055
  // We support two versions of the gather intrinsics: one with a scalar mask
  // and one with a vXi1 mask. Convert the scalar to vXi1 if necessary.
26058  if (Mask.getValueType() != MaskVT)
26059    Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26060
26061  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
  // If the source is undef or we know it won't be used, use a zero vector
  // to break the register dependency.
26064  // TODO: use undef instead and let BreakFalseDeps deal with it?
26065  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26066    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26067
26068  MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26069
  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26071  SDValue Res =
26072      DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26073                              MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26074  return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26075}
26076
26077static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26078                               SDValue Src, SDValue Mask, SDValue Base,
26079                               SDValue Index, SDValue ScaleOp, SDValue Chain,
26080                               const X86Subtarget &Subtarget) {
26081  SDLoc dl(Op);
26082  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26083  // Scale must be constant.
26084  if (!C)
26085    return SDValue();
26086  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26087  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26088                                        TLI.getPointerTy(DAG.getDataLayout()));
26089  unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26090                              Src.getSimpleValueType().getVectorNumElements());
26091  MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26092
  // We support two versions of the scatter intrinsics: one with a scalar mask
  // and one with a vXi1 mask. Convert the scalar to vXi1 if necessary.
26095  if (Mask.getValueType() != MaskVT)
26096    Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26097
26098  MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26099
26100  SDVTList VTs = DAG.getVTList(MVT::Other);
26101  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26102  SDValue Res =
26103      DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26104                              MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26105  return Res;
26106}
26107
26108static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26109                               SDValue Mask, SDValue Base, SDValue Index,
26110                               SDValue ScaleOp, SDValue Chain,
26111                               const X86Subtarget &Subtarget) {
26112  SDLoc dl(Op);
26113  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26114  // Scale must be constant.
26115  if (!C)
26116    return SDValue();
26117  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26118  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26119                                        TLI.getPointerTy(DAG.getDataLayout()));
26120  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26121  SDValue Segment = DAG.getRegister(0, MVT::i32);
26122  MVT MaskVT =
26123    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26124  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26125  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26126  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26127  return SDValue(Res, 0);
26128}
26129
/// Handles the lowering of builtin intrinsics with chain that return their
/// value into registers EDX:EAX.
/// If operand SrcReg is a valid register identifier, then operand 2 of N is
/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
/// TargetOpcode.
/// Returns a Glue value which can be used to add an extra copy-from-reg if the
/// expanded intrinsic implicitly defines extra registers (i.e. not just
/// EDX:EAX).
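///
/// For example (illustrative), expanding RDTSC with this helper emits the
/// target node, copies the counter out of EDX:EAX (RDX:RAX in 64-bit mode),
/// and merges the two halves into a single i64 pushed onto \p Results.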
26138static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26139                                        SelectionDAG &DAG,
26140                                        unsigned TargetOpcode,
26141                                        unsigned SrcReg,
26142                                        const X86Subtarget &Subtarget,
26143                                        SmallVectorImpl<SDValue> &Results) {
26144  SDValue Chain = N->getOperand(0);
26145  SDValue Glue;
26146
26147  if (SrcReg) {
26148    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26149    Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26150    Glue = Chain.getValue(1);
26151  }
26152
26153  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26154  SDValue N1Ops[] = {Chain, Glue};
26155  SDNode *N1 = DAG.getMachineNode(
26156      TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26157  Chain = SDValue(N1, 0);
26158
  // The expanded intrinsic returns its result in EDX:EAX; read it back
  // (via RAX/RDX in 64-bit mode).
26160  SDValue LO, HI;
26161  if (Subtarget.is64Bit()) {
26162    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26163    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26164                            LO.getValue(2));
26165  } else {
26166    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26167    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26168                            LO.getValue(2));
26169  }
26170  Chain = HI.getValue(1);
26171  Glue = HI.getValue(2);
26172
26173  if (Subtarget.is64Bit()) {
26174    // Merge the two 32-bit values into a 64-bit one.
26175    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26176                              DAG.getConstant(32, DL, MVT::i8));
26177    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26178    Results.push_back(Chain);
26179    return Glue;
26180  }
26181
26182  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26183  SDValue Ops[] = { LO, HI };
26184  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26185  Results.push_back(Pair);
26186  Results.push_back(Chain);
26187  return Glue;
26188}
26189
26190/// Handles the lowering of builtin intrinsics that read the time stamp counter
26191/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26192/// READCYCLECOUNTER nodes.
26193static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26194                                    SelectionDAG &DAG,
26195                                    const X86Subtarget &Subtarget,
26196                                    SmallVectorImpl<SDValue> &Results) {
26197  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26198  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26199  // and the EAX register is loaded with the low-order 32 bits.
26200  SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26201                                             /* NoRegister */0, Subtarget,
26202                                             Results);
26203  if (Opcode != X86::RDTSCP)
26204    return;
26205
26206  SDValue Chain = Results[1];
  // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
26208  // the ECX register. Add 'ecx' explicitly to the chain.
26209  SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26210  Results[1] = ecx;
26211  Results.push_back(ecx.getValue(1));
26212}
26213
26214static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26215                                     SelectionDAG &DAG) {
26216  SmallVector<SDValue, 3> Results;
26217  SDLoc DL(Op);
26218  getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26219                          Results);
26220  return DAG.getMergeValues(Results, DL);
26221}
26222
26223static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26224  MachineFunction &MF = DAG.getMachineFunction();
26225  SDValue Chain = Op.getOperand(0);
26226  SDValue RegNode = Op.getOperand(2);
26227  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26228  if (!EHInfo)
26229    report_fatal_error("EH registrations only live in functions using WinEH");
26230
26231  // Cast the operand to an alloca, and remember the frame index.
26232  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26233  if (!FINode)
26234    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26235  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26236
26237  // Return the chain operand without making any DAG nodes.
26238  return Chain;
26239}
26240
26241static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26242  MachineFunction &MF = DAG.getMachineFunction();
26243  SDValue Chain = Op.getOperand(0);
26244  SDValue EHGuard = Op.getOperand(2);
26245  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26246  if (!EHInfo)
26247    report_fatal_error("EHGuard only live in functions using WinEH");
26248
26249  // Cast the operand to an alloca, and remember the frame index.
26250  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26251  if (!FINode)
26252    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26253  EHInfo->EHGuardFrameIndex = FINode->getIndex();
26254
26255  // Return the chain operand without making any DAG nodes.
26256  return Chain;
26257}
26258
26259/// Emit Truncating Store with signed or unsigned saturation.
26260static SDValue
26261EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
26262                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26263                SelectionDAG &DAG) {
26264  SDVTList VTs = DAG.getVTList(MVT::Other);
26265  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26266  SDValue Ops[] = { Chain, Val, Ptr, Undef };
26267  unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26268  return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26269}
26270
26271/// Emit Masked Truncating Store with signed or unsigned saturation.
26272static SDValue
26273EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
26274                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26275                      MachineMemOperand *MMO, SelectionDAG &DAG) {
26276  SDVTList VTs = DAG.getVTList(MVT::Other);
26277  SDValue Ops[] = { Chain, Val, Ptr, Mask };
26278  unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26279  return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26280}
26281
26282static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26283                                      SelectionDAG &DAG) {
26284  unsigned IntNo = Op.getConstantOperandVal(1);
26285  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26286  if (!IntrData) {
26287    switch (IntNo) {
26288    case llvm::Intrinsic::x86_seh_ehregnode:
26289      return MarkEHRegistrationNode(Op, DAG);
26290    case llvm::Intrinsic::x86_seh_ehguard:
26291      return MarkEHGuard(Op, DAG);
26292    case llvm::Intrinsic::x86_rdpkru: {
26293      SDLoc dl(Op);
26294      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26295      // Create a RDPKRU node and pass 0 to the ECX parameter.
26296      return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26297                         DAG.getConstant(0, dl, MVT::i32));
26298    }
26299    case llvm::Intrinsic::x86_wrpkru: {
26300      SDLoc dl(Op);
      // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26302      // to the EDX and ECX parameters.
26303      return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26304                         Op.getOperand(0), Op.getOperand(2),
26305                         DAG.getConstant(0, dl, MVT::i32),
26306                         DAG.getConstant(0, dl, MVT::i32));
26307    }
26308    case llvm::Intrinsic::x86_flags_read_u32:
26309    case llvm::Intrinsic::x86_flags_read_u64:
26310    case llvm::Intrinsic::x86_flags_write_u32:
26311    case llvm::Intrinsic::x86_flags_write_u64: {
26312      // We need a frame pointer because this will get lowered to a PUSH/POP
26313      // sequence.
26314      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26315      MFI.setHasCopyImplyingStackAdjustment(true);
26316      // Don't do anything here, we will expand these intrinsics out later
26317      // during FinalizeISel in EmitInstrWithCustomInserter.
26318      return Op;
26319    }
26320    case Intrinsic::x86_lwpins32:
26321    case Intrinsic::x86_lwpins64:
26322    case Intrinsic::x86_umwait:
26323    case Intrinsic::x86_tpause: {
26324      SDLoc dl(Op);
26325      SDValue Chain = Op->getOperand(0);
26326      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26327      unsigned Opcode;
26328
26329      switch (IntNo) {
26330      default: llvm_unreachable("Impossible intrinsic");
26331      case Intrinsic::x86_umwait:
26332        Opcode = X86ISD::UMWAIT;
26333        break;
26334      case Intrinsic::x86_tpause:
26335        Opcode = X86ISD::TPAUSE;
26336        break;
26337      case Intrinsic::x86_lwpins32:
26338      case Intrinsic::x86_lwpins64:
26339        Opcode = X86ISD::LWPINS;
26340        break;
26341      }
26342
26343      SDValue Operation =
26344          DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26345                      Op->getOperand(3), Op->getOperand(4));
26346      SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26347      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26348                         Operation.getValue(1));
26349    }
26350    case Intrinsic::x86_enqcmd:
26351    case Intrinsic::x86_enqcmds: {
26352      SDLoc dl(Op);
26353      SDValue Chain = Op.getOperand(0);
26354      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26355      unsigned Opcode;
26356      switch (IntNo) {
26357      default: llvm_unreachable("Impossible intrinsic!");
26358      case Intrinsic::x86_enqcmd:
26359        Opcode = X86ISD::ENQCMD;
26360        break;
26361      case Intrinsic::x86_enqcmds:
26362        Opcode = X86ISD::ENQCMDS;
26363        break;
26364      }
26365      SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26366                                      Op.getOperand(3));
26367      SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26368      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26369                         Operation.getValue(1));
26370    }
26371    case Intrinsic::x86_aesenc128kl:
26372    case Intrinsic::x86_aesdec128kl:
26373    case Intrinsic::x86_aesenc256kl:
26374    case Intrinsic::x86_aesdec256kl: {
26375      SDLoc DL(Op);
26376      SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26377      SDValue Chain = Op.getOperand(0);
26378      unsigned Opcode;
26379
26380      switch (IntNo) {
26381      default: llvm_unreachable("Impossible intrinsic");
26382      case Intrinsic::x86_aesenc128kl:
26383        Opcode = X86ISD::AESENC128KL;
26384        break;
26385      case Intrinsic::x86_aesdec128kl:
26386        Opcode = X86ISD::AESDEC128KL;
26387        break;
26388      case Intrinsic::x86_aesenc256kl:
26389        Opcode = X86ISD::AESENC256KL;
26390        break;
26391      case Intrinsic::x86_aesdec256kl:
26392        Opcode = X86ISD::AESDEC256KL;
26393        break;
26394      }
26395
26396      MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26397      MachineMemOperand *MMO = MemIntr->getMemOperand();
26398      EVT MemVT = MemIntr->getMemoryVT();
26399      SDValue Operation = DAG.getMemIntrinsicNode(
26400          Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26401          MMO);
26402      SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26403
26404      return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26405                         {ZF, Operation.getValue(0), Operation.getValue(2)});
26406    }
26407    case Intrinsic::x86_aesencwide128kl:
26408    case Intrinsic::x86_aesdecwide128kl:
26409    case Intrinsic::x86_aesencwide256kl:
26410    case Intrinsic::x86_aesdecwide256kl: {
26411      SDLoc DL(Op);
26412      SDVTList VTs = DAG.getVTList(
26413          {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26414           MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26415      SDValue Chain = Op.getOperand(0);
26416      unsigned Opcode;
26417
26418      switch (IntNo) {
26419      default: llvm_unreachable("Impossible intrinsic");
26420      case Intrinsic::x86_aesencwide128kl:
26421        Opcode = X86ISD::AESENCWIDE128KL;
26422        break;
26423      case Intrinsic::x86_aesdecwide128kl:
26424        Opcode = X86ISD::AESDECWIDE128KL;
26425        break;
26426      case Intrinsic::x86_aesencwide256kl:
26427        Opcode = X86ISD::AESENCWIDE256KL;
26428        break;
26429      case Intrinsic::x86_aesdecwide256kl:
26430        Opcode = X86ISD::AESDECWIDE256KL;
26431        break;
26432      }
26433
26434      MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26435      MachineMemOperand *MMO = MemIntr->getMemOperand();
26436      EVT MemVT = MemIntr->getMemoryVT();
26437      SDValue Operation = DAG.getMemIntrinsicNode(
26438          Opcode, DL, VTs,
26439          {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26440           Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26441           Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26442          MemVT, MMO);
26443      SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26444
26445      return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26446                         {ZF, Operation.getValue(1), Operation.getValue(2),
26447                          Operation.getValue(3), Operation.getValue(4),
26448                          Operation.getValue(5), Operation.getValue(6),
26449                          Operation.getValue(7), Operation.getValue(8),
26450                          Operation.getValue(9)});
26451    }
26452    case Intrinsic::x86_testui: {
26453      SDLoc dl(Op);
26454      SDValue Chain = Op.getOperand(0);
26455      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26456      SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26457      SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26458      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26459                         Operation.getValue(1));
26460    }
26461    }
26462    return SDValue();
26463  }
26464
26465  SDLoc dl(Op);
  switch (IntrData->Type) {
26467  default: llvm_unreachable("Unknown Intrinsic Type");
26468  case RDSEED:
26469  case RDRAND: {
26470    // Emit the node with the right value type.
26471    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
26472    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26473
    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the random result, which is always 0 in that case,
    // cast to i32.
26476    SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
26477                     DAG.getConstant(1, dl, Op->getValueType(1)),
26478                     DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
26479                     SDValue(Result.getNode(), 1)};
26480    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
26481
26482    // Return { result, isValid, chain }.
26483    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
26484                       SDValue(Result.getNode(), 2));
26485  }
26486  case GATHER_AVX2: {
26487    SDValue Chain = Op.getOperand(0);
26488    SDValue Src   = Op.getOperand(2);
26489    SDValue Base  = Op.getOperand(3);
26490    SDValue Index = Op.getOperand(4);
26491    SDValue Mask  = Op.getOperand(5);
26492    SDValue Scale = Op.getOperand(6);
26493    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26494                             Scale, Chain, Subtarget);
26495  }
26496  case GATHER: {
    // gather(v1, base, index, mask, scale);
26498    SDValue Chain = Op.getOperand(0);
26499    SDValue Src   = Op.getOperand(2);
26500    SDValue Base  = Op.getOperand(3);
26501    SDValue Index = Op.getOperand(4);
26502    SDValue Mask  = Op.getOperand(5);
26503    SDValue Scale = Op.getOperand(6);
26504    return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
26505                         Chain, Subtarget);
26506  }
26507  case SCATTER: {
    // scatter(base, mask, index, v1, scale);
26509    SDValue Chain = Op.getOperand(0);
26510    SDValue Base  = Op.getOperand(2);
26511    SDValue Mask  = Op.getOperand(3);
26512    SDValue Index = Op.getOperand(4);
26513    SDValue Src   = Op.getOperand(5);
26514    SDValue Scale = Op.getOperand(6);
26515    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26516                          Scale, Chain, Subtarget);
26517  }
26518  case PREFETCH: {
26519    const APInt &HintVal = Op.getConstantOperandAPInt(6);
26520    assert((HintVal == 2 || HintVal == 3) &&
26521           "Wrong prefetch hint in intrinsic: should be 2 or 3");
26522    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
26523    SDValue Chain = Op.getOperand(0);
26524    SDValue Mask  = Op.getOperand(2);
26525    SDValue Index = Op.getOperand(3);
26526    SDValue Base  = Op.getOperand(4);
26527    SDValue Scale = Op.getOperand(5);
26528    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
26529                           Subtarget);
26530  }
26531  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
26532  case RDTSC: {
26533    SmallVector<SDValue, 2> Results;
26534    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
26535                            Results);
26536    return DAG.getMergeValues(Results, dl);
26537  }
26538  // Read Performance Monitoring Counters.
26539  case RDPMC:
  // Get Extended Control Register.
26541  case XGETBV: {
26542    SmallVector<SDValue, 2> Results;
26543
26544    // RDPMC uses ECX to select the index of the performance counter to read.
26545    // XGETBV uses ECX to select the index of the XCR register to return.
26546    // The result is stored into registers EDX:EAX.
26547    expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
26548                                Subtarget, Results);
26549    return DAG.getMergeValues(Results, dl);
26550  }
26551  // XTEST intrinsics.
26552  case XTEST: {
26553    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
26554    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26555
26556    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
26557    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
26558    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
26559                       Ret, SDValue(InTrans.getNode(), 1));
26560  }
26561  case TRUNCATE_TO_MEM_VI8:
26562  case TRUNCATE_TO_MEM_VI16:
26563  case TRUNCATE_TO_MEM_VI32: {
26564    SDValue Mask = Op.getOperand(4);
26565    SDValue DataToTruncate = Op.getOperand(3);
26566    SDValue Addr = Op.getOperand(2);
26567    SDValue Chain = Op.getOperand(0);
26568
26569    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
26570    assert(MemIntr && "Expected MemIntrinsicSDNode!");
26571
26572    EVT MemVT  = MemIntr->getMemoryVT();
26573
26574    uint16_t TruncationOp = IntrData->Opc0;
26575    switch (TruncationOp) {
26576    case X86ISD::VTRUNC: {
26577      if (isAllOnesConstant(Mask)) // return just a truncate store
26578        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
26579                                 MemIntr->getMemOperand());
26580
26581      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26582      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26583      SDValue Offset = DAG.getUNDEF(VMask.getValueType());
26584
26585      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
26586                                MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
26587                                true /* truncating */);
26588    }
26589    case X86ISD::VTRUNCUS:
26590    case X86ISD::VTRUNCS: {
26591      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
26592      if (isAllOnesConstant(Mask))
26593        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
26594                               MemIntr->getMemOperand(), DAG);
26595
26596      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26597      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26598
26599      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
26600                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
26601    }
26602    default:
26603      llvm_unreachable("Unsupported truncstore intrinsic");
26604    }
26605  }
26606  }
26607}
26608
26609SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
26610                                           SelectionDAG &DAG) const {
26611  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26612  MFI.setReturnAddressIsTaken(true);
26613
26614  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
26615    return SDValue();
26616
26617  unsigned Depth = Op.getConstantOperandVal(0);
26618  SDLoc dl(Op);
26619  EVT PtrVT = getPointerTy(DAG.getDataLayout());
26620
26621  if (Depth > 0) {
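    // The requested frame's return address sits one slot above its saved
    // frame pointer, so compute that frame's address and load from
    // FrameAddr + SlotSize.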
26622    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
26623    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26624    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
26625    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26626                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
26627                       MachinePointerInfo());
26628  }
26629
26630  // Just load the return address.
26631  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
26632  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
26633                     MachinePointerInfo());
26634}
26635
26636SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
26637                                                 SelectionDAG &DAG) const {
26638  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
26639  return getReturnAddressFrameIndex(DAG);
26640}
26641
26642SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
26643  MachineFunction &MF = DAG.getMachineFunction();
26644  MachineFrameInfo &MFI = MF.getFrameInfo();
26645  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26646  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26647  EVT VT = Op.getValueType();
26648
26649  MFI.setFrameAddressIsTaken(true);
26650
26651  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
26652    // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
26653    // is not possible to crawl up the stack without looking at the unwind codes
26654    // simultaneously.
26655    int FrameAddrIndex = FuncInfo->getFAIndex();
26656    if (!FrameAddrIndex) {
26657      // Set up a frame object for the return address.
26658      unsigned SlotSize = RegInfo->getSlotSize();
26659      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
26660          SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
26661      FuncInfo->setFAIndex(FrameAddrIndex);
26662    }
26663    return DAG.getFrameIndex(FrameAddrIndex, VT);
26664  }
26665
26666  unsigned FrameReg =
26667      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
26668  SDLoc dl(Op);  // FIXME probably not meaningful
26669  unsigned Depth = Op.getConstantOperandVal(0);
26670  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
26671          (FrameReg == X86::EBP && VT == MVT::i32)) &&
26672         "Invalid Frame Register!");
26673  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
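  // The frame pointer points at the caller's saved frame pointer, so each
  // load walks one frame up the chain.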
26674  while (Depth--)
26675    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
26676                            MachinePointerInfo());
26677  return FrameAddr;
26678}
26679
26680// FIXME? Maybe this could be a TableGen attribute on some registers and
26681// this table could be generated automatically from RegInfo.
26682Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
26683                                              const MachineFunction &MF) const {
26684  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26685
26686  Register Reg = StringSwitch<unsigned>(RegName)
26687                       .Case("esp", X86::ESP)
26688                       .Case("rsp", X86::RSP)
26689                       .Case("ebp", X86::EBP)
26690                       .Case("rbp", X86::RBP)
26691                       .Default(0);
26692
26693  if (Reg == X86::EBP || Reg == X86::RBP) {
26694    if (!TFI.hasFP(MF))
26695      report_fatal_error("register " + StringRef(RegName) +
26696                         " is allocatable: function has no frame pointer");
26697#ifndef NDEBUG
26698    else {
26699      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26700      Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
26701      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
26702             "Invalid Frame Register!");
26703    }
26704#endif
26705  }
26706
26707  if (Reg)
26708    return Reg;
26709
26710  report_fatal_error("Invalid register name global variable");
26711}
26712
26713SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
26714                                                     SelectionDAG &DAG) const {
26715  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
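  // The first incoming argument sits two slots above the frame pointer: one
  // slot for the saved frame pointer and one for the return address.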
26716  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
26717}
26718
26719Register X86TargetLowering::getExceptionPointerRegister(
26720    const Constant *PersonalityFn) const {
26721  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
26722    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26723
26724  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
26725}
26726
26727Register X86TargetLowering::getExceptionSelectorRegister(
26728    const Constant *PersonalityFn) const {
26729  // Funclet personalities don't use selectors (the runtime does the selection).
26730  if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
26731    return X86::NoRegister;
26732  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26733}
26734
26735bool X86TargetLowering::needsFixedCatchObjects() const {
26736  return Subtarget.isTargetWin64();
26737}
26738
26739SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
26740  SDValue Chain     = Op.getOperand(0);
26741  SDValue Offset    = Op.getOperand(1);
26742  SDValue Handler   = Op.getOperand(2);
26743  SDLoc dl      (Op);
26744
26745  EVT PtrVT = getPointerTy(DAG.getDataLayout());
26746  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26747  Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
26748  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
26749          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
26750         "Invalid Frame Register!");
26751  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
26752  Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
26753
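  // Store the handler into the (Offset-adjusted) return address slot, one
  // slot above the frame pointer, and pass that address to EH_RETURN in
  // ECX/RCX.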
26754  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
26755                                 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
26756                                                       dl));
26757  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
26758  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
26759  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
26760
26761  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
26762                     DAG.getRegister(StoreAddrReg, PtrVT));
26763}
26764
26765SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
26766                                               SelectionDAG &DAG) const {
26767  SDLoc DL(Op);
26768  // If the subtarget is not 64bit, we may need the global base reg
26769  // after isel expand pseudo, i.e., after CGBR pass ran.
26770  // Therefore, ask for the GlobalBaseReg now, so that the pass
26771  // inserts the code for us in case we need it.
26772  // Otherwise, we will end up in a situation where we will
26773  // reference a virtual register that is not defined!
26774  if (!Subtarget.is64Bit()) {
26775    const X86InstrInfo *TII = Subtarget.getInstrInfo();
26776    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
26777  }
26778  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
26779                     DAG.getVTList(MVT::i32, MVT::Other),
26780                     Op.getOperand(0), Op.getOperand(1));
26781}
26782
26783SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
26784                                                SelectionDAG &DAG) const {
26785  SDLoc DL(Op);
26786  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
26787                     Op.getOperand(0), Op.getOperand(1));
26788}
26789
26790SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
26791                                                       SelectionDAG &DAG) const {
26792  SDLoc DL(Op);
26793  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
26794                     Op.getOperand(0));
26795}
26796
26797static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
26798  return Op.getOperand(0);
26799}
26800
26801SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
26802                                                SelectionDAG &DAG) const {
26803  SDValue Root = Op.getOperand(0);
26804  SDValue Trmp = Op.getOperand(1); // trampoline
26805  SDValue FPtr = Op.getOperand(2); // nested function
26806  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
26807  SDLoc dl (Op);
26808
26809  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26810  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26811
26812  if (Subtarget.is64Bit()) {
26813    SDValue OutChains[6];
26814
26815    // Large code-model.
26816    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
26817    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
26818
26819    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
26820    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
26821
26822    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
26823
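    // The stores below assemble this 23-byte trampoline:
    //   0:  49 BB <FPtr>   movabsq $FPtr, %r11
    //   10: 49 BA <Nest>   movabsq $Nest, %r10
    //   20: 49 FF E3       jmpq    *%r11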
26824    // Load the pointer to the nested function into R11.
26825    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
26826    SDValue Addr = Trmp;
26827    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26828                                Addr, MachinePointerInfo(TrmpAddr));
26829
26830    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26831                       DAG.getConstant(2, dl, MVT::i64));
26832    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
26833                                MachinePointerInfo(TrmpAddr, 2), Align(2));
26834
26835    // Load the 'nest' parameter value into R10.
26836    // R10 is specified in X86CallingConv.td
26837    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
26838    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26839                       DAG.getConstant(10, dl, MVT::i64));
26840    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26841                                Addr, MachinePointerInfo(TrmpAddr, 10));
26842
26843    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26844                       DAG.getConstant(12, dl, MVT::i64));
26845    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
26846                                MachinePointerInfo(TrmpAddr, 12), Align(2));
26847
26848    // Jump to the nested function.
26849    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
26850    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26851                       DAG.getConstant(20, dl, MVT::i64));
26852    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26853                                Addr, MachinePointerInfo(TrmpAddr, 20));
26854
26855    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
26856    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26857                       DAG.getConstant(22, dl, MVT::i64));
26858    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
26859                                Addr, MachinePointerInfo(TrmpAddr, 22));
26860
26861    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
26862  } else {
26863    const Function *Func =
26864      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
26865    CallingConv::ID CC = Func->getCallingConv();
26866    unsigned NestReg;
26867
26868    switch (CC) {
26869    default:
26870      llvm_unreachable("Unsupported calling convention");
26871    case CallingConv::C:
26872    case CallingConv::X86_StdCall: {
26873      // Pass 'nest' parameter in ECX.
26874      // Must be kept in sync with X86CallingConv.td
26875      NestReg = X86::ECX;
26876
26877      // Check that ECX wasn't needed by an 'inreg' parameter.
26878      FunctionType *FTy = Func->getFunctionType();
26879      const AttributeList &Attrs = Func->getAttributes();
26880
26881      if (!Attrs.isEmpty() && !Func->isVarArg()) {
26882        unsigned InRegCount = 0;
26883        unsigned Idx = 1;
26884
26885        for (FunctionType::param_iterator I = FTy->param_begin(),
26886             E = FTy->param_end(); I != E; ++I, ++Idx)
26887          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
26888            const DataLayout &DL = DAG.getDataLayout();
26889            // FIXME: should only count parameters that are lowered to integers.
26890            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
26891          }
26892
26893        if (InRegCount > 2) {
26894          report_fatal_error("Nest register in use - reduce number of inreg"
26895                             " parameters!");
26896        }
26897      }
26898      break;
26899    }
26900    case CallingConv::X86_FastCall:
26901    case CallingConv::X86_ThisCall:
26902    case CallingConv::Fast:
26903    case CallingConv::Tail:
26904    case CallingConv::SwiftTail:
26905      // Pass 'nest' parameter in EAX.
26906      // Must be kept in sync with X86CallingConv.td
26907      NestReg = X86::EAX;
26908      break;
26909    }
26910
26911    SDValue OutChains[4];
26912    SDValue Addr, Disp;
26913
26914    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26915                       DAG.getConstant(10, dl, MVT::i32));
26916    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
26917
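    // The stores below assemble this 10-byte trampoline:
    //   0: B8+r <Nest>    movl $Nest, %eax/%ecx
    //   5: E9 <rel32>     jmp  FPtr   (rel32 is relative to Trmp+10)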
26918    // This is storing the opcode for MOV32ri.
26919    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
26920    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
26921    OutChains[0] =
26922        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
26923                     Trmp, MachinePointerInfo(TrmpAddr));
26924
26925    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26926                       DAG.getConstant(1, dl, MVT::i32));
26927    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
26928                                MachinePointerInfo(TrmpAddr, 1), Align(1));
26929
26930    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
26931    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26932                       DAG.getConstant(5, dl, MVT::i32));
26933    OutChains[2] =
26934        DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
26935                     MachinePointerInfo(TrmpAddr, 5), Align(1));
26936
26937    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26938                       DAG.getConstant(6, dl, MVT::i32));
26939    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
26940                                MachinePointerInfo(TrmpAddr, 6), Align(1));
26941
26942    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
26943  }
26944}
26945
26946SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
26947                                            SelectionDAG &DAG) const {
26948  /*
26949   The rounding mode is in bits 11:10 of FPSR, and has the following
26950   settings:
26951     00 Round to nearest
26952     01 Round to -inf
26953     10 Round to +inf
26954     11 Round to 0
26955
26956  FLT_ROUNDS, on the other hand, expects the following:
26957    -1 Undefined
26958     0 Round to 0
26959     1 Round to nearest
26960     2 Round to +inf
26961     3 Round to -inf
26962
26963  To perform the conversion, we use a packed lookup table of the four 2-bit
26964  values that we can index by FPSP[11:10]
26965    0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
26966
26967    (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
26968  */
26969
26970  MachineFunction &MF = DAG.getMachineFunction();
26971  MVT VT = Op.getSimpleValueType();
26972  SDLoc DL(Op);
26973
26974  // Save FP Control Word to stack slot
26975  int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
26976  SDValue StackSlot =
26977      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
26978
26979  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
26980
26981  SDValue Chain = Op.getOperand(0);
26982  SDValue Ops[] = {Chain, StackSlot};
26983  Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
26984                                  DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
26985                                  Align(2), MachineMemOperand::MOStore);
26986
26987  // Load FP Control Word from stack slot
26988  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
26989  Chain = CWD.getValue(1);
26990
26991  // Mask and turn the control bits into a shift for the lookup table.
26992  SDValue Shift =
26993    DAG.getNode(ISD::SRL, DL, MVT::i16,
26994                DAG.getNode(ISD::AND, DL, MVT::i16,
26995                            CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
26996                DAG.getConstant(9, DL, MVT::i8));
26997  Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
26998
26999  SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27000  SDValue RetVal =
27001    DAG.getNode(ISD::AND, DL, MVT::i32,
27002                DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27003                DAG.getConstant(3, DL, MVT::i32));
27004
27005  RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27006
27007  return DAG.getMergeValues({RetVal, Chain}, DL);
27008}
27009
27010SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27011                                             SelectionDAG &DAG) const {
27012  MachineFunction &MF = DAG.getMachineFunction();
27013  SDLoc DL(Op);
27014  SDValue Chain = Op.getNode()->getOperand(0);
27015
  // The FP control word can be set only from data in memory, so we need to
  // allocate stack space to save/load it.
27018  int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27019  SDValue StackSlot =
27020      DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27021  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27022  MachineMemOperand *MMO =
27023      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27024
27025  // Store FP control word into memory.
27026  SDValue Ops[] = {Chain, StackSlot};
27027  Chain = DAG.getMemIntrinsicNode(
27028      X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27029
27030  // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27031  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27032  Chain = CWD.getValue(1);
27033  CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27034                    DAG.getConstant(0xf3ff, DL, MVT::i16));
27035
27036  // Calculate new rounding mode.
27037  SDValue NewRM = Op.getNode()->getOperand(1);
27038  SDValue RMBits;
27039  if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27040    uint64_t RM = CVal->getZExtValue();
27041    int FieldVal;
27042    switch (static_cast<RoundingMode>(RM)) {
27043    case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27044    case RoundingMode::TowardNegative:    FieldVal = X86::rmDownward; break;
27045    case RoundingMode::TowardPositive:    FieldVal = X86::rmUpward; break;
27046    case RoundingMode::TowardZero:        FieldVal = X86::rmTowardZero; break;
27047    default:
27048      llvm_unreachable("rounding mode is not supported by X86 hardware");
27049    }
27050    RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27051  } else {
27052    // Need to convert argument into bits of control word:
27053    //    0 Round to 0       -> 11
27054    //    1 Round to nearest -> 00
27055    //    2 Round to +inf    -> 10
27056    //    3 Round to -inf    -> 01
27057    // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
27058    // To make the conversion, put all these values into a value 0xc9 and shift
27059    // it left depending on the rounding mode:
27060    //    (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27061    //    (0xc9 << 6) & 0xc00 = X86::rmToNearest
27062    //    ...
27063    // (0xc9 << (2 * NewRM + 4)) & 0xc00
27064    SDValue ShiftValue =
27065        DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27066                    DAG.getNode(ISD::ADD, DL, MVT::i32,
27067                                DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27068                                            DAG.getConstant(1, DL, MVT::i8)),
27069                                DAG.getConstant(4, DL, MVT::i32)));
27070    SDValue Shifted =
27071        DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27072                    ShiftValue);
27073    RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27074                         DAG.getConstant(0xc00, DL, MVT::i16));
27075  }
27076
27077  // Update rounding mode bits and store the new FP Control Word into stack.
27078  CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27079  Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
27080
27081  // Load FP control word from the slot.
27082  SDValue OpsLD[] = {Chain, StackSlot};
27083  MachineMemOperand *MMOL =
27084      MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27085  Chain = DAG.getMemIntrinsicNode(
27086      X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27087
27088  // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27089  // same way but in bits 14:13.
27090  if (Subtarget.hasSSE1()) {
27091    // Store MXCSR into memory.
27092    Chain = DAG.getNode(
27093        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27094        DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27095        StackSlot);
27096
27097    // Load MXCSR from stack slot and clear RM field (bits 14:13).
27098    SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27099    Chain = CWD.getValue(1);
27100    CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27101                      DAG.getConstant(0xffff9fff, DL, MVT::i32));
27102
27103    // Shift X87 RM bits from 11:10 to 14:13.
27104    RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27105    RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27106                         DAG.getConstant(3, DL, MVT::i8));
27107
27108    // Update rounding mode bits and store the new FP Control Word into stack.
27109    CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27110    Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
27111
27112    // Load MXCSR from the slot.
27113    Chain = DAG.getNode(
27114        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27115        DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27116        StackSlot);
27117  }
27118
27119  return Chain;
27120}
27121
/// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
27128static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27129                                         const X86Subtarget &Subtarget) {
27130  assert(Op.getOpcode() == ISD::CTLZ);
27131  SDLoc dl(Op);
27132  MVT VT = Op.getSimpleValueType();
27133  MVT EltVT = VT.getVectorElementType();
27134  unsigned NumElems = VT.getVectorNumElements();
27135
27136  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27137          "Unsupported element type");
27138
  // Split the vector; its Lo and Hi halves are handled in the next iteration.
27140  if (NumElems > 16 ||
27141      (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27142    return splitVectorIntUnary(Op, DAG);
27143
27144  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27145  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27146          "Unsupported value type for operation");
27147
27148  // Use native supported vector instruction vplzcntd.
27149  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27150  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27151  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27152  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27153
27154  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27155}
27156
27157// Lower CTLZ using a PSHUFB lookup table implementation.
27158static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27159                                       const X86Subtarget &Subtarget,
27160                                       SelectionDAG &DAG) {
27161  MVT VT = Op.getSimpleValueType();
27162  int NumElts = VT.getVectorNumElements();
27163  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27164  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27165
27166  // Per-nibble leading zero PSHUFB lookup table.
27167  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27168                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27169                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27170                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27171
27172  SmallVector<SDValue, 64> LUTVec;
27173  for (int i = 0; i < NumBytes; ++i)
27174    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27175  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27176
27177  // Begin by bitcasting the input to byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27179  // If the hi input nibble is zero then we add both results together, otherwise
27180  // we just take the hi result (by masking the lo result to zero before the
27181  // add).
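  // For example, for the byte 0x1A the hi nibble is 0x1 (LUT gives 3) and is
  // non-zero, so the lo result is masked away and ctlz(0x1A) == 3.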
27182  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27183  SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27184
27185  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27186  SDValue Lo = Op0;
27187  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27188  SDValue HiZ;
27189  if (CurrVT.is512BitVector()) {
27190    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27191    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27192    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27193  } else {
27194    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27195  }
27196
27197  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27198  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27199  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27200  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27201
27202  // Merge result back from vXi8 back to VT, working on the lo/hi halves
27203  // of the current vector width in the same way we did for the nibbles.
27204  // If the upper half of the input element is zero then add the halves'
27205  // leading zero counts together, otherwise just use the upper half's.
27206  // Double the width of the result until we are at target width.
27207  while (CurrVT != VT) {
27208    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27209    int CurrNumElts = CurrVT.getVectorNumElements();
27210    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27211    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27212    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27213
27214    // Check if the upper half of the input element is zero.
27215    if (CurrVT.is512BitVector()) {
27216      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27217      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27218                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27219      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27220    } else {
27221      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27222                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27223    }
27224    HiZ = DAG.getBitcast(NextVT, HiZ);
27225
27226    // Move the upper/lower halves to the lower bits as we'll be extending to
27227    // NextVT. Mask the lower result to zero if HiZ is true and add the results
27228    // together.
27229    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27230    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27231    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27232    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27233    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27234    CurrVT = NextVT;
27235  }
27236
27237  return Res;
27238}
27239
27240static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27241                               const X86Subtarget &Subtarget,
27242                               SelectionDAG &DAG) {
27243  MVT VT = Op.getSimpleValueType();
27244
27245  if (Subtarget.hasCDI() &&
      // vXi8 vectors need to be promoted to vXi32, which requires 512-bit
      // vectors.
27247      (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27248    return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27249
27250  // Decompose 256-bit ops into smaller 128-bit ops.
27251  if (VT.is256BitVector() && !Subtarget.hasInt256())
27252    return splitVectorIntUnary(Op, DAG);
27253
27254  // Decompose 512-bit ops into smaller 256-bit ops.
27255  if (VT.is512BitVector() && !Subtarget.hasBWI())
27256    return splitVectorIntUnary(Op, DAG);
27257
27258  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
27259  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
27260}
27261
27262static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27263                         SelectionDAG &DAG) {
27264  MVT VT = Op.getSimpleValueType();
27265  MVT OpVT = VT;
27266  unsigned NumBits = VT.getSizeInBits();
27267  SDLoc dl(Op);
27268  unsigned Opc = Op.getOpcode();
27269
27270  if (VT.isVector())
27271    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27272
27273  Op = Op.getOperand(0);
27274  if (VT == MVT::i8) {
    // Zero extend to i32 since there is no i8 bsr instruction.
27276    OpVT = MVT::i32;
27277    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27278  }
27279
27280  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
27281  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
27282  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
27283
27284  if (Opc == ISD::CTLZ) {
27285    // If src is zero (i.e. bsr sets ZF), returns NumBits.
27286    SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
27287                     DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27288                     Op.getValue(1)};
27289    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
27290  }
27291
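  // BSR returns the bit index of the most significant set bit, so
  // CTLZ == (NumBits - 1) - index, which for a power-of-two width equals
  // index ^ (NumBits - 1). The zero-input constant 2*NumBits-1 chosen above
  // likewise becomes NumBits after this XOR.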
27292  // Finally xor with NumBits-1.
27293  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
27294                   DAG.getConstant(NumBits - 1, dl, OpVT));
27295
27296  if (VT == MVT::i8)
27297    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
27298  return Op;
27299}
27300
27301static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
27302                         SelectionDAG &DAG) {
27303  MVT VT = Op.getSimpleValueType();
27304  unsigned NumBits = VT.getScalarSizeInBits();
27305  SDValue N0 = Op.getOperand(0);
27306  SDLoc dl(Op);
27307
27308  assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
27309         "Only scalar CTTZ requires custom lowering");
27310
27311  // Issue a bsf (scan bits forward) which also sets EFLAGS.
27312  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
27313  Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
27314
27315  // If src is zero (i.e. bsf sets ZF), returns NumBits.
27316  SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
27317                   DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27318                   Op.getValue(1)};
27319  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
27320}
27321
27322static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
27323                           const X86Subtarget &Subtarget) {
27324  MVT VT = Op.getSimpleValueType();
27325  if (VT == MVT::i16 || VT == MVT::i32)
27326    return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
27327
27328  if (VT == MVT::v32i16 || VT == MVT::v64i8)
27329    return splitVectorIntBinary(Op, DAG);
27330
27331  assert(Op.getSimpleValueType().is256BitVector() &&
27332         Op.getSimpleValueType().isInteger() &&
27333         "Only handle AVX 256-bit vector integer operation");
27334  return splitVectorIntBinary(Op, DAG);
27335}
27336
27337static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
27338                                  const X86Subtarget &Subtarget) {
27339  MVT VT = Op.getSimpleValueType();
27340  SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
27341  unsigned Opcode = Op.getOpcode();
27342  SDLoc DL(Op);
27343
27344  if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
27345      (VT.is256BitVector() && !Subtarget.hasInt256())) {
27346    assert(Op.getSimpleValueType().isInteger() &&
27347           "Only handle AVX vector integer operation");
27348    return splitVectorIntBinary(Op, DAG);
27349  }
27350
27351  // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
27352  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27353  EVT SetCCResultType =
27354      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27355
27356  if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
27357    // usubsat X, Y --> (X >u Y) ? X - Y : 0
27358    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
27359    SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
27360    // TODO: Move this to DAGCombiner?
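    // If the setcc result is an all-ones/all-zeros mask of the same type as
    // the subtraction, the select collapses to an AND with X - Y.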
27361    if (SetCCResultType == VT &&
27362        DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
27363      return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
27364    return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
27365  }
27366
27367  // Use default expansion.
27368  return SDValue();
27369}
27370
27371static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
27372                        SelectionDAG &DAG) {
27373  MVT VT = Op.getSimpleValueType();
27374  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
27375    // Since X86 does not have CMOV for 8-bit integer, we don't convert
27376    // 8-bit integer abs to NEG and CMOV.
27377    SDLoc DL(Op);
27378    SDValue N0 = Op.getOperand(0);
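    // Compute 0 - x; its EFLAGS encode the signed comparison 0 >= x, so the
    // CMOV below selects the negated value for non-positive x and x itself
    // otherwise.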
27379    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
27380                              DAG.getConstant(0, DL, VT), N0);
27381    SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
27382                     SDValue(Neg.getNode(), 1)};
27383    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
27384  }
27385
27386  // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
27387  if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
27388    SDLoc DL(Op);
27389    SDValue Src = Op.getOperand(0);
27390    SDValue Sub =
27391        DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
27392    return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
27393  }
27394
27395  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
27396    assert(VT.isInteger() &&
27397           "Only handle AVX 256-bit vector integer operation");
27398    return splitVectorIntUnary(Op, DAG);
27399  }
27400
27401  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27402    return splitVectorIntUnary(Op, DAG);
27403
27404  // Default to expand.
27405  return SDValue();
27406}
27407
27408static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
27409  MVT VT = Op.getSimpleValueType();
27410
27411  // For AVX1 cases, split to use legal ops (everything but v4i64).
27412  if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
27413    return splitVectorIntBinary(Op, DAG);
27414
27415  if (VT == MVT::v32i16 || VT == MVT::v64i8)
27416    return splitVectorIntBinary(Op, DAG);
27417
27418  // Default to expand.
27419  return SDValue();
27420}
27421
27422static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
27423                        SelectionDAG &DAG) {
27424  SDLoc dl(Op);
27425  MVT VT = Op.getSimpleValueType();
27426
27427  // Decompose 256-bit ops into 128-bit ops.
27428  if (VT.is256BitVector() && !Subtarget.hasInt256())
27429    return splitVectorIntBinary(Op, DAG);
27430
27431  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27432    return splitVectorIntBinary(Op, DAG);
27433
27434  SDValue A = Op.getOperand(0);
27435  SDValue B = Op.getOperand(1);
27436
  // Lower v16i8/v32i8/v64i8 mul by extending to v8i16/v16i16/v32i16 vector
  // pairs, multiplying and truncating.
27439  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
27440    unsigned NumElts = VT.getVectorNumElements();
27441
27442    if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27443        (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27444      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
27445      return DAG.getNode(
27446          ISD::TRUNCATE, dl, VT,
27447          DAG.getNode(ISD::MUL, dl, ExVT,
27448                      DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
27449                      DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
27450    }
27451
27452    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27453
    // Extract the lo/hi parts and any-extend them to i16.
    // We're going to mask each pmullw result element down to its low byte, so
    // it doesn't matter what's in the high byte of each 16-bit element.
27458    SDValue Undef = DAG.getUNDEF(VT);
27459    SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
27460    SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
27461
27462    SDValue BLo, BHi;
27463    if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27464      // If the RHS is a constant, manually unpackl/unpackh.
27465      SmallVector<SDValue, 16> LoOps, HiOps;
27466      for (unsigned i = 0; i != NumElts; i += 16) {
27467        for (unsigned j = 0; j != 8; ++j) {
27468          LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
27469                                               MVT::i16));
27470          HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
27471                                               MVT::i16));
27472        }
27473      }
27474
27475      BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27476      BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27477    } else {
27478      BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
27479      BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
27480    }
27481
    // Multiply, mask the lower 8 bits of the lo/hi results and pack.
27483    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
27484    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
27485    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
27486    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
27487    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27488  }
27489
27490  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
27491  if (VT == MVT::v4i32) {
27492    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
27493           "Should not custom lower when pmulld is available!");
27494
27495    // Extract the odd parts.
27496    static const int UnpackMask[] = { 1, -1, 3, -1 };
27497    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
27498    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
27499
27500    // Multiply the even parts.
27501    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27502                                DAG.getBitcast(MVT::v2i64, A),
27503                                DAG.getBitcast(MVT::v2i64, B));
27504    // Now multiply odd parts.
27505    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27506                               DAG.getBitcast(MVT::v2i64, Aodds),
27507                               DAG.getBitcast(MVT::v2i64, Bodds));
27508
27509    Evens = DAG.getBitcast(VT, Evens);
27510    Odds = DAG.getBitcast(VT, Odds);
27511
27512    // Merge the two vectors back together with a shuffle. This expands into 2
27513    // shuffles.
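    // Viewed as v4i32, Evens = <lo(A0*B0)|hi|lo(A2*B2)|hi> and
    // Odds = <lo(A1*B1)|hi|lo(A3*B3)|hi>, so the {0, 4, 2, 6} mask picks the
    // low 32 bits of each product back into the original lane order.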
27514    static const int ShufMask[] = { 0, 4, 2, 6 };
27515    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
27516  }
27517
27518  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
27519         "Only know how to lower V2I64/V4I64/V8I64 multiply");
27520  assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
27521
27522  //  Ahi = psrlqi(a, 32);
27523  //  Bhi = psrlqi(b, 32);
27524  //
27525  //  AloBlo = pmuludq(a, b);
27526  //  AloBhi = pmuludq(a, Bhi);
27527  //  AhiBlo = pmuludq(Ahi, b);
27528  //
27529  //  Hi = psllqi(AloBhi + AhiBlo, 32);
27530  //  return AloBlo + Hi;
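  //
  // This builds the full 64-bit product from 32-bit halves:
  //   A * B = (Alo + (Ahi << 32)) * (Blo + (Bhi << 32))
  //         = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32)   (mod 2^64)
  // The Ahi*Bhi term is shifted out entirely, so only three PMULUDQs are
  // needed, and below we skip any whose operand halves are known zero.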
27531  KnownBits AKnown = DAG.computeKnownBits(A);
27532  KnownBits BKnown = DAG.computeKnownBits(B);
27533
27534  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
27535  bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
27536  bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
27537
27538  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
27539  bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
27540  bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
27541
27542  SDValue Zero = DAG.getConstant(0, dl, VT);
27543
27544  // Only multiply lo/hi halves that aren't known to be zero.
27545  SDValue AloBlo = Zero;
27546  if (!ALoIsZero && !BLoIsZero)
27547    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
27548
27549  SDValue AloBhi = Zero;
27550  if (!ALoIsZero && !BHiIsZero) {
27551    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
27552    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
27553  }
27554
27555  SDValue AhiBlo = Zero;
27556  if (!AHiIsZero && !BLoIsZero) {
27557    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
27558    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
27559  }
27560
27561  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
27562  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
27563
27564  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
27565}
27566
27567static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
27568                                     MVT VT, bool IsSigned,
27569                                     const X86Subtarget &Subtarget,
27570                                     SelectionDAG &DAG,
27571                                     SDValue *Low = nullptr) {
27572  unsigned NumElts = VT.getVectorNumElements();
27573
  // For vXi8 we will unpack the low and high half of each 128-bit lane to
  // widen to a vXi16 type. Do the multiplies, shift the results and pack the
  // half lane results back together.
27577
27578  // We'll take different approaches for signed and unsigned.
  // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
  // and use pmullw to calculate the full 16-bit product.
  // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
27582  // shift them left into the upper byte of each word. This allows us to use
27583  // pmulhw to calculate the full 16-bit product. This trick means we don't
27584  // need to sign extend the bytes to use pmullw.
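  //
  // The signed trick works because each unpacked word holds the byte value
  // scaled by 2^8 (sign bit included), so
  //   pmulhw(a << 8, b << 8) == (a * b * 2^16) >> 16 == a * b,
  // i.e. exactly the full 16-bit signed product of the two bytes.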
27585
27586  MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27587  SDValue Zero = DAG.getConstant(0, dl, VT);
27588
27589  SDValue ALo, AHi;
27590  if (IsSigned) {
27591    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
27592    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
27593  } else {
27594    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
27595    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
27596  }
27597
27598  SDValue BLo, BHi;
27599  if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27600    // If the RHS is a constant, manually unpackl/unpackh and extend.
27601    SmallVector<SDValue, 16> LoOps, HiOps;
27602    for (unsigned i = 0; i != NumElts; i += 16) {
27603      for (unsigned j = 0; j != 8; ++j) {
27604        SDValue LoOp = B.getOperand(i + j);
27605        SDValue HiOp = B.getOperand(i + j + 8);
27606
27607        if (IsSigned) {
27608          LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
27609          HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
27610          LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
27611                             DAG.getConstant(8, dl, MVT::i16));
27612          HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
27613                             DAG.getConstant(8, dl, MVT::i16));
27614        } else {
27615          LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
27616          HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
27617        }
27618
27619        LoOps.push_back(LoOp);
27620        HiOps.push_back(HiOp);
27621      }
27622    }
27623
27624    BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27625    BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27626  } else if (IsSigned) {
27627    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
27628    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
27629  } else {
27630    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
27631    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
27632  }
27633
  // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results
  // and pack back to vXi8.
27636  unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
27637  SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
27638  SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
27639
27640  if (Low) {
27641    // Mask the lower bits and pack the results to rejoin the halves.
27642    SDValue Mask = DAG.getConstant(255, dl, ExVT);
27643    SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
27644    SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
27645    *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
27646  }
27647
27648  RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
27649  RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
27650
27651  // Bitcast back to VT and then pack all the even elements from Lo and Hi.
27652  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27653}
27654
27655static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
27656                         SelectionDAG &DAG) {
27657  SDLoc dl(Op);
27658  MVT VT = Op.getSimpleValueType();
27659  bool IsSigned = Op->getOpcode() == ISD::MULHS;
27660  unsigned NumElts = VT.getVectorNumElements();
27661  SDValue A = Op.getOperand(0);
27662  SDValue B = Op.getOperand(1);
27663
27664  // Decompose 256-bit ops into 128-bit ops.
27665  if (VT.is256BitVector() && !Subtarget.hasInt256())
27666    return splitVectorIntBinary(Op, DAG);
27667
27668  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27669    return splitVectorIntBinary(Op, DAG);
27670
27671  if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
27672    assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
27673           (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
27674           (VT == MVT::v16i32 && Subtarget.hasAVX512()));
27675
    // PMULxD operations multiply each even value (starting at 0) of LHS with
    // the related value of RHS and produce a widened result.
    // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
    // => <2 x i64> <ae|cg>
    //
    // In other words, to have all the results, we need to perform two PMULxD:
    // 1. one with the even values.
    // 2. one with the odd values.
    // To achieve #2, we need to place the odd values at an even position.
27685    //
27686    // Place the odd value at an even position (basically, shift all values 1
27687    // step to the left):
27688    const int Mask[] = {1, -1,  3, -1,  5, -1,  7, -1,
27689                        9, -1, 11, -1, 13, -1, 15, -1};
27690    // <a|b|c|d> => <b|undef|d|undef>
27691    SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
27692                                        makeArrayRef(&Mask[0], NumElts));
27693    // <e|f|g|h> => <f|undef|h|undef>
27694    SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
27695                                        makeArrayRef(&Mask[0], NumElts));
27696
27697    // Emit two multiplies, one for the lower 2 ints and one for the higher 2
27698    // ints.
27699    MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
27700    unsigned Opcode =
27701        (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
27702    // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27703    // => <2 x i64> <ae|cg>
27704    SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27705                                                  DAG.getBitcast(MulVT, A),
27706                                                  DAG.getBitcast(MulVT, B)));
27707    // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
27708    // => <2 x i64> <bf|dh>
27709    SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27710                                                  DAG.getBitcast(MulVT, Odd0),
27711                                                  DAG.getBitcast(MulVT, Odd1)));
27712
27713    // Shuffle it back into the right order.
27714    SmallVector<int, 16> ShufMask(NumElts);
27715    for (int i = 0; i != (int)NumElts; ++i)
27716      ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
27717
27718    SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
27719
    // If we have a signed multiply but no PMULDQ, fix up the result of an
    // unsigned multiply.
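    // Per 32-bit lane, mulhu(A, B) == mulhs(A, B) + (A < 0 ? B : 0) +
    // (B < 0 ? A : 0) (mod 2^32), so subtract those two correction terms
    // from the unsigned high half to get the signed high half.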
27722    if (IsSigned && !Subtarget.hasSSE41()) {
27723      SDValue Zero = DAG.getConstant(0, dl, VT);
27724      SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
27725                               DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
27726      SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
27727                               DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
27728
27729      SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
27730      Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
27731    }
27732
27733    return Res;
27734  }
27735
27736  // Only i8 vectors should need custom lowering after this.
27737  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
27738         (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
27739         "Unsupported vector type");
27740
27741  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
27742  // logical shift down the upper half and pack back to i8.
27743
  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack and
  // then ashr/lshr the upper bits down to the lower bits before the multiply.
27746
27747  if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27748      (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27749    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27750    unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27751    SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27752    SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27753    SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27754    Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
27755    return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27756  }
27757
27758  return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
27759}
27760
27761// Custom lowering for SMULO/UMULO.
27762static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
27763                         SelectionDAG &DAG) {
27764  MVT VT = Op.getSimpleValueType();
27765
27766  // Scalars defer to LowerXALUO.
27767  if (!VT.isVector())
27768    return LowerXALUO(Op, DAG);
27769
27770  SDLoc dl(Op);
27771  bool IsSigned = Op->getOpcode() == ISD::SMULO;
27772  SDValue A = Op.getOperand(0);
27773  SDValue B = Op.getOperand(1);
27774  EVT OvfVT = Op->getValueType(1);
27775
27776  if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
27777      (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
27778    // Extract the LHS Lo/Hi vectors
27779    SDValue LHSLo, LHSHi;
27780    std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
27781
27782    // Extract the RHS Lo/Hi vectors
27783    SDValue RHSLo, RHSHi;
27784    std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
27785
27786    EVT LoOvfVT, HiOvfVT;
27787    std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
27788    SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
27789    SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
27790
27791    // Issue the split operations.
27792    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
27793    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
27794
27795    // Join the separate data results and the overflow results.
27796    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
27797    SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
27798                              Hi.getValue(1));
27799
27800    return DAG.getMergeValues({Res, Ovf}, dl);
27801  }
27802
27803  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27804  EVT SetccVT =
27805      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27806
27807  if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27808      (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27809    unsigned NumElts = VT.getVectorNumElements();
27810    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27811    unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27812    SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27813    SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27814    SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27815
27816    SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27817
27818    SDValue Ovf;
27819    if (IsSigned) {
27820      SDValue High, LowSign;
27821      if (OvfVT.getVectorElementType() == MVT::i1 &&
27822          (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
        // Rather than truncating, try to do the compare on vXi16 or vXi32.
27824        // Shift the high down filling with sign bits.
27825        High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
27826        // Fill all 16 bits with the sign bit from the low.
27827        LowSign =
27828            getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
27829        LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
27830                                             15, DAG);
27831        SetccVT = OvfVT;
27832        if (!Subtarget.hasBWI()) {
27833          // We can't do a vXi16 compare so sign extend to v16i32.
27834          High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
27835          LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
27836        }
27837      } else {
27838        // Otherwise do the compare at vXi8.
27839        High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
27840        High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
27841        LowSign =
27842            DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
27843      }
27844
27845      Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
27846    } else {
27847      SDValue High =
27848          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
27849      if (OvfVT.getVectorElementType() == MVT::i1 &&
27850          (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
        // Rather than truncating, try to do the compare on vXi16 or vXi32.
        SetccVT = OvfVT;
        if (!Subtarget.hasBWI()) {
          // We can't do a vXi16 compare so zero extend to v16i32.
27855          High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
27856        }
27857      } else {
27858        // Otherwise do the compare at vXi8.
27859        High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
27860      }
27861
27862      Ovf =
27863          DAG.getSetCC(dl, SetccVT, High,
27864                       DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
27865    }
27866
27867    Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
27868
27869    return DAG.getMergeValues({Low, Ovf}, dl);
27870  }
27871
27872  SDValue Low;
27873  SDValue High =
27874      LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
27875
27876  SDValue Ovf;
27877  if (IsSigned) {
27878    // SMULO overflows if the high bits don't match the sign of the low.
27879    SDValue LowSign =
27880        DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
27881    Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
27882  } else {
27883    // UMULO overflows if the high bits are non-zero.
27884    Ovf =
27885        DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
27886  }
27887
27888  Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
27889
27890  return DAG.getMergeValues({Low, Ovf}, dl);
27891}
27892
27893SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
27894  assert(Subtarget.isTargetWin64() && "Unexpected target");
27895  EVT VT = Op.getValueType();
27896  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
27897         "Unexpected return type for lowering");
27898
27899  RTLIB::Libcall LC;
27900  bool isSigned;
27901  switch (Op->getOpcode()) {
27902  default: llvm_unreachable("Unexpected request for libcall!");
27903  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
27904  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
27905  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
27906  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
27907  }
27908
27909  SDLoc dl(Op);
27910  SDValue InChain = DAG.getEntryNode();
27911
27912  TargetLowering::ArgListTy Args;
27913  TargetLowering::ArgListEntry Entry;
27914  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
27915    EVT ArgVT = Op->getOperand(i).getValueType();
27916    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
27917           "Unexpected argument type for lowering");
27918    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
27919    int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
27920    MachinePointerInfo MPI =
27921        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
27922    Entry.Node = StackPtr;
27923    InChain =
27924        DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
27925    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy, 0);
27927    Entry.IsSExt = false;
27928    Entry.IsZExt = false;
27929    Args.push_back(Entry);
27930  }
27931
27932  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
27933                                         getPointerTy(DAG.getDataLayout()));
27934
27935  TargetLowering::CallLoweringInfo CLI(DAG);
27936  CLI.setDebugLoc(dl)
27937      .setChain(InChain)
27938      .setLibCallee(
27939          getLibcallCallingConv(LC),
27940          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
27941          std::move(Args))
27942      .setInRegister()
27943      .setSExtResult(isSigned)
27944      .setZExtResult(!isSigned);
27945
27946  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
27947  return DAG.getBitcast(VT, CallInfo.first);
27948}
27949
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget.
27952static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
27953                                        unsigned Opcode) {
27954  if (VT.getScalarSizeInBits() < 16)
27955    return false;
27956
27957  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
27958      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
27959    return true;
27960
27961  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
27962                (VT.is256BitVector() && Subtarget.hasInt256());
27963
27964  bool AShift = LShift && (Subtarget.hasAVX512() ||
27965                           (VT != MVT::v2i64 && VT != MVT::v4i64));
27966  return (Opcode == ISD::SRA) ? AShift : LShift;
27967}
27968
27969// The shift amount is a variable, but it is the same for all vector lanes.
27970// These instructions are defined together with shift-immediate.
27971static
27972bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
27973                                      unsigned Opcode) {
27974  return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
27975}
27976
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget.
27979static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
27980                                    unsigned Opcode) {
27981
27982  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
27983    return false;
27984
27985  // vXi16 supported only on AVX-512, BWI
27986  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
27987    return false;
27988
27989  if (Subtarget.hasAVX512())
27990    return true;
27991
27992  bool LShift = VT.is128BitVector() || VT.is256BitVector();
27993  bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
27994  return (Opcode == ISD::SRA) ? AShift : LShift;
27995}
27996
27997static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
27998                                         const X86Subtarget &Subtarget) {
27999  MVT VT = Op.getSimpleValueType();
28000  SDLoc dl(Op);
28001  SDValue R = Op.getOperand(0);
28002  SDValue Amt = Op.getOperand(1);
28003  unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
28004
28005  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
28006    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
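    // Treat each i64 lane as a pair of i32s: for shifts of 32 or more the
    // low half of the result is the high i32 arithmetically shifted by
    // (ShiftAmt - 32) and the high half is all sign bits; for smaller shifts
    // the high half is a plain i32 SRA and the low half comes from a logical
    // i64 SRL, with the pieces blended back together by a shuffle.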
28007    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
28008    SDValue Ex = DAG.getBitcast(ExVT, R);
28009
28010    // ashr(R, 63) === cmp_slt(R, 0)
28011    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
28012      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
28013             "Unsupported PCMPGT op");
28014      return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
28015    }
28016
28017    if (ShiftAmt >= 32) {
28018      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
28019      SDValue Upper =
28020          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
28021      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28022                                                 ShiftAmt - 32, DAG);
28023      if (VT == MVT::v2i64)
28024        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
28025      if (VT == MVT::v4i64)
28026        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28027                                  {9, 1, 11, 3, 13, 5, 15, 7});
28028    } else {
28029      // SRA upper i32, SRL whole i64 and select lower i32.
28030      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28031                                                 ShiftAmt, DAG);
28032      SDValue Lower =
28033          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
28034      Lower = DAG.getBitcast(ExVT, Lower);
28035      if (VT == MVT::v2i64)
28036        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
28037      if (VT == MVT::v4i64)
28038        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28039                                  {8, 1, 10, 3, 12, 5, 14, 7});
28040    }
28041    return DAG.getBitcast(VT, Ex);
28042  };
28043
28044  // Optimize shl/srl/sra with constant shift amount.
28045  APInt APIntShiftAmt;
28046  if (!X86::isConstantSplat(Amt, APIntShiftAmt))
28047    return SDValue();
28048
28049  // If the shift amount is out of range, return undef.
28050  if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
28051    return DAG.getUNDEF(VT);
28052
28053  uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
28054
28055  if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
28056    return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
28057
28058  // i64 SRA needs to be performed as partial shifts.
28059  if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
28060       (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
28061      Op.getOpcode() == ISD::SRA)
28062    return ArithmeticShiftRight64(ShiftAmt);
28063
28064  if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28065      (Subtarget.hasBWI() && VT == MVT::v64i8)) {
28066    unsigned NumElts = VT.getVectorNumElements();
28067    MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28068
28069    // Simple i8 add case
28070    if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
28071      return DAG.getNode(ISD::ADD, dl, VT, R, R);
28072
28073    // ashr(R, 7)  === cmp_slt(R, 0)
28074    if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
28075      SDValue Zeros = DAG.getConstant(0, dl, VT);
28076      if (VT.is512BitVector()) {
28077        assert(VT == MVT::v64i8 && "Unexpected element type!");
28078        SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
28079        return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
28080      }
28081      return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
28082    }
28083
28084    // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
28085    if (VT == MVT::v16i8 && Subtarget.hasXOP())
28086      return SDValue();
28087
28088    if (Op.getOpcode() == ISD::SHL) {
28089      // Make a large shift.
28090      SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
28091                                               ShiftAmt, DAG);
28092      SHL = DAG.getBitcast(VT, SHL);
28093      // Zero out the rightmost bits.
28094      APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
28095      return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
28096    }
28097    if (Op.getOpcode() == ISD::SRL) {
28098      // Make a large shift.
28099      SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
28100                                               ShiftAmt, DAG);
28101      SRL = DAG.getBitcast(VT, SRL);
28102      // Zero out the leftmost bits.
28103      APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
28104      return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
28105    }
28106    if (Op.getOpcode() == ISD::SRA) {
28107      // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
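      // e.g. for i8 R = 0x80 (-128), Amt = 2: lshr gives 0x20, Mask is
      // 128 >> 2 = 0x20, xor gives 0x00 and sub gives 0xE0 = -32, which
      // matches ashr(-128, 2).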
28108      SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28109
28110      SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
28111      Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
28112      Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
28113      return Res;
28114    }
28115    llvm_unreachable("Unknown shift opcode.");
28116  }
28117
28118  return SDValue();
28119}
28120
28121static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
28122                                        const X86Subtarget &Subtarget) {
28123  MVT VT = Op.getSimpleValueType();
28124  SDLoc dl(Op);
28125  SDValue R = Op.getOperand(0);
28126  SDValue Amt = Op.getOperand(1);
28127  unsigned Opcode = Op.getOpcode();
28128  unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
28129  unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
28130
28131  if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
28132    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
28133      MVT EltVT = VT.getVectorElementType();
28134      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
28135      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
28136        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
28137      else if (EltVT.bitsLT(MVT::i32))
28138        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28139
28140      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
28141    }
28142
    // vXi8 shifts - shift as vXi16 + mask result.
28144    if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
28145         (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
28146         VT == MVT::v64i8) &&
28147        !Subtarget.hasXOP()) {
28148      unsigned NumElts = VT.getVectorNumElements();
28149      MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28150      if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
28151        unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
28152        unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
28153        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28154
28155        // Create the mask using vXi16 shifts. For shift-rights we need to move
28156        // the upper byte down before splatting the vXi8 mask.
28157        SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
28158        BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
28159                                      BaseShAmt, Subtarget, DAG);
28160        if (Opcode != ISD::SHL)
28161          BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
28162                                               8, DAG);
28163        BitMask = DAG.getBitcast(VT, BitMask);
28164        BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
28165                                       SmallVector<int, 64>(NumElts, 0));
28166
28167        SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
28168                                          DAG.getBitcast(ExtVT, R), BaseShAmt,
28169                                          Subtarget, DAG);
28170        Res = DAG.getBitcast(VT, Res);
28171        Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
28172
28173        if (Opcode == ISD::SRA) {
28174          // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
28175          // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
28176          SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
28177          SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
28178                                         BaseShAmt, Subtarget, DAG);
28179          SignMask = DAG.getBitcast(VT, SignMask);
28180          Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
28181          Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
28182        }
28183        return Res;
28184      }
28185    }
28186  }
28187
28188  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
28189  if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
28190      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
28191    Amt = Amt.getOperand(0);
28192    unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
28193    std::vector<SDValue> Vals(Ratio);
28194    for (unsigned i = 0; i != Ratio; ++i)
28195      Vals[i] = Amt.getOperand(i);
28196    for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
28197      for (unsigned j = 0; j != Ratio; ++j)
28198        if (Vals[j] != Amt.getOperand(i + j))
28199          return SDValue();
28200    }
28201
28202    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
28203      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
28204  }
28205  return SDValue();
28206}
28207
28208// Convert a shift/rotate left amount to a multiplication scale factor.
28209static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
28210                                       const X86Subtarget &Subtarget,
28211                                       SelectionDAG &DAG) {
28212  MVT VT = Amt.getSimpleValueType();
28213  if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
28214        (Subtarget.hasInt256() && VT == MVT::v16i16) ||
28215        (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
28216        (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
28217    return SDValue();
28218
28219  if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
28220    SmallVector<SDValue, 8> Elts;
28221    MVT SVT = VT.getVectorElementType();
28222    unsigned SVTBits = SVT.getSizeInBits();
28223    APInt One(SVTBits, 1);
28224    unsigned NumElems = VT.getVectorNumElements();
28225
28226    for (unsigned i = 0; i != NumElems; ++i) {
28227      SDValue Op = Amt->getOperand(i);
28228      if (Op->isUndef()) {
28229        Elts.push_back(Op);
28230        continue;
28231      }
28232
28233      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
28234      APInt C(SVTBits, ND->getZExtValue());
28235      uint64_t ShAmt = C.getZExtValue();
28236      if (ShAmt >= SVTBits) {
28237        Elts.push_back(DAG.getUNDEF(SVT));
28238        continue;
28239      }
28240      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
28241    }
28242    return DAG.getBuildVector(VT, dl, Elts);
28243  }
28244
28245  // If the target doesn't support variable shifts, use either FP conversion
28246  // or integer multiplication to avoid shifting each element individually.
28247  if (VT == MVT::v4i32) {
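    // Build 2^Amt in the FP domain: (Amt << 23) places Amt in the IEEE-754
    // exponent field, adding 1.0f (0x3f800000) applies the exponent bias, so
    // the bit pattern is the float 2^Amt, which FP_TO_SINT converts back to
    // the integer scale factor.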
28248    Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
28249    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
28250                      DAG.getConstant(0x3f800000U, dl, VT));
28251    Amt = DAG.getBitcast(MVT::v4f32, Amt);
28252    return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
28253  }
28254
28255  // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
28256  if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
28257    SDValue Z = DAG.getConstant(0, dl, VT);
28258    SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
28259    SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
28260    Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
28261    Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
28262    if (Subtarget.hasSSE41())
28263      return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28264
28265    return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
28266                                        DAG.getBitcast(VT, Hi),
28267                                        {0, 2, 4, 6, 8, 10, 12, 14});
28268  }
28269
28270  return SDValue();
28271}
28272
28273static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
28274                          SelectionDAG &DAG) {
28275  MVT VT = Op.getSimpleValueType();
28276  SDLoc dl(Op);
28277  SDValue R = Op.getOperand(0);
28278  SDValue Amt = Op.getOperand(1);
28279  unsigned EltSizeInBits = VT.getScalarSizeInBits();
28280  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28281
28282  unsigned Opc = Op.getOpcode();
28283  unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
28284  unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
28285
28286  assert(VT.isVector() && "Custom lowering only for vector shifts!");
28287  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
28288
28289  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
28290    return V;
28291
28292  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
28293    return V;
28294
28295  if (SupportedVectorVarShift(VT, Subtarget, Opc))
28296    return Op;
28297
28298  // XOP has 128-bit variable logical/arithmetic shifts.
28299  // +ve/-ve Amt = shift left/right.
28300  if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
28301                             VT == MVT::v8i16 || VT == MVT::v16i8)) {
28302    if (Opc == ISD::SRL || Opc == ISD::SRA) {
28303      SDValue Zero = DAG.getConstant(0, dl, VT);
28304      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
28305    }
28306    if (Opc == ISD::SHL || Opc == ISD::SRL)
28307      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
28308    if (Opc == ISD::SRA)
28309      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
28310  }
28311
  // v2i64 vector logical shifts can efficiently avoid scalarization - do the
  // shifts per-lane and then shuffle the partial results back together.
28314  if (VT == MVT::v2i64 && Opc != ISD::SRA) {
28315    // Splat the shift amounts so the scalar shifts above will catch it.
28316    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
28317    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
28318    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
28319    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
28320    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
28321  }
28322
28323  // i64 vector arithmetic shift can be emulated with the transform:
28324  // M = lshr(SIGN_MASK, Amt)
28325  // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
28326  if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
28327      Opc == ISD::SRA) {
28328    SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
28329    SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
28330    R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28331    R = DAG.getNode(ISD::XOR, dl, VT, R, M);
28332    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
28333    return R;
28334  }
28335
28336  // If possible, lower this shift as a sequence of two shifts by
28337  // constant plus a BLENDing shuffle instead of scalarizing it.
28338  // Example:
28339  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
28340  //
28341  // Could be rewritten as:
28342  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
28343  //
28344  // The advantage is that the two shifts from the example would be
28345  // lowered as X86ISD::VSRLI nodes in parallel before blending.
28346  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
28347                      (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28348    SDValue Amt1, Amt2;
28349    unsigned NumElts = VT.getVectorNumElements();
28350    SmallVector<int, 8> ShuffleMask;
28351    for (unsigned i = 0; i != NumElts; ++i) {
28352      SDValue A = Amt->getOperand(i);
28353      if (A.isUndef()) {
28354        ShuffleMask.push_back(SM_SentinelUndef);
28355        continue;
28356      }
28357      if (!Amt1 || Amt1 == A) {
28358        ShuffleMask.push_back(i);
28359        Amt1 = A;
28360        continue;
28361      }
28362      if (!Amt2 || Amt2 == A) {
28363        ShuffleMask.push_back(i + NumElts);
28364        Amt2 = A;
28365        continue;
28366      }
28367      break;
28368    }
28369
28370    // Only perform this blend if we can perform it without loading a mask.
28371    if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
28372        (VT != MVT::v16i16 ||
28373         is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
28374        (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
28375         canWidenShuffleElements(ShuffleMask))) {
28376      auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
28377      auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
28378      if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
28379          Cst2->getAPIntValue().ult(EltSizeInBits)) {
28380        SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28381                                                    Cst1->getZExtValue(), DAG);
28382        SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28383                                                    Cst2->getZExtValue(), DAG);
28384        return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
28385      }
28386    }
28387  }
28388
28389  // If possible, lower this packed shift into a vector multiply instead of
28390  // expanding it into a sequence of scalar shifts.
28391  if (Opc == ISD::SHL)
28392    if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
28393      return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
28394
28395  // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
28396  // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
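  // (mulhu(x, 1 << (16 - Amt)) == srl(x, Amt) for 1 <= Amt <= 15; the
  // Amt == 0 lanes would need a 2^16 scale, so they are handled by the
  // select on ZAmt below.)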
28397  if (Opc == ISD::SRL && ConstantAmt &&
28398      (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28399    SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28400    SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28401    if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28402      SDValue Zero = DAG.getConstant(0, dl, VT);
28403      SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
28404      SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
28405      return DAG.getSelect(dl, VT, ZAmt, R, Res);
28406    }
28407  }
28408
28409  // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
28410  // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
28411  // TODO: Special case handling for shift by 0/1, really we can afford either
28412  // of these cases in pre-SSE41/XOP/AVX512 but not both.
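  // (mulhs(x, 1 << (16 - Amt)) == sra(x, Amt) only for 2 <= Amt <= 15: an
  // Amt of 1 would need a 0x8000 scale, which MULHS treats as negative, and
  // an Amt of 0 would need 2^16, hence the two selects below.)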
28413  if (Opc == ISD::SRA && ConstantAmt &&
28414      (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
28415      ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
28416        !Subtarget.hasAVX512()) ||
28417       DAG.isKnownNeverZero(Amt))) {
28418    SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28419    SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28420    if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28421      SDValue Amt0 =
28422          DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
28423      SDValue Amt1 =
28424          DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
28425      SDValue Sra1 =
28426          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
28427      SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
28428      Res = DAG.getSelect(dl, VT, Amt0, R, Res);
28429      return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
28430    }
28431  }
28432
  // v4i32 non-uniform shifts.
28434  // If the shift amount is constant we can shift each lane using the SSE2
28435  // immediate shifts, else we need to zero-extend each lane to the lower i64
28436  // and shift using the SSE2 variable shifts.
28437  // The separate results can then be blended together.
28438  if (VT == MVT::v4i32) {
28439    SDValue Amt0, Amt1, Amt2, Amt3;
28440    if (ConstantAmt) {
28441      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
28442      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
28443      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
28444      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
28445    } else {
28446      // The SSE2 shifts use the lower i64 as the same shift amount for
28447      // all lanes and the upper i64 is ignored. On AVX we're better off
      // just zero-extending, but for SSE just duplicating the top 16 bits is
28449      // cheaper and has the same effect for out of range values.
28450      if (Subtarget.hasAVX()) {
28451        SDValue Z = DAG.getConstant(0, dl, VT);
28452        Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
28453        Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
28454        Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
28455        Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
28456      } else {
28457        SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
28458        SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28459                                             {4, 5, 6, 7, -1, -1, -1, -1});
28460        Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28461                                    {0, 1, 1, 1, -1, -1, -1, -1});
28462        Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28463                                    {2, 3, 3, 3, -1, -1, -1, -1});
28464        Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28465                                    {0, 1, 1, 1, -1, -1, -1, -1});
28466        Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28467                                    {2, 3, 3, 3, -1, -1, -1, -1});
28468      }
28469    }
28470
28471    unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
28472    SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
28473    SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
28474    SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
28475    SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
28476
28477    // Merge the shifted lane results optimally with/without PBLENDW.
28478    // TODO - ideally shuffle combining would handle this.
28479    if (Subtarget.hasSSE41()) {
28480      SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
28481      SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
28482      return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
28483    }
28484    SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
28485    SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
28486    return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
28487  }
28488
28489  // It's worth extending once and using the vXi16/vXi32 shifts for smaller
28490  // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
28491  // make the existing SSE solution better.
  // NOTE: We honor the preferred vector width before promoting to 512 bits.
28493  if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
28494      (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
28495      (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
28496      (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
28497      (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
28498    assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
28499           "Unexpected vector type");
28500    MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
28501    MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
28502    unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28503    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
28504    Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
28505    return DAG.getNode(ISD::TRUNCATE, dl, VT,
28506                       DAG.getNode(Opc, dl, ExtVT, R, Amt));
28507  }
28508
28509  // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
28510  // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
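  // The per-element scale is 1 << (8 - Amt): multiplying the sign/zero
  // extended byte by it shifts the byte up by (8 - Amt), and the logical
  // shift right by 8 afterwards leaves exactly sra/srl(x, Amt) in the low
  // byte of each i16 lane.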
28511  if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
28512      (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28513       (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28514      !Subtarget.hasXOP()) {
28515    int NumElts = VT.getVectorNumElements();
28516    SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
28517
28518    // Extend constant shift amount to vXi16 (it doesn't matter if the type
28519    // isn't legal).
28520    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28521    Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
28522    Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
28523    Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
28524    assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
28525           "Constant build vector expected");
28526
28527    if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
28528      R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
28529                          : DAG.getZExtOrTrunc(R, dl, ExVT);
28530      R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
28531      R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
28532      return DAG.getZExtOrTrunc(R, dl, VT);
28533    }
28534
28535    SmallVector<SDValue, 16> LoAmt, HiAmt;
28536    for (int i = 0; i != NumElts; i += 16) {
28537      for (int j = 0; j != 8; ++j) {
28538        LoAmt.push_back(Amt.getOperand(i + j));
28539        HiAmt.push_back(Amt.getOperand(i + j + 8));
28540      }
28541    }
28542
28543    MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
28544    SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
28545    SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
28546
28547    SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
28548    SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
28549    LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
28550    HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
28551    LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
28552    HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
28553    LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
28554    HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
28555    return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
28556  }
28557
28558  if (VT == MVT::v16i8 ||
28559      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
28560      (VT == MVT::v64i8 && Subtarget.hasBWI())) {
28561    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
28562
28563    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
28564      if (VT.is512BitVector()) {
28565        // On AVX512BW targets we make use of the fact that VSELECT lowers
28566        // to a masked blend which selects bytes based just on the sign bit
28567        // extracted to a mask.
28568        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
28569        V0 = DAG.getBitcast(VT, V0);
28570        V1 = DAG.getBitcast(VT, V1);
28571        Sel = DAG.getBitcast(VT, Sel);
28572        Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
28573                           ISD::SETGT);
28574        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
28575      } else if (Subtarget.hasSSE41()) {
28576        // On SSE41 targets we can use PBLENDVB which selects bytes based just
28577        // on the sign bit.
28578        V0 = DAG.getBitcast(VT, V0);
28579        V1 = DAG.getBitcast(VT, V1);
28580        Sel = DAG.getBitcast(VT, Sel);
28581        return DAG.getBitcast(SelVT,
28582                              DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
28583      }
28584      // On pre-SSE41 targets we test for the sign bit by comparing to
28585      // zero - a negative value will set all bits of the lanes to true
28586      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
28587      SDValue Z = DAG.getConstant(0, dl, SelVT);
28588      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
28589      return DAG.getSelect(dl, SelVT, C, V0, V1);
28590    };
28591
28592    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
28593    // We can safely do this using i16 shifts as we're only interested in
28594    // the 3 lower bits of each byte.
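    // After the shift, bit 2 of each shift amount sits in the byte's sign
    // bit; the three select steps below test it and apply shifts of 4, 2 and
    // 1, with each 'a += a' moving the next lower amount bit into the sign
    // position.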
28595    Amt = DAG.getBitcast(ExtVT, Amt);
28596    Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
28597    Amt = DAG.getBitcast(VT, Amt);
28598
28599    if (Opc == ISD::SHL || Opc == ISD::SRL) {
28600      // r = VSELECT(r, shift(r, 4), a);
28601      SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
28602      R = SignBitSelect(VT, Amt, M, R);
28603
28604      // a += a
28605      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28606
28607      // r = VSELECT(r, shift(r, 2), a);
28608      M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
28609      R = SignBitSelect(VT, Amt, M, R);
28610
28611      // a += a
28612      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28613
28614      // return VSELECT(r, shift(r, 1), a);
28615      M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
28616      R = SignBitSelect(VT, Amt, M, R);
28617      return R;
28618    }
28619
28620    if (Opc == ISD::SRA) {
28621      // For SRA we need to unpack each byte to the higher byte of a i16 vector
28622      // so we can correctly sign extend. We don't care what happens to the
28623      // lower byte.
28624      SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28625      SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28626      SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
28627      SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
28628      ALo = DAG.getBitcast(ExtVT, ALo);
28629      AHi = DAG.getBitcast(ExtVT, AHi);
28630      RLo = DAG.getBitcast(ExtVT, RLo);
28631      RHi = DAG.getBitcast(ExtVT, RHi);
28632
28633      // r = VSELECT(r, shift(r, 4), a);
28634      SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
28635      SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
28636      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28637      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28638
28639      // a += a
28640      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28641      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28642
28643      // r = VSELECT(r, shift(r, 2), a);
28644      MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
28645      MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
28646      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28647      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28648
28649      // a += a
28650      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28651      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28652
28653      // r = VSELECT(r, shift(r, 1), a);
28654      MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
28655      MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
28656      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28657      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28658
28659      // Logical shift the result back to the lower byte, leaving a zero upper
28660      // byte meaning that we can safely pack with PACKUSWB.
28661      RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
28662      RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
28663      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
28664    }
28665  }
28666
28667  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
28668    MVT ExtVT = MVT::v8i32;
28669    SDValue Z = DAG.getConstant(0, dl, VT);
28670    SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
28671    SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
28672    SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
28673    SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
28674    ALo = DAG.getBitcast(ExtVT, ALo);
28675    AHi = DAG.getBitcast(ExtVT, AHi);
28676    RLo = DAG.getBitcast(ExtVT, RLo);
28677    RHi = DAG.getBitcast(ExtVT, RHi);
28678    SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
28679    SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
28680    Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
28681    Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
28682    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28683  }
28684
28685  if (VT == MVT::v8i16) {
28686    // If we have a constant shift amount, the non-SSE41 path is best as
28687    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
28688    bool UseSSE41 = Subtarget.hasSSE41() &&
28689                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28690
28691    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
28692      // On SSE41 targets we can use PBLENDVB which selects bytes based just on
28693      // the sign bit.
28694      if (UseSSE41) {
28695        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
28696        V0 = DAG.getBitcast(ExtVT, V0);
28697        V1 = DAG.getBitcast(ExtVT, V1);
28698        Sel = DAG.getBitcast(ExtVT, Sel);
28699        return DAG.getBitcast(
28700            VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
28701      }
28702      // On pre-SSE41 targets we splat the sign bit - a negative value will
28703      // set all bits of the lanes to true and VSELECT uses that in
28704      // its OR(AND(V0,C),AND(V1,~C)) lowering.
28705      SDValue C =
28706          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
28707      return DAG.getSelect(dl, VT, C, V0, V1);
28708    };
28709
28710    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
28711    if (UseSSE41) {
28712      // On SSE41 targets we need to replicate the shift mask in both
28713      // bytes for PBLENDVB.
28714      Amt = DAG.getNode(
28715          ISD::OR, dl, VT,
28716          getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
28717          getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
28718    } else {
28719      Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
28720    }
28721
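    // For example, an amount of 9 (0b1001) becomes 0x9000 on the pre-SSE41
    // path (the SSE41 path also replicates it into the low byte for PBLENDVB):
    // the i16 sign bit selects the shift-by-8 stage, and each 'Amt += Amt'
    // moves amount bits 2, 1 and 0 into the sign bit to gate the shift-by-4,
    // shift-by-2 and shift-by-1 stages.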
28722    // r = VSELECT(r, shift(r, 8), a);
28723    SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
28724    R = SignBitSelect(Amt, M, R);
28725
28726    // a += a
28727    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28728
28729    // r = VSELECT(r, shift(r, 4), a);
28730    M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
28731    R = SignBitSelect(Amt, M, R);
28732
28733    // a += a
28734    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28735
28736    // r = VSELECT(r, shift(r, 2), a);
28737    M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
28738    R = SignBitSelect(Amt, M, R);
28739
28740    // a += a
28741    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28742
28743    // return VSELECT(r, shift(r, 1), a);
28744    M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
28745    R = SignBitSelect(Amt, M, R);
28746    return R;
28747  }
28748
28749  // Decompose 256-bit shifts into 128-bit shifts.
28750  if (VT.is256BitVector())
28751    return splitVectorIntBinary(Op, DAG);
28752
28753  if (VT == MVT::v32i16 || VT == MVT::v64i8)
28754    return splitVectorIntBinary(Op, DAG);
28755
28756  return SDValue();
28757}
28758
28759static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
28760                           SelectionDAG &DAG) {
28761  MVT VT = Op.getSimpleValueType();
28762  assert(VT.isVector() && "Custom lowering only for vector rotates!");
28763
28764  SDLoc DL(Op);
28765  SDValue R = Op.getOperand(0);
28766  SDValue Amt = Op.getOperand(1);
28767  unsigned Opcode = Op.getOpcode();
28768  unsigned EltSizeInBits = VT.getScalarSizeInBits();
28769  int NumElts = VT.getVectorNumElements();
28770
28771  // Check for constant splat rotation amount.
28772  APInt CstSplatValue;
28773  bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
28774
28775  // Check for splat rotate by zero.
28776  if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
28777    return R;
28778
28779  // AVX512 implicitly uses modulo rotation amounts.
28780  if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
28781    // Attempt to rotate by immediate.
28782    if (IsCstSplat) {
28783      unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
28784      uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28785      return DAG.getNode(RotOpc, DL, VT, R,
28786                         DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28787    }
28788
28789    // Else, fall-back on VPROLV/VPRORV.
28790    return Op;
28791  }
28792
28793  // AVX512 VBMI2 vXi16 - lower to funnel shifts.
28794  if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
28795    unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
28796    return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
28797  }
28798
28799  assert((Opcode == ISD::ROTL) && "Only ROTL supported");
28800
28801  // XOP has 128-bit vector variable + immediate rotates.
28802  // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
28803  // XOP implicitly uses modulo rotation amounts.
28804  if (Subtarget.hasXOP()) {
28805    if (VT.is256BitVector())
28806      return splitVectorIntBinary(Op, DAG);
28807    assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
28808
28809    // Attempt to rotate by immediate.
28810    if (IsCstSplat) {
28811      uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28812      return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
28813                         DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28814    }
28815
28816    // Use general rotate by variable (per-element).
28817    return Op;
28818  }
28819
28820  // Split 256-bit integers on pre-AVX2 targets.
28821  if (VT.is256BitVector() && !Subtarget.hasAVX2())
28822    return splitVectorIntBinary(Op, DAG);
28823
28824  assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
28825          ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
28826            VT == MVT::v32i16) &&
28827           Subtarget.hasAVX2())) &&
28828         "Only vXi32/vXi16/vXi8 vector rotates supported");
28829
28830  // Rotate by a uniform constant - expand back to shifts.
28831  if (IsCstSplat)
28832    return SDValue();
28833
28834  bool IsSplatAmt = DAG.isSplatValue(Amt);
28835
28836  // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
28837  // the amount bit.
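  // For example, rotl(x, 5) is performed as rot4 followed by rot1, with bits
  // 2, 1 and 0 of the amount selecting whether each stage is applied.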
28838  if (EltSizeInBits == 8 && !IsSplatAmt) {
28839    if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
28840      return SDValue();
28841
28842    // We don't need ModuloAmt here as we just peek at individual bits.
28843    MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28844
28845    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
28846      if (Subtarget.hasSSE41()) {
28847        // On SSE41 targets we can use PBLENDVB which selects bytes based just
28848        // on the sign bit.
28849        V0 = DAG.getBitcast(VT, V0);
28850        V1 = DAG.getBitcast(VT, V1);
28851        Sel = DAG.getBitcast(VT, Sel);
28852        return DAG.getBitcast(SelVT,
28853                              DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
28854      }
28855      // On pre-SSE41 targets we test for the sign bit by comparing to
28856      // zero - a negative value will set all bits of the lanes to true
28857      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
28858      SDValue Z = DAG.getConstant(0, DL, SelVT);
28859      SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
28860      return DAG.getSelect(DL, SelVT, C, V0, V1);
28861    };
28862
28863    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
28864    // We can safely do this using i16 shifts as we're only interested in
28865    // the 3 lower bits of each byte.
28866    Amt = DAG.getBitcast(ExtVT, Amt);
28867    Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
28868    Amt = DAG.getBitcast(VT, Amt);
28869
28870    // r = VSELECT(r, rot(r, 4), a);
28871    SDValue M;
28872    M = DAG.getNode(
28873        ISD::OR, DL, VT,
28874        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
28875        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
28876    R = SignBitSelect(VT, Amt, M, R);
28877
28878    // a += a
28879    Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
28880
28881    // r = VSELECT(r, rot(r, 2), a);
28882    M = DAG.getNode(
28883        ISD::OR, DL, VT,
28884        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
28885        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
28886    R = SignBitSelect(VT, Amt, M, R);
28887
28888    // a += a
28889    Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
28890
28891    // return VSELECT(r, rot(r, 1), a);
28892    M = DAG.getNode(
28893        ISD::OR, DL, VT,
28894        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
28895        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
28896    return SignBitSelect(VT, Amt, M, R);
28897  }
28898
28899  // ISD::ROT* uses modulo rotate amounts.
28900  Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
28901                    DAG.getConstant(EltSizeInBits - 1, DL, VT));
28902
28903  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28904  bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
28905                        SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
28906
28907  // Fall back for splats + all supported variable shifts.
28908  // Fall back for non-constant AVX2 vXi16 as well.
28909  if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
28910    SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
28911    AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
28912    SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
28913    SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
28914    return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
28915  }
28916
28917  // As with shifts, convert the rotation amount to a multiplication factor.
28918  SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
28919  assert(Scale && "Failed to convert ROTL amount to scale");
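  // With Scale == (1 << Amt) per element, the double-width product R * Scale
  // holds (R << Amt) in its low half and (R >> (EltSizeInBits - Amt)) in its
  // high half, so OR'ing the two halves yields the rotate.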
28920
28921  // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
28922  if (EltSizeInBits == 16) {
28923    SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
28924    SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
28925    return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
28926  }
28927
28928  // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
28929  // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
28930  // that can then be OR'd with the lower 32-bits.
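  // PMULUDQ only multiplies the even (0 and 2) i32 lanes, so the odd lanes are
  // shuffled into even positions ({1,-1,3,-1}) for a second PMULUDQ, and the
  // final shuffles ({0,4,2,6} and {1,5,3,7}) gather the low and high halves of
  // the four i64 products back into v4i32 order before the OR.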
28931  assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
28932  static const int OddMask[] = {1, -1, 3, -1};
28933  SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
28934  SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
28935
28936  SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
28937                              DAG.getBitcast(MVT::v2i64, R),
28938                              DAG.getBitcast(MVT::v2i64, Scale));
28939  SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
28940                              DAG.getBitcast(MVT::v2i64, R13),
28941                              DAG.getBitcast(MVT::v2i64, Scale13));
28942  Res02 = DAG.getBitcast(VT, Res02);
28943  Res13 = DAG.getBitcast(VT, Res13);
28944
28945  return DAG.getNode(ISD::OR, DL, VT,
28946                     DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
28947                     DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
28948}
28949
28950/// Returns true if the operand type is exactly twice the native width, and
28951/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
28952/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
28953/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
28954bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
28955  unsigned OpWidth = MemType->getPrimitiveSizeInBits();
28956
28957  if (OpWidth == 64)
28958    return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
28959  if (OpWidth == 128)
28960    return Subtarget.hasCmpxchg16b();
28961
28962  return false;
28963}
28964
28965bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
28966  Type *MemType = SI->getValueOperand()->getType();
28967
28968  bool NoImplicitFloatOps =
28969      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
28970  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
28971      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
28972      (Subtarget.hasSSE1() || Subtarget.hasX87()))
28973    return false;
28974
28975  return needsCmpXchgNb(MemType);
28976}
28977
28978// Note: this turns large loads into lock cmpxchg8b/16b.
28979// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
28980TargetLowering::AtomicExpansionKind
28981X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
28982  Type *MemType = LI->getType();
28983
28984  // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled,
28985  // we can use movq to do the load. If we have X87 we can load into an 80-bit
28986  // X87 register and store it to a stack temporary.
28987  bool NoImplicitFloatOps =
28988      LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
28989  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
28990      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
28991      (Subtarget.hasSSE1() || Subtarget.hasX87()))
28992    return AtomicExpansionKind::None;
28993
28994  return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
28995                                 : AtomicExpansionKind::None;
28996}
28997
28998TargetLowering::AtomicExpansionKind
28999X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
29000  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29001  Type *MemType = AI->getType();
29002
29003  // If the operand is too big, we must see if cmpxchg8/16b is available
29004  // and default to library calls otherwise.
29005  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
29006    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29007                                   : AtomicExpansionKind::None;
29008  }
29009
29010  AtomicRMWInst::BinOp Op = AI->getOperation();
29011  switch (Op) {
29012  default:
29013    llvm_unreachable("Unknown atomic operation");
29014  case AtomicRMWInst::Xchg:
29015  case AtomicRMWInst::Add:
29016  case AtomicRMWInst::Sub:
29017    // It's better to use xadd, xchg or a locked sub for these in all cases.
29018    return AtomicExpansionKind::None;
29019  case AtomicRMWInst::Or:
29020  case AtomicRMWInst::And:
29021  case AtomicRMWInst::Xor:
29022    // If the atomicrmw's result isn't actually used, we can just add a "lock"
29023    // prefix to a normal instruction for these operations.
29024    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
29025                            : AtomicExpansionKind::None;
29026  case AtomicRMWInst::Nand:
29027  case AtomicRMWInst::Max:
29028  case AtomicRMWInst::Min:
29029  case AtomicRMWInst::UMax:
29030  case AtomicRMWInst::UMin:
29031  case AtomicRMWInst::FAdd:
29032  case AtomicRMWInst::FSub:
29033    // These always require a non-trivial set of data operations on x86. We must
29034    // use a cmpxchg loop.
29035    return AtomicExpansionKind::CmpXChg;
29036  }
29037}
29038
29039LoadInst *
29040X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
29041  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29042  Type *MemType = AI->getType();
29043  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
29044  // there is no benefit in turning such RMWs into loads, and it is actually
29045  // harmful as it introduces an mfence.
29046  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
29047    return nullptr;
29048
29049  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
29050  // lowering available in lowerAtomicArith.
29051  // TODO: push more cases through this path.
29052  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
29053    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
29054        AI->use_empty())
29055      return nullptr;
29056
29057  IRBuilder<> Builder(AI);
29058  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29059  auto SSID = AI->getSyncScopeID();
29060  // We must restrict the ordering to avoid generating loads with Release or
29061  // ReleaseAcquire orderings.
29062  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
29063
29064  // Before the load we need a fence. Here is an example lifted from
29065  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
29066  // is required:
29067  // Thread 0:
29068  //   x.store(1, relaxed);
29069  //   r1 = y.fetch_add(0, release);
29070  // Thread 1:
29071  //   y.fetch_add(42, acquire);
29072  //   r2 = x.load(relaxed);
29073  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
29074  // lowered to just a load without a fence. An mfence flushes the store buffer,
29075  // making the optimization clearly correct.
29076  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
29077  // otherwise; we might be able to be more aggressive on relaxed idempotent
29078  // rmw. In practice, they do not look useful, so we don't try to be
29079  // especially clever.
29080  if (SSID == SyncScope::SingleThread)
29081    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
29082    // the IR level, so we must wrap it in an intrinsic.
29083    return nullptr;
29084
29085  if (!Subtarget.hasMFence())
29086    // FIXME: it might make sense to use a locked operation here but on a
29087    // different cache-line to prevent cache-line bouncing. In practice it
29088    // is probably a small win, and x86 processors without mfence are rare
29089    // enough that we do not bother.
29090    return nullptr;
29091
29092  Function *MFence =
29093      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
29094  Builder.CreateCall(MFence, {});
29095
29096  // Finally we can emit the atomic load.
29097  LoadInst *Loaded = Builder.CreateAlignedLoad(
29098      AI->getType(), AI->getPointerOperand(), AI->getAlign());
29099  Loaded->setAtomic(Order, SSID);
29100  AI->replaceAllUsesWith(Loaded);
29101  AI->eraseFromParent();
29102  return Loaded;
29103}
29104
29105bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
29106  if (!SI.isUnordered())
29107    return false;
29108  return ExperimentalUnorderedISEL;
29109}
29110bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
29111  if (!LI.isUnordered())
29112    return false;
29113  return ExperimentalUnorderedISEL;
29114}
29115
29116
29117/// Emit a locked operation on a stack location which does not change any
29118/// memory location, but does involve a lock prefix.  Location is chosen to be
29119/// a) very likely accessed only by a single thread to minimize cache traffic,
29120/// and b) definitely dereferenceable.  Returns the new Chain result.
29121static SDValue emitLockedStackOp(SelectionDAG &DAG,
29122                                 const X86Subtarget &Subtarget, SDValue Chain,
29123                                 const SDLoc &DL) {
29124  // Implementation notes:
29125  // 1) LOCK prefix creates a full read/write reordering barrier for memory
29126  // operations issued by the current processor.  As such, the location
29127  // referenced is not relevant for the ordering properties of the instruction.
29128  // See: Intel 64 and IA-32 Architectures Software Developer's Manual,
29129  // 8.2.3.9  Loads and Stores Are Not Reordered with Locked Instructions
29130  // 2) Using an immediate operand appears to be the best encoding choice
29131  // here since it doesn't require an extra register.
29132  // 3) OR appears to be very slightly faster than ADD. (Though, the difference
29133  // is small enough it might just be measurement noise.)
29134  // 4) When choosing offsets, there are several contributing factors:
29135  //   a) If there's no redzone, we default to TOS.  (We could allocate a cache
29136  //      line aligned stack object to improve this case.)
29137  //   b) To minimize our chances of introducing a false dependence, we prefer
29138  //      to offset the stack usage from TOS slightly.
29139  //   c) To minimize concerns about cross thread stack usage - in particular,
29140  //      the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
29141  //      captures state in the TOS frame and accesses it from many threads -
29142  //      we want to use an offset such that the offset is in a distinct cache
29143  //      line from the TOS frame.
29144  //
29145  // For a general discussion of the tradeoffs and benchmark results, see:
29146  // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
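  //
  // The node built below is effectively 'lock orl $0, disp(%rsp)' (or %esp in
  // 32-bit mode), with disp == -64 when a 128-byte red zone is available and 0
  // otherwise.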
29147
29148  auto &MF = DAG.getMachineFunction();
29149  auto &TFL = *Subtarget.getFrameLowering();
29150  const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
29151
29152  if (Subtarget.is64Bit()) {
29153    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29154    SDValue Ops[] = {
29155      DAG.getRegister(X86::RSP, MVT::i64),                  // Base
29156      DAG.getTargetConstant(1, DL, MVT::i8),                // Scale
29157      DAG.getRegister(0, MVT::i64),                         // Index
29158      DAG.getTargetConstant(SPOffset, DL, MVT::i32),        // Disp
29159      DAG.getRegister(0, MVT::i16),                         // Segment.
29160      Zero,
29161      Chain};
29162    SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29163                                     MVT::Other, Ops);
29164    return SDValue(Res, 1);
29165  }
29166
29167  SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29168  SDValue Ops[] = {
29169    DAG.getRegister(X86::ESP, MVT::i32),            // Base
29170    DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
29171    DAG.getRegister(0, MVT::i32),                   // Index
29172    DAG.getTargetConstant(SPOffset, DL, MVT::i32),  // Disp
29173    DAG.getRegister(0, MVT::i16),                   // Segment.
29174    Zero,
29175    Chain
29176  };
29177  SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29178                                   MVT::Other, Ops);
29179  return SDValue(Res, 1);
29180}
29181
29182static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
29183                                 SelectionDAG &DAG) {
29184  SDLoc dl(Op);
29185  AtomicOrdering FenceOrdering =
29186      static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
29187  SyncScope::ID FenceSSID =
29188      static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
29189
29190  // The only fence that needs an instruction is a sequentially-consistent
29191  // cross-thread fence.
29192  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
29193      FenceSSID == SyncScope::System) {
29194    if (Subtarget.hasMFence())
29195      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
29196
29197    SDValue Chain = Op.getOperand(0);
29198    return emitLockedStackOp(DAG, Subtarget, Chain, dl);
29199  }
29200
29201  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29202  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
29203}
29204
29205static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
29206                             SelectionDAG &DAG) {
29207  MVT T = Op.getSimpleValueType();
29208  SDLoc DL(Op);
29209  unsigned Reg = 0;
29210  unsigned size = 0;
29211  switch(T.SimpleTy) {
29212  default: llvm_unreachable("Invalid value type!");
29213  case MVT::i8:  Reg = X86::AL;  size = 1; break;
29214  case MVT::i16: Reg = X86::AX;  size = 2; break;
29215  case MVT::i32: Reg = X86::EAX; size = 4; break;
29216  case MVT::i64:
29217    assert(Subtarget.is64Bit() && "Node not type legal!");
29218    Reg = X86::RAX; size = 8;
29219    break;
29220  }
29221  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
29222                                  Op.getOperand(2), SDValue());
29223  SDValue Ops[] = { cpIn.getValue(0),
29224                    Op.getOperand(1),
29225                    Op.getOperand(3),
29226                    DAG.getTargetConstant(size, DL, MVT::i8),
29227                    cpIn.getValue(1) };
29228  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
29229  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
29230  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
29231                                           Ops, T, MMO);
29232
29233  SDValue cpOut =
29234    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
29235  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
29236                                      MVT::i32, cpOut.getValue(2));
29237  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
29238
29239  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29240                     cpOut, Success, EFLAGS.getValue(1));
29241}
29242
29243// Create MOVMSKB, taking into account whether we need to split for AVX1.
29244static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
29245                           const X86Subtarget &Subtarget) {
29246  MVT InVT = V.getSimpleValueType();
29247
29248  if (InVT == MVT::v64i8) {
29249    SDValue Lo, Hi;
29250    std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29251    Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
29252    Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
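    // Combine the two 32-bit masks into an i64. The high half only needs
    // ANY_EXTEND because the shift left by 32 below discards whatever ends up
    // in its upper bits.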
29253    Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
29254    Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
29255    Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
29256                     DAG.getConstant(32, DL, MVT::i8));
29257    return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
29258  }
29259  if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
29260    SDValue Lo, Hi;
29261    std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29262    Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
29263    Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
29264    Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
29265                     DAG.getConstant(16, DL, MVT::i8));
29266    return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
29267  }
29268
29269  return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29270}
29271
29272static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
29273                            SelectionDAG &DAG) {
29274  SDValue Src = Op.getOperand(0);
29275  MVT SrcVT = Src.getSimpleValueType();
29276  MVT DstVT = Op.getSimpleValueType();
29277
29278  // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
29279  // half to v32i1 and concatenating the result.
29280  if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
29281    assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
29282    assert(Subtarget.hasBWI() && "Expected BWI target");
29283    SDLoc dl(Op);
29284    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29285                             DAG.getIntPtrConstant(0, dl));
29286    Lo = DAG.getBitcast(MVT::v32i1, Lo);
29287    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29288                             DAG.getIntPtrConstant(1, dl));
29289    Hi = DAG.getBitcast(MVT::v32i1, Hi);
29290    return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
29291  }
29292
29293  // Use MOVMSK for vector to scalar conversion to prevent scalarization.
29294  if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
29295    assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
29296    MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
29297    SDLoc DL(Op);
29298    SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
29299    V = getPMOVMSKB(DL, V, DAG, Subtarget);
29300    return DAG.getZExtOrTrunc(V, DL, DstVT);
29301  }
29302
29303  assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
29304          SrcVT == MVT::i64) && "Unexpected VT!");
29305
29306  assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29307  if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
29308      !(DstVT == MVT::x86mmx && SrcVT.isVector()))
29309    // This conversion needs to be expanded.
29310    return SDValue();
29311
29312  SDLoc dl(Op);
29313  if (SrcVT.isVector()) {
29314    // Widen the input vector in the case of MVT::v2i32.
29315    // Example: from MVT::v2i32 to MVT::v4i32.
29316    MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
29317                                 SrcVT.getVectorNumElements() * 2);
29318    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
29319                      DAG.getUNDEF(SrcVT));
29320  } else {
29321    assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
29322           "Unexpected source type in LowerBITCAST");
29323    Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
29324  }
29325
29326  MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
29327  Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
29328
29329  if (DstVT == MVT::x86mmx)
29330    return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
29331
29332  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
29333                     DAG.getIntPtrConstant(0, dl));
29334}
29335
29336/// Compute the horizontal sum of bytes in V for the elements of VT.
29337///
29338/// Requires V to be a byte vector and VT to be an integer vector type with
29339/// wider elements than V's type. The width of the elements of VT determines
29340/// how many bytes of V are summed horizontally to produce each element of the
29341/// result.
29342static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
29343                                      const X86Subtarget &Subtarget,
29344                                      SelectionDAG &DAG) {
29345  SDLoc DL(V);
29346  MVT ByteVecVT = V.getSimpleValueType();
29347  MVT EltVT = VT.getVectorElementType();
29348  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
29349         "Expected value to have byte element type.");
29350  assert(EltVT != MVT::i8 &&
29351         "Horizontal byte sum only makes sense for wider elements!");
29352  unsigned VecSize = VT.getSizeInBits();
29353  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
29354
29355  // The PSADBW instruction horizontally adds all bytes and leaves the result
29356  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
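  // For example, for a v16i8 input [b0..b15], PSADBW against zero produces a
  // v2i64 result { b0+...+b7, b8+...+b15 }; when each bi already holds the pop
  // count of byte i this is exactly the v2i64 pop count.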
29357  if (EltVT == MVT::i64) {
29358    SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
29359    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29360    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
29361    return DAG.getBitcast(VT, V);
29362  }
29363
29364  if (EltVT == MVT::i32) {
29365    // We unpack the low half and high half into i32s interleaved with zeros so
29366    // that we can use PSADBW to horizontally sum them. The most useful part of
29367    // this is that it lines up the results of two PSADBW instructions to be
29368    // two v2i64 vectors which concatenated are the 4 population counts. We can
29369    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
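    // Illustration: unpacking a v4i32 {c0,c1,c2,c3} with zeros gives
    // {c0,0,c1,0} and {c2,0,c3,0}, so each i64 lane seen by PSADBW contains the
    // four count bytes of exactly one original element (plus four zero bytes).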
29370    SDValue Zeros = DAG.getConstant(0, DL, VT);
29371    SDValue V32 = DAG.getBitcast(VT, V);
29372    SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
29373    SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
29374
29375    // Do the horizontal sums into two v2i64s.
29376    Zeros = DAG.getConstant(0, DL, ByteVecVT);
29377    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29378    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29379                      DAG.getBitcast(ByteVecVT, Low), Zeros);
29380    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29381                       DAG.getBitcast(ByteVecVT, High), Zeros);
29382
29383    // Merge them together.
29384    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
29385    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
29386                    DAG.getBitcast(ShortVecVT, Low),
29387                    DAG.getBitcast(ShortVecVT, High));
29388
29389    return DAG.getBitcast(VT, V);
29390  }
29391
29392  // The only element type left is i16.
29393  assert(EltVT == MVT::i16 && "Unknown how to handle type");
29394
29395  // To obtain pop count for each i16 element starting from the pop count for
29396  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
29397  // right by 8. It is important to shift as i16s as i8 vector shift isn't
29398  // directly supported.
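  // For example, if an i16 element holds the byte counts {hi: p1, lo: p0}, the
  // i16 shift left by 8 gives {hi: p0, lo: 0}, the byte-wise add gives
  // {hi: p0+p1, lo: p0}, and the final i16 shift right by 8 leaves p0+p1 in the
  // low byte with a zero high byte.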
29399  SDValue ShifterV = DAG.getConstant(8, DL, VT);
29400  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29401  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
29402                  DAG.getBitcast(ByteVecVT, V));
29403  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29404}
29405
29406static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
29407                                        const X86Subtarget &Subtarget,
29408                                        SelectionDAG &DAG) {
29409  MVT VT = Op.getSimpleValueType();
29410  MVT EltVT = VT.getVectorElementType();
29411  int NumElts = VT.getVectorNumElements();
29412  (void)EltVT;
29413  assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
29414
29415  // Implement a lookup table in register by using an algorithm based on:
29416  // http://wm.ite.pl/articles/sse-popcount.html
29417  //
29418  // The general idea is that every lower byte nibble in the input vector is an
29419  // index into an in-register pre-computed pop count table. We then split up
29420  // the input vector into two new ones: (1) a vector with only the
29421  // shifted-right higher nibbles for each byte and (2) a vector with the lower
29422  // nibbles (and masked out higher ones) for each byte. PSHUFB is used
29423  // separately with both to index the in-register table. Next, both are added
29424  // and the result is an i8 vector holding the pop count of each input byte.
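  // For example, the byte 0xB7 splits into the high nibble 0xB (LUT -> 3) and
  // the low nibble 0x7 (LUT -> 3); adding the two lookups gives
  // popcount(0xB7) == 6.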
29425  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
29426                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
29427                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
29428                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
29429
29430  SmallVector<SDValue, 64> LUTVec;
29431  for (int i = 0; i < NumElts; ++i)
29432    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29433  SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
29434  SDValue M0F = DAG.getConstant(0x0F, DL, VT);
29435
29436  // High nibbles
29437  SDValue FourV = DAG.getConstant(4, DL, VT);
29438  SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
29439
29440  // Low nibbles
29441  SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
29442
29443  // The input vector is used as the shuffle mask that index elements into the
29444  // LUT. After counting low and high nibbles, add the vector to obtain the
29445  // final pop count per i8 element.
29446  SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
29447  SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
29448  return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
29449}
29450
29451// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
29452// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
29453static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29454                                SelectionDAG &DAG) {
29455  MVT VT = Op.getSimpleValueType();
29456  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
29457         "Unknown CTPOP type to handle");
29458  SDLoc DL(Op.getNode());
29459  SDValue Op0 = Op.getOperand(0);
29460
29461  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
29462  if (Subtarget.hasVPOPCNTDQ()) {
29463    unsigned NumElems = VT.getVectorNumElements();
29464    assert((VT.getVectorElementType() == MVT::i8 ||
29465            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
29466    if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
29467      MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29468      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
29469      Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
29470      return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
29471    }
29472  }
29473
29474  // Decompose 256-bit ops into smaller 128-bit ops.
29475  if (VT.is256BitVector() && !Subtarget.hasInt256())
29476    return splitVectorIntUnary(Op, DAG);
29477
29478  // Decompose 512-bit ops into smaller 256-bit ops.
29479  if (VT.is512BitVector() && !Subtarget.hasBWI())
29480    return splitVectorIntUnary(Op, DAG);
29481
29482  // For element types greater than i8, do vXi8 pop counts and a bytesum.
29483  if (VT.getScalarType() != MVT::i8) {
29484    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
29485    SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
29486    SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
29487    return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
29488  }
29489
29490  // We can't use the fast LUT approach, so fall back on LegalizeDAG.
29491  if (!Subtarget.hasSSSE3())
29492    return SDValue();
29493
29494  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
29495}
29496
29497static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29498                          SelectionDAG &DAG) {
29499  assert(Op.getSimpleValueType().isVector() &&
29500         "We only do custom lowering for vector population count.");
29501  return LowerVectorCTPOP(Op, Subtarget, DAG);
29502}
29503
29504static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
29505  MVT VT = Op.getSimpleValueType();
29506  SDValue In = Op.getOperand(0);
29507  SDLoc DL(Op);
29508
29509  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
29510  // perform the BITREVERSE.
29511  if (!VT.isVector()) {
29512    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
29513    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
29514    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
29515    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
29516                       DAG.getIntPtrConstant(0, DL));
29517  }
29518
29519  int NumElts = VT.getVectorNumElements();
29520  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
29521
29522  // Decompose 256-bit ops into smaller 128-bit ops.
29523  if (VT.is256BitVector())
29524    return splitVectorIntUnary(Op, DAG);
29525
29526  assert(VT.is128BitVector() &&
29527         "Only 128-bit vector bitreverse lowering supported.");
29528
29529  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
29530  // perform the BSWAP in the shuffle.
29531  // It's best to shuffle using the second operand as this will implicitly allow
29532  // memory folding for multiple vectors.
29533  SmallVector<SDValue, 16> MaskElts;
29534  for (int i = 0; i != NumElts; ++i) {
29535    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
29536      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
29537      int PermuteByte = SourceByte | (2 << 5);
29538      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
29539    }
29540  }
29541
29542  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
29543  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
29544  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
29545                    Res, Mask);
29546  return DAG.getBitcast(VT, Res);
29547}
29548
29549static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
29550                               SelectionDAG &DAG) {
29551  MVT VT = Op.getSimpleValueType();
29552
29553  if (Subtarget.hasXOP() && !VT.is512BitVector())
29554    return LowerBITREVERSE_XOP(Op, DAG);
29555
29556  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
29557
29558  SDValue In = Op.getOperand(0);
29559  SDLoc DL(Op);
29560
29561  assert(VT.getScalarType() == MVT::i8 &&
29562         "Only byte vector BITREVERSE supported");
29563
29564  // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
29565  if (VT == MVT::v64i8 && !Subtarget.hasBWI())
29566    return splitVectorIntUnary(Op, DAG);
29567
29568  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
29569  if (VT == MVT::v32i8 && !Subtarget.hasInt256())
29570    return splitVectorIntUnary(Op, DAG);
29571
29572  unsigned NumElts = VT.getVectorNumElements();
29573
29574  // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
29575  if (Subtarget.hasGFNI()) {
29576    MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29577    SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
29578    Matrix = DAG.getBitcast(VT, Matrix);
29579    return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
29580                       DAG.getTargetConstant(0, DL, MVT::i8));
29581  }
29582
29583  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its two
29584  // nibbles, and a PSHUFB lookup finds the bitreverse of each 0-15 value
29585  // (moved to the other nibble).
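  // For example, the byte 0x1E has LoLUT[0xE] == 0x70 and HiLUT[0x1] == 0x08;
  // OR'ing the two lookups gives 0x78, the bit reverse of 0x1E.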
29586  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
29587  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
29588  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
29589
29590  const int LoLUT[16] = {
29591      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
29592      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
29593      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
29594      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
29595  const int HiLUT[16] = {
29596      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
29597      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
29598      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
29599      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
29600
29601  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
29602  for (unsigned i = 0; i < NumElts; ++i) {
29603    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
29604    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
29605  }
29606
29607  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
29608  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
29609  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
29610  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
29611  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29612}
29613
29614static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
29615                           SelectionDAG &DAG) {
29616  SDLoc DL(Op);
29617  SDValue X = Op.getOperand(0);
29618  MVT VT = Op.getSimpleValueType();
29619
29620  // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
29621  if (VT == MVT::i8 ||
29622      DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
29623    X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29624    SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
29625                                DAG.getConstant(0, DL, MVT::i8));
29626    // Copy the inverse of the parity flag into a register with setcc.
29627    SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29628    // Extend to the original type.
29629    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29630  }
29631
29632  if (VT == MVT::i64) {
29633    // Xor the high and low 32-bits together using a 32-bit operation.
29634    SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
29635                             DAG.getNode(ISD::SRL, DL, MVT::i64, X,
29636                                         DAG.getConstant(32, DL, MVT::i8)));
29637    SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
29638    X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
29639  }
29640
29641  if (VT != MVT::i16) {
29642    // Xor the high and low 16-bits together using a 32-bit operation.
29643    SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
29644                               DAG.getConstant(16, DL, MVT::i8));
29645    X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
29646  } else {
29647    // If the input is 16-bits, we need to extend to use an i32 shift below.
29648    X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
29649  }
29650
29651  // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
29652  // This should allow an h-reg to be used to save a shift.
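  // Parity is preserved by xor-folding: parity(x) == parity(lo ^ hi) for any
  // split of x into two halves, so reducing 64 -> 32 -> 16 -> 8 bits and
  // testing PF of the final 8-bit XOR yields the parity of the original value.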
29653  SDValue Hi = DAG.getNode(
29654      ISD::TRUNCATE, DL, MVT::i8,
29655      DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
29656  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29657  SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
29658  SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
29659
29660  // Copy the inverse of the parity flag into a register with setcc.
29661  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29662  // Extend to the original type.
29663  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29664}
29665
29666static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
29667                                        const X86Subtarget &Subtarget) {
29668  unsigned NewOpc = 0;
29669  switch (N->getOpcode()) {
29670  case ISD::ATOMIC_LOAD_ADD:
29671    NewOpc = X86ISD::LADD;
29672    break;
29673  case ISD::ATOMIC_LOAD_SUB:
29674    NewOpc = X86ISD::LSUB;
29675    break;
29676  case ISD::ATOMIC_LOAD_OR:
29677    NewOpc = X86ISD::LOR;
29678    break;
29679  case ISD::ATOMIC_LOAD_XOR:
29680    NewOpc = X86ISD::LXOR;
29681    break;
29682  case ISD::ATOMIC_LOAD_AND:
29683    NewOpc = X86ISD::LAND;
29684    break;
29685  default:
29686    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
29687  }
29688
29689  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
29690
29691  return DAG.getMemIntrinsicNode(
29692      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
29693      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
29694      /*MemVT=*/N->getSimpleValueType(0), MMO);
29695}
29696
29697/// Lower atomic_load_ops into LOCK-prefixed operations.
29698static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
29699                                const X86Subtarget &Subtarget) {
29700  AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
29701  SDValue Chain = N->getOperand(0);
29702  SDValue LHS = N->getOperand(1);
29703  SDValue RHS = N->getOperand(2);
29704  unsigned Opc = N->getOpcode();
29705  MVT VT = N->getSimpleValueType(0);
29706  SDLoc DL(N);
29707
29708  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
29709  // can only be lowered when the result is unused.  They should have already
29710  // been transformed into a cmpxchg loop in AtomicExpand.
29711  if (N->hasAnyUseOfValue(0)) {
29712    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
29713    // select LXADD if LOCK_SUB can't be selected.
29714    if (Opc == ISD::ATOMIC_LOAD_SUB) {
29715      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
29716      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
29717                           RHS, AN->getMemOperand());
29718    }
29719    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
29720           "Used AtomicRMW ops other than Add should have been expanded!");
29721    return N;
29722  }
29723
29724  // Specialized lowering for the canonical form of an idempotent atomicrmw.
29725  // The core idea here is that since the memory location isn't actually
29726  // changing, all we need is a lowering for the *ordering* impacts of the
29727  // atomicrmw.  As such, we can choose a different operation and memory
29728  // location to minimize impact on other code.
29729  if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
29730    // On X86, the only ordering which actually requires an instruction is
29731    // seq_cst which isn't SingleThread; everything else just needs to be
29732    // preserved during codegen and then dropped. Note that we expect (but don't
29733    // assume) that orderings other than seq_cst and acq_rel have been
29734    // canonicalized to a store or load.
29735    if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
29736        AN->getSyncScopeID() == SyncScope::System) {
29737      // Prefer a locked operation against a stack location to minimize cache
29738      // traffic.  This assumes that stack locations are very likely to be
29739      // accessed only by the owning thread.
29740      SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
29741      assert(!N->hasAnyUseOfValue(0));
29742      // NOTE: The getUNDEF is needed to give something for the unused result 0.
29743      return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29744                         DAG.getUNDEF(VT), NewChain);
29745    }
29746    // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29747    SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
29748    assert(!N->hasAnyUseOfValue(0));
29749    // NOTE: The getUNDEF is needed to give something for the unused result 0.
29750    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29751                       DAG.getUNDEF(VT), NewChain);
29752  }
29753
29754  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
29755  // RAUW the chain, but don't worry about the result, as it's unused.
29756  assert(!N->hasAnyUseOfValue(0));
29757  // NOTE: The getUNDEF is needed to give something for the unused result 0.
29758  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29759                     DAG.getUNDEF(VT), LockOp.getValue(1));
29760}
29761
29762static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
29763                                 const X86Subtarget &Subtarget) {
29764  auto *Node = cast<AtomicSDNode>(Op.getNode());
29765  SDLoc dl(Node);
29766  EVT VT = Node->getMemoryVT();
29767
29768  bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
29769  bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
29770
29771  // If this store is not sequentially consistent and the type is legal
29772  // we can just keep it.
29773  if (!IsSeqCst && IsTypeLegal)
29774    return Op;
29775
29776  if (VT == MVT::i64 && !IsTypeLegal) {
29777    // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
29778    // is enabled.
29779    bool NoImplicitFloatOps =
29780        DAG.getMachineFunction().getFunction().hasFnAttribute(
29781            Attribute::NoImplicitFloat);
29782    if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
29783      SDValue Chain;
29784      if (Subtarget.hasSSE1()) {
29785        SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
29786                                       Node->getOperand(2));
29787        MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
29788        SclToVec = DAG.getBitcast(StVT, SclToVec);
29789        SDVTList Tys = DAG.getVTList(MVT::Other);
29790        SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
29791        Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
29792                                        MVT::i64, Node->getMemOperand());
29793      } else if (Subtarget.hasX87()) {
29794        // First load this into an 80-bit X87 register using a stack temporary.
29795        // This will put the whole integer into the significand.
29796        SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
29797        int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29798        MachinePointerInfo MPI =
29799            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29800        Chain =
29801            DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
29802                         MPI, MaybeAlign(), MachineMemOperand::MOStore);
29803        SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
29804        SDValue LdOps[] = {Chain, StackPtr};
29805        SDValue Value =
29806            DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
29807                                    /*Align*/ None, MachineMemOperand::MOLoad);
29808        Chain = Value.getValue(1);
29809
29810        // Now use an FIST to do the atomic store.
29811        SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
29812        Chain =
29813            DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
29814                                    StoreOps, MVT::i64, Node->getMemOperand());
29815      }
29816
29817      if (Chain) {
29818        // If this is a sequentially consistent store, also emit an appropriate
29819        // barrier.
29820        if (IsSeqCst)
29821          Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
29822
29823        return Chain;
29824      }
29825    }
29826  }
29827
29828  // Convert seq_cst store -> xchg
29829  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
29830  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
29831  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
29832                               Node->getMemoryVT(),
29833                               Node->getOperand(0),
29834                               Node->getOperand(1), Node->getOperand(2),
29835                               Node->getMemOperand());
29836  return Swap.getValue(1);
29837}
29838
29839static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
29840  SDNode *N = Op.getNode();
29841  MVT VT = N->getSimpleValueType(0);
29842  unsigned Opc = Op.getOpcode();
29843
29844  // Let legalize expand this if it isn't a legal type yet.
29845  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
29846    return SDValue();
29847
29848  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29849  SDLoc DL(N);
29850
29851  // Set the carry flag.
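  // Adding all-ones (i.e. -1) to the incoming carry operand produces a
  // carry-out in EFLAGS exactly when the operand is non-zero, which is what the
  // ADC/SBB below consumes.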
29852  SDValue Carry = Op.getOperand(2);
29853  EVT CarryVT = Carry.getValueType();
29854  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
29855                      Carry, DAG.getAllOnesConstant(DL, CarryVT));
29856
29857  bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
29858  SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
29859                            Op.getOperand(0), Op.getOperand(1),
29860                            Carry.getValue(1));
29861
29862  bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
29863  SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
29864                           Sum.getValue(1), DL, DAG);
29865  if (N->getValueType(1) == MVT::i1)
29866    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
29867
29868  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
29869}
29870
29871static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
29872                            SelectionDAG &DAG) {
29873  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
29874
29875  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
29876  // which returns the values as { float, float } (in XMM0) or
29877  // { double, double } (which is returned in XMM0, XMM1).
29878  SDLoc dl(Op);
29879  SDValue Arg = Op.getOperand(0);
29880  EVT ArgVT = Arg.getValueType();
29881  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
29882
29883  TargetLowering::ArgListTy Args;
29884  TargetLowering::ArgListEntry Entry;
29885
29886  Entry.Node = Arg;
29887  Entry.Ty = ArgTy;
29888  Entry.IsSExt = false;
29889  Entry.IsZExt = false;
29890  Args.push_back(Entry);
29891
29892  bool isF64 = ArgVT == MVT::f64;
29893  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
29894  // the small struct {f32, f32} is returned in (eax, edx). For f64,
29895  // the results are returned via SRet in memory.
29896  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29897  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
29898  const char *LibcallName = TLI.getLibcallName(LC);
29899  SDValue Callee =
29900      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
29901
29902  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
29903                      : (Type *)FixedVectorType::get(ArgTy, 4);
29904
29905  TargetLowering::CallLoweringInfo CLI(DAG);
29906  CLI.setDebugLoc(dl)
29907      .setChain(DAG.getEntryNode())
29908      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
29909
29910  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
29911
29912  if (isF64)
29913    // Returned in xmm0 and xmm1.
29914    return CallResult.first;
29915
  // Returned in bits 0:31 and 32:63 of xmm0.
29917  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
29918                               CallResult.first, DAG.getIntPtrConstant(0, dl));
29919  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
29920                               CallResult.first, DAG.getIntPtrConstant(1, dl));
29921  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
29922  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
29923}
29924
29925/// Widen a vector input to a vector of NVT.  The
29926/// input vector must have the same element type as NVT.
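/// For example, widening a v2i32 value to v8i32 inserts it at element 0 and
/// fills the remaining elements with undef, or with zeroes when
/// FillWithZeroes is set.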
29927static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
29928                            bool FillWithZeroes = false) {
29929  // Check if InOp already has the right width.
29930  MVT InVT = InOp.getSimpleValueType();
29931  if (InVT == NVT)
29932    return InOp;
29933
29934  if (InOp.isUndef())
29935    return DAG.getUNDEF(NVT);
29936
29937  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
29938         "input and widen element type must match");
29939
29940  unsigned InNumElts = InVT.getVectorNumElements();
29941  unsigned WidenNumElts = NVT.getVectorNumElements();
29942  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
29943         "Unexpected request for vector widening");
29944
29945  SDLoc dl(InOp);
29946  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
29947      InOp.getNumOperands() == 2) {
29948    SDValue N1 = InOp.getOperand(1);
29949    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
29950        N1.isUndef()) {
29951      InOp = InOp.getOperand(0);
29952      InVT = InOp.getSimpleValueType();
29953      InNumElts = InVT.getVectorNumElements();
29954    }
29955  }
29956  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
29957      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
29958    SmallVector<SDValue, 16> Ops;
29959    for (unsigned i = 0; i < InNumElts; ++i)
29960      Ops.push_back(InOp.getOperand(i));
29961
29962    EVT EltVT = InOp.getOperand(0).getValueType();
29963
29964    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
29965      DAG.getUNDEF(EltVT);
29966    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
29967      Ops.push_back(FillVal);
29968    return DAG.getBuildVector(NVT, dl, Ops);
29969  }
29970  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
29971    DAG.getUNDEF(NVT);
29972  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
29973                     InOp, DAG.getIntPtrConstant(0, dl));
29974}
29975
29976static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
29977                             SelectionDAG &DAG) {
29978  assert(Subtarget.hasAVX512() &&
29979         "MGATHER/MSCATTER are supported on AVX-512 arch only");
29980
29981  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
29982  SDValue Src = N->getValue();
29983  MVT VT = Src.getSimpleValueType();
29984  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
29985  SDLoc dl(Op);
29986
29987  SDValue Scale = N->getScale();
29988  SDValue Index = N->getIndex();
29989  SDValue Mask = N->getMask();
29990  SDValue Chain = N->getChain();
29991  SDValue BasePtr = N->getBasePtr();
29992
29993  if (VT == MVT::v2f32 || VT == MVT::v2i32) {
29994    assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
29995    // If the index is v2i64 and we have VLX we can use xmm for data and index.
29996    if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
29997      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29998      EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
29999      Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
30000      SDVTList VTs = DAG.getVTList(MVT::Other);
30001      SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30002      return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30003                                     N->getMemoryVT(), N->getMemOperand());
30004    }
30005    return SDValue();
30006  }
30007
30008  MVT IndexVT = Index.getSimpleValueType();
30009
30010  // If the index is v2i32, we're being called by type legalization and we
30011  // should just let the default handling take care of it.
30012  if (IndexVT == MVT::v2i32)
30013    return SDValue();
30014
  // If we don't have VLX and neither the source data nor the index is 512
  // bits, we need to widen until one is.
30017  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
30018      !Index.getSimpleValueType().is512BitVector()) {
30019    // Determine how much we need to widen by to get a 512-bit type.
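    // For example, v4i32 data with a v4i64 index widens by min(4, 2) = 2,
    // giving v8i32 data, a v8i64 index and a v8i1 mask.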
30020    unsigned Factor = std::min(512/VT.getSizeInBits(),
30021                               512/IndexVT.getSizeInBits());
30022    unsigned NumElts = VT.getVectorNumElements() * Factor;
30023
30024    VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30025    IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30026    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30027
30028    Src = ExtendToType(Src, VT, DAG);
30029    Index = ExtendToType(Index, IndexVT, DAG);
30030    Mask = ExtendToType(Mask, MaskVT, DAG, true);
30031  }
30032
30033  SDVTList VTs = DAG.getVTList(MVT::Other);
30034  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30035  return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30036                                 N->getMemoryVT(), N->getMemOperand());
30037}
30038
30039static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
30040                          SelectionDAG &DAG) {
30041
30042  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
30043  MVT VT = Op.getSimpleValueType();
30044  MVT ScalarVT = VT.getScalarType();
30045  SDValue Mask = N->getMask();
30046  MVT MaskVT = Mask.getSimpleValueType();
30047  SDValue PassThru = N->getPassThru();
30048  SDLoc dl(Op);
30049
  // Handle AVX masked loads, which don't support a passthru other than 0.
30051  if (MaskVT.getVectorElementType() != MVT::i1) {
30052    // We also allow undef in the isel pattern.
30053    if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
30054      return Op;
30055
30056    SDValue NewLoad = DAG.getMaskedLoad(
30057        VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30058        getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
30059        N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
30060        N->isExpandingLoad());
30061    // Emit a blend.
30062    SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
30063    return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
30064  }
30065
30066  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
30067         "Expanding masked load is supported on AVX-512 target only!");
30068
30069  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
30070         "Expanding masked load is supported for 32 and 64-bit types only!");
30071
30072  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30073         "Cannot lower masked load op.");
30074
30075  assert((ScalarVT.getSizeInBits() >= 32 ||
30076          (Subtarget.hasBWI() &&
30077              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30078         "Unsupported masked load op.");
30079
  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
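  // For example, a v8i32 masked load becomes a v16i32 masked load whose extra
  // mask bits are zero, and the original v8i32 result is then extracted from
  // the low half of the wide result.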
30082  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
30083  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30084  PassThru = ExtendToType(PassThru, WideDataVT, DAG);
30085
30086  // Mask element has to be i1.
30087  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30088         "Unexpected mask type");
30089
30090  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30091
30092  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30093  SDValue NewLoad = DAG.getMaskedLoad(
30094      WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30095      PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
30096      N->getExtensionType(), N->isExpandingLoad());
30097
30098  SDValue Extract =
30099      DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
30100                  DAG.getIntPtrConstant(0, dl));
30101  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
30102  return DAG.getMergeValues(RetOps, dl);
30103}
30104
30105static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
30106                           SelectionDAG &DAG) {
30107  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
30108  SDValue DataToStore = N->getValue();
30109  MVT VT = DataToStore.getSimpleValueType();
30110  MVT ScalarVT = VT.getScalarType();
30111  SDValue Mask = N->getMask();
30112  SDLoc dl(Op);
30113
30114  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
30115         "Expanding masked load is supported on AVX-512 target only!");
30116
30117  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
30118         "Expanding masked load is supported for 32 and 64-bit types only!");
30119
30120  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30121         "Cannot lower masked store op.");
30122
30123  assert((ScalarVT.getSizeInBits() >= 32 ||
30124          (Subtarget.hasBWI() &&
30125              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30126          "Unsupported masked store op.");
30127
  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
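  // For example, a v8f32 masked store is widened to a v16f32 masked store
  // whose extra mask bits are zero, so only the original lanes are written.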
30130  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
30131  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30132
30133  // Mask element has to be i1.
30134  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30135         "Unexpected mask type");
30136
30137  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30138
30139  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
30140  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30141  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
30142                            N->getOffset(), Mask, N->getMemoryVT(),
30143                            N->getMemOperand(), N->getAddressingMode(),
30144                            N->isTruncatingStore(), N->isCompressingStore());
30145}
30146
30147static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
30148                            SelectionDAG &DAG) {
30149  assert(Subtarget.hasAVX2() &&
30150         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
30151
30152  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
30153  SDLoc dl(Op);
30154  MVT VT = Op.getSimpleValueType();
30155  SDValue Index = N->getIndex();
30156  SDValue Mask = N->getMask();
30157  SDValue PassThru = N->getPassThru();
30158  MVT IndexVT = Index.getSimpleValueType();
30159
30160  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
30161
30162  // If the index is v2i32, we're being called by type legalization.
30163  if (IndexVT == MVT::v2i32)
30164    return SDValue();
30165
  // If we don't have VLX and neither the passthru nor the index is 512 bits,
  // we need to widen until one is.
30168  MVT OrigVT = VT;
30169  if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30170      !IndexVT.is512BitVector()) {
30171    // Determine how much we need to widen by to get a 512-bit type.
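    // For example, a v8f32 passthru with a v8i32 index widens by a factor of
    // 2 to v16f32 data, a v16i32 index and a v16i1 mask.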
30172    unsigned Factor = std::min(512/VT.getSizeInBits(),
30173                               512/IndexVT.getSizeInBits());
30174
30175    unsigned NumElts = VT.getVectorNumElements() * Factor;
30176
30177    VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30178    IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30179    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30180
30181    PassThru = ExtendToType(PassThru, VT, DAG);
30182    Index = ExtendToType(Index, IndexVT, DAG);
30183    Mask = ExtendToType(Mask, MaskVT, DAG, true);
30184  }
30185
30186  SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
30187                    N->getScale() };
30188  SDValue NewGather = DAG.getMemIntrinsicNode(
30189      X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
30190      N->getMemOperand());
30191  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
30192                                NewGather, DAG.getIntPtrConstant(0, dl));
30193  return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
30194}
30195
30196static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
30197  SDLoc dl(Op);
30198  SDValue Src = Op.getOperand(0);
30199  MVT DstVT = Op.getSimpleValueType();
30200
30201  AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
30202  unsigned SrcAS = N->getSrcAddressSpace();
30203
30204  assert(SrcAS != N->getDestAddressSpace() &&
30205         "addrspacecast must be between different address spaces");
30206
30207  if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
30208    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
30209  } else if (DstVT == MVT::i64) {
30210    Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
30211  } else if (DstVT == MVT::i32) {
30212    Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
30213  } else {
30214    report_fatal_error("Bad address space in addrspacecast");
30215  }
30216  return Op;
30217}
30218
30219SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
30220                                              SelectionDAG &DAG) const {
30221  // TODO: Eventually, the lowering of these nodes should be informed by or
30222  // deferred to the GC strategy for the function in which they appear. For
30223  // now, however, they must be lowered to something. Since they are logically
30224  // no-ops in the case of a null GC strategy (or a GC strategy which does not
30225  // require special handling for these nodes), lower them as literal NOOPs for
30226  // the time being.
30227  SmallVector<SDValue, 2> Ops;
30228
30229  Ops.push_back(Op.getOperand(0));
30230  if (Op->getGluedNode())
30231    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
30232
30233  SDLoc OpDL(Op);
30234  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
30235  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
30236
30237  return NOOP;
30238}
30239
30240// Custom split CVTPS2PH with wide types.
30241static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
30242  SDLoc dl(Op);
30243  EVT VT = Op.getValueType();
30244  SDValue Lo, Hi;
30245  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
30246  EVT LoVT, HiVT;
30247  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30248  SDValue RC = Op.getOperand(1);
30249  Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
30250  Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
30251  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30252}
30253
30254/// Provide custom lowering hooks for some operations.
30255SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
30256  switch (Op.getOpcode()) {
30257  default: llvm_unreachable("Should not custom lower this!");
30258  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
30259  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
30260    return LowerCMP_SWAP(Op, Subtarget, DAG);
30261  case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
30262  case ISD::ATOMIC_LOAD_ADD:
30263  case ISD::ATOMIC_LOAD_SUB:
30264  case ISD::ATOMIC_LOAD_OR:
30265  case ISD::ATOMIC_LOAD_XOR:
30266  case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
30267  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG, Subtarget);
30268  case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
30269  case ISD::PARITY:             return LowerPARITY(Op, Subtarget, DAG);
30270  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
30271  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
30272  case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
30273  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
30274  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
30275  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
30276  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
30277  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
30278  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
30279  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
30280  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
30281  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
30282  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
30283  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
30284  case ISD::SHL_PARTS:
30285  case ISD::SRA_PARTS:
30286  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
30287  case ISD::FSHL:
30288  case ISD::FSHR:               return LowerFunnelShift(Op, Subtarget, DAG);
30289  case ISD::STRICT_SINT_TO_FP:
30290  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
30291  case ISD::STRICT_UINT_TO_FP:
30292  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
30293  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
30294  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
30295  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
30296  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
30297  case ISD::ZERO_EXTEND_VECTOR_INREG:
30298  case ISD::SIGN_EXTEND_VECTOR_INREG:
30299    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
30300  case ISD::FP_TO_SINT:
30301  case ISD::STRICT_FP_TO_SINT:
30302  case ISD::FP_TO_UINT:
30303  case ISD::STRICT_FP_TO_UINT:  return LowerFP_TO_INT(Op, DAG);
30304  case ISD::FP_TO_SINT_SAT:
30305  case ISD::FP_TO_UINT_SAT:     return LowerFP_TO_INT_SAT(Op, DAG);
30306  case ISD::FP_EXTEND:
30307  case ISD::STRICT_FP_EXTEND:   return LowerFP_EXTEND(Op, DAG);
30308  case ISD::FP_ROUND:
30309  case ISD::STRICT_FP_ROUND:    return LowerFP_ROUND(Op, DAG);
30310  case ISD::FP16_TO_FP:
30311  case ISD::STRICT_FP16_TO_FP:  return LowerFP16_TO_FP(Op, DAG);
30312  case ISD::FP_TO_FP16:
30313  case ISD::STRICT_FP_TO_FP16:  return LowerFP_TO_FP16(Op, DAG);
30314  case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
30315  case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
30316  case ISD::FADD:
30317  case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
30318  case ISD::FROUND:             return LowerFROUND(Op, DAG);
30319  case ISD::FABS:
30320  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
30321  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
30322  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
30323  case ISD::LRINT:
30324  case ISD::LLRINT:             return LowerLRINT_LLRINT(Op, DAG);
30325  case ISD::SETCC:
30326  case ISD::STRICT_FSETCC:
30327  case ISD::STRICT_FSETCCS:     return LowerSETCC(Op, DAG);
30328  case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
30329  case ISD::SELECT:             return LowerSELECT(Op, DAG);
30330  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
30331  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
30332  case ISD::VASTART:            return LowerVASTART(Op, DAG);
30333  case ISD::VAARG:              return LowerVAARG(Op, DAG);
30334  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
30335  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
30336  case ISD::INTRINSIC_VOID:
30337  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
30338  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
30339  case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
30340  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
30341  case ISD::FRAME_TO_ARGS_OFFSET:
30342                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
30343  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
30344  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
30345  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
30346  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
30347  case ISD::EH_SJLJ_SETUP_DISPATCH:
30348    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
30349  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
30350  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
30351  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
30352  case ISD::SET_ROUNDING:       return LowerSET_ROUNDING(Op, DAG);
30353  case ISD::CTLZ:
30354  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
30355  case ISD::CTTZ:
30356  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, Subtarget, DAG);
30357  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
30358  case ISD::MULHS:
30359  case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
30360  case ISD::ROTL:
30361  case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
30362  case ISD::SRA:
30363  case ISD::SRL:
30364  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
30365  case ISD::SADDO:
30366  case ISD::UADDO:
30367  case ISD::SSUBO:
30368  case ISD::USUBO:              return LowerXALUO(Op, DAG);
30369  case ISD::SMULO:
30370  case ISD::UMULO:              return LowerMULO(Op, Subtarget, DAG);
30371  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
30372  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
30373  case ISD::SADDO_CARRY:
30374  case ISD::SSUBO_CARRY:
30375  case ISD::ADDCARRY:
30376  case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
30377  case ISD::ADD:
30378  case ISD::SUB:                return lowerAddSub(Op, DAG, Subtarget);
30379  case ISD::UADDSAT:
30380  case ISD::SADDSAT:
30381  case ISD::USUBSAT:
30382  case ISD::SSUBSAT:            return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
30383  case ISD::SMAX:
30384  case ISD::SMIN:
30385  case ISD::UMAX:
30386  case ISD::UMIN:               return LowerMINMAX(Op, DAG);
30387  case ISD::ABS:                return LowerABS(Op, Subtarget, DAG);
30388  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
30389  case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
30390  case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
30391  case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
30392  case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
30393  case ISD::GC_TRANSITION_START:
30394  case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION(Op, DAG);
30395  case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
30396  case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
30397  }
30398}
30399
30400/// Replace a node with an illegal result type with a new node built out of
30401/// custom code.
30402void X86TargetLowering::ReplaceNodeResults(SDNode *N,
30403                                           SmallVectorImpl<SDValue>&Results,
30404                                           SelectionDAG &DAG) const {
30405  SDLoc dl(N);
30406  switch (N->getOpcode()) {
30407  default:
30408#ifndef NDEBUG
30409    dbgs() << "ReplaceNodeResults: ";
30410    N->dump(&DAG);
30411#endif
30412    llvm_unreachable("Do not know how to custom type legalize this operation!");
30413  case X86ISD::CVTPH2PS: {
30414    EVT VT = N->getValueType(0);
30415    SDValue Lo, Hi;
30416    std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
30417    EVT LoVT, HiVT;
30418    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30419    Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
30420    Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
30421    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30422    Results.push_back(Res);
30423    return;
30424  }
30425  case X86ISD::STRICT_CVTPH2PS: {
30426    EVT VT = N->getValueType(0);
30427    SDValue Lo, Hi;
30428    std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
30429    EVT LoVT, HiVT;
30430    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30431    Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
30432                     {N->getOperand(0), Lo});
30433    Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
30434                     {N->getOperand(0), Hi});
30435    SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
30436                                Lo.getValue(1), Hi.getValue(1));
30437    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30438    Results.push_back(Res);
30439    Results.push_back(Chain);
30440    return;
30441  }
30442  case X86ISD::CVTPS2PH:
30443    Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
30444    return;
30445  case ISD::CTPOP: {
30446    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
30447    // Use a v2i64 if possible.
30448    bool NoImplicitFloatOps =
30449        DAG.getMachineFunction().getFunction().hasFnAttribute(
30450            Attribute::NoImplicitFloat);
30451    if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
30452      SDValue Wide =
30453          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
30454      Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
      // The bit count fits in 32 bits; extract it as an i32 and then zero
      // extend to i64. Otherwise we end up extracting bits 63:32 separately.
30457      Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
30458      Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
30459                         DAG.getIntPtrConstant(0, dl));
30460      Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
30461      Results.push_back(Wide);
30462    }
30463    return;
30464  }
30465  case ISD::MUL: {
30466    EVT VT = N->getValueType(0);
30467    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30468           VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
30469    // Pre-promote these to vXi16 to avoid op legalization thinking all 16
30470    // elements are needed.
30471    MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30472    SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
30473    SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
30474    SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
30475    Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
30476    unsigned NumConcats = 16 / VT.getVectorNumElements();
30477    SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30478    ConcatOps[0] = Res;
30479    Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
30480    Results.push_back(Res);
30481    return;
30482  }
30483  case X86ISD::VPMADDWD:
30484  case X86ISD::AVG: {
30485    // Legalize types for X86ISD::AVG/VPMADDWD by widening.
30486    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30487
30488    EVT VT = N->getValueType(0);
30489    EVT InVT = N->getOperand(0).getValueType();
30490    assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
30491           "Expected a VT that divides into 128 bits.");
30492    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30493           "Unexpected type action!");
30494    unsigned NumConcat = 128 / InVT.getSizeInBits();
30495
30496    EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
30497                                    InVT.getVectorElementType(),
30498                                    NumConcat * InVT.getVectorNumElements());
30499    EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
30500                                  VT.getVectorElementType(),
30501                                  NumConcat * VT.getVectorNumElements());
30502
30503    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
30504    Ops[0] = N->getOperand(0);
30505    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30506    Ops[0] = N->getOperand(1);
30507    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30508
30509    SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
30510    Results.push_back(Res);
30511    return;
30512  }
30513  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
30514  case X86ISD::FMINC:
30515  case X86ISD::FMIN:
30516  case X86ISD::FMAXC:
30517  case X86ISD::FMAX: {
30518    EVT VT = N->getValueType(0);
30519    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
30520    SDValue UNDEF = DAG.getUNDEF(VT);
30521    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30522                              N->getOperand(0), UNDEF);
30523    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30524                              N->getOperand(1), UNDEF);
30525    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
30526    return;
30527  }
30528  case ISD::SDIV:
30529  case ISD::UDIV:
30530  case ISD::SREM:
30531  case ISD::UREM: {
30532    EVT VT = N->getValueType(0);
30533    if (VT.isVector()) {
30534      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30535             "Unexpected type action!");
      // If the RHS is a constant splat vector, we can widen this and let the
      // division/remainder-by-constant optimization handle it.
30538      // TODO: Can we do something for non-splat?
30539      APInt SplatVal;
30540      if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
30541        unsigned NumConcats = 128 / VT.getSizeInBits();
30542        SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
30543        Ops0[0] = N->getOperand(0);
30544        EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
30545        SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
30546        SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
30547        SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
30548        Results.push_back(Res);
30549      }
30550      return;
30551    }
30552
30553    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
30554    Results.push_back(V);
30555    return;
30556  }
30557  case ISD::TRUNCATE: {
30558    MVT VT = N->getSimpleValueType(0);
30559    if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
30560      return;
30561
    // The generic legalizer will try to widen the input type to the same
    // number of elements as the widened result type. But this isn't always
    // the best choice, so do some custom legalization to avoid those cases.
30565    MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
30566    SDValue In = N->getOperand(0);
30567    EVT InVT = In.getValueType();
30568
30569    unsigned InBits = InVT.getSizeInBits();
30570    if (128 % InBits == 0) {
      // 128-bit and smaller inputs should avoid truncation altogether and
      // just use a build_vector that will become a shuffle.
30573      // TODO: Widen and use a shuffle directly?
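      // For example, a v2i64 -> v2i8 truncate becomes a v16i8 build_vector
      // whose first two elements are scalar truncates of the extracted i64s.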
30574      MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
30575      EVT EltVT = VT.getVectorElementType();
30576      unsigned WidenNumElts = WidenVT.getVectorNumElements();
30577      SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
30578      // Use the original element count so we don't do more scalar opts than
30579      // necessary.
30580      unsigned MinElts = VT.getVectorNumElements();
30581      for (unsigned i=0; i < MinElts; ++i) {
30582        SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
30583                                  DAG.getIntPtrConstant(i, dl));
30584        Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
30585      }
30586      Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
30587      return;
30588    }
30589    // With AVX512 there are some cases that can use a target specific
30590    // truncate node to go from 256/512 to less than 128 with zeros in the
30591    // upper elements of the 128 bit result.
30592    if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
      // We can use VTRUNC directly for 256 bits with VLX or for any 512 bits.
30594      if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
30595        Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30596        return;
30597      }
30598      // There's one case we can widen to 512 bits and use VTRUNC.
30599      if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
30600        In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
30601                         DAG.getUNDEF(MVT::v4i64));
30602        Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30603        return;
30604      }
30605    }
30606    if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
30607        getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
30608        isTypeLegal(MVT::v4i64)) {
30609      // Input needs to be split and output needs to widened. Let's use two
30610      // VTRUNCs, and shuffle their results together into the wider type.
30611      SDValue Lo, Hi;
30612      std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
30613
30614      Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
30615      Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
30616      SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
30617                                         { 0,  1,  2,  3, 16, 17, 18, 19,
30618                                          -1, -1, -1, -1, -1, -1, -1, -1 });
30619      Results.push_back(Res);
30620      return;
30621    }
30622
30623    return;
30624  }
30625  case ISD::ANY_EXTEND:
    // Right now, only MVT::v8i8 has a Custom action for an illegal type.
    // It's intended for custom handling of the input type.
30628    assert(N->getValueType(0) == MVT::v8i8 &&
30629           "Do not know how to legalize this Node");
30630    return;
30631  case ISD::SIGN_EXTEND:
30632  case ISD::ZERO_EXTEND: {
30633    EVT VT = N->getValueType(0);
30634    SDValue In = N->getOperand(0);
30635    EVT InVT = In.getValueType();
30636    if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
30637        (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
30638      assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
30639             "Unexpected type action!");
30640      assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
      // Custom split this so we can extend i8/i16->i32 in-vector. This is
      // better because sign_extend_inreg i8/i16->i64 requires an extend to
      // i32 using sra, followed by an extend from i32 to i64 using pcmpgt.
      // By custom splitting, the sra from the extend to i32 can be shared by
      // both halves of the split.
30645      In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
30646
30647      // Fill a vector with sign bits for each element.
30648      SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
30649      SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
30650
30651      // Create an unpackl and unpackh to interleave the sign bits then bitcast
30652      // to v2i64.
30653      SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30654                                        {0, 4, 1, 5});
30655      Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
30656      SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30657                                        {2, 6, 3, 7});
30658      Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
30659
30660      SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30661      Results.push_back(Res);
30662      return;
30663    }
30664
30665    if (VT == MVT::v16i32 || VT == MVT::v8i64) {
30666      if (!InVT.is128BitVector()) {
30667        // Not a 128 bit vector, but maybe type legalization will promote
30668        // it to 128 bits.
30669        if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
30670          return;
30671        InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
30672        if (!InVT.is128BitVector())
30673          return;
30674
30675        // Promote the input to 128 bits. Type legalization will turn this into
30676        // zext_inreg/sext_inreg.
30677        In = DAG.getNode(N->getOpcode(), dl, InVT, In);
30678      }
30679
30680      // Perform custom splitting instead of the two stage extend we would get
30681      // by default.
30682      EVT LoVT, HiVT;
30683      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
30684      assert(isTypeLegal(LoVT) && "Split VT not legal?");
30685
30686      SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
30687
30688      // We need to shift the input over by half the number of elements.
30689      unsigned NumElts = InVT.getVectorNumElements();
30690      unsigned HalfNumElts = NumElts / 2;
30691      SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
30692      for (unsigned i = 0; i != HalfNumElts; ++i)
30693        ShufMask[i] = i + HalfNumElts;
30694
30695      SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
30696      Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
30697
30698      SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30699      Results.push_back(Res);
30700    }
30701    return;
30702  }
30703  case ISD::FP_TO_SINT:
30704  case ISD::STRICT_FP_TO_SINT:
30705  case ISD::FP_TO_UINT:
30706  case ISD::STRICT_FP_TO_UINT: {
30707    bool IsStrict = N->isStrictFPOpcode();
30708    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
30709                    N->getOpcode() == ISD::STRICT_FP_TO_SINT;
30710    EVT VT = N->getValueType(0);
30711    SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30712    EVT SrcVT = Src.getValueType();
30713
30714    if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
30715      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30716             "Unexpected type action!");
30717
30718      // Try to create a 128 bit vector, but don't exceed a 32 bit element.
30719      unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
30720      MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
30721                                       VT.getVectorNumElements());
30722      SDValue Res;
30723      SDValue Chain;
30724      if (IsStrict) {
30725        Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
30726                          {N->getOperand(0), Src});
30727        Chain = Res.getValue(1);
30728      } else
30729        Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
30730
30731      // Preserve what we know about the size of the original result. Except
30732      // when the result is v2i32 since we can't widen the assert.
30733      if (PromoteVT != MVT::v2i32)
30734        Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
30735                          dl, PromoteVT, Res,
30736                          DAG.getValueType(VT.getVectorElementType()));
30737
30738      // Truncate back to the original width.
30739      Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
30740
30741      // Now widen to 128 bits.
30742      unsigned NumConcats = 128 / VT.getSizeInBits();
30743      MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
30744                                      VT.getVectorNumElements() * NumConcats);
30745      SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30746      ConcatOps[0] = Res;
30747      Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
30748      Results.push_back(Res);
30749      if (IsStrict)
30750        Results.push_back(Chain);
30751      return;
30752    }
30753
30754
30755    if (VT == MVT::v2i32) {
30756      assert((IsSigned || Subtarget.hasAVX512()) &&
30757             "Can only handle signed conversion without AVX512");
30758      assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30759      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30760             "Unexpected type action!");
30761      if (Src.getValueType() == MVT::v2f64) {
30762        unsigned Opc;
30763        if (IsStrict)
30764          Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
30765        else
30766          Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
30767
        // If we have VLX we can emit a target-specific FP_TO_UINT node.
30769        if (!IsSigned && !Subtarget.hasVLX()) {
30770          // Otherwise we can defer to the generic legalizer which will widen
30771          // the input as well. This will be further widened during op
30772          // legalization to v8i32<-v8f64.
30773          // For strict nodes we'll need to widen ourselves.
30774          // FIXME: Fix the type legalizer to safely widen strict nodes?
30775          if (!IsStrict)
30776            return;
30777          Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
30778                            DAG.getConstantFP(0.0, dl, MVT::v2f64));
30779          Opc = N->getOpcode();
30780        }
30781        SDValue Res;
30782        SDValue Chain;
30783        if (IsStrict) {
30784          Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
30785                            {N->getOperand(0), Src});
30786          Chain = Res.getValue(1);
30787        } else {
30788          Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
30789        }
30790        Results.push_back(Res);
30791        if (IsStrict)
30792          Results.push_back(Chain);
30793        return;
30794      }
30795
30796      // Custom widen strict v2f32->v2i32 by padding with zeros.
30797      // FIXME: Should generic type legalizer do this?
30798      if (Src.getValueType() == MVT::v2f32 && IsStrict) {
30799        Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
30800                          DAG.getConstantFP(0.0, dl, MVT::v2f32));
30801        SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
30802                                  {N->getOperand(0), Src});
30803        Results.push_back(Res);
30804        Results.push_back(Res.getValue(1));
30805        return;
30806      }
30807
30808      // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
30809      // so early out here.
30810      return;
30811    }
30812
30813    assert(!VT.isVector() && "Vectors should have been handled above!");
30814
30815    if (Subtarget.hasDQI() && VT == MVT::i64 &&
30816        (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
30817      assert(!Subtarget.is64Bit() && "i64 should be legal");
30818      unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
30819      // If we use a 128-bit result we might need to use a target specific node.
30820      unsigned SrcElts =
30821          std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
30822      MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
30823      MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
30824      unsigned Opc = N->getOpcode();
30825      if (NumElts != SrcElts) {
30826        if (IsStrict)
30827          Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
30828        else
30829          Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
30830      }
30831
30832      SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
30833      SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
30834                                DAG.getConstantFP(0.0, dl, VecInVT), Src,
30835                                ZeroIdx);
30836      SDValue Chain;
30837      if (IsStrict) {
30838        SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
30839        Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
30840        Chain = Res.getValue(1);
30841      } else
30842        Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
30843      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
30844      Results.push_back(Res);
30845      if (IsStrict)
30846        Results.push_back(Chain);
30847      return;
30848    }
30849
30850    SDValue Chain;
30851    if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
30852      Results.push_back(V);
30853      if (IsStrict)
30854        Results.push_back(Chain);
30855    }
30856    return;
30857  }
30858  case ISD::LRINT:
30859  case ISD::LLRINT: {
30860    if (SDValue V = LRINT_LLRINTHelper(N, DAG))
30861      Results.push_back(V);
30862    return;
30863  }
30864
30865  case ISD::SINT_TO_FP:
30866  case ISD::STRICT_SINT_TO_FP:
30867  case ISD::UINT_TO_FP:
30868  case ISD::STRICT_UINT_TO_FP: {
30869    bool IsStrict = N->isStrictFPOpcode();
30870    bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
30871                    N->getOpcode() == ISD::STRICT_SINT_TO_FP;
30872    EVT VT = N->getValueType(0);
30873    if (VT != MVT::v2f32)
30874      return;
30875    SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30876    EVT SrcVT = Src.getValueType();
30877    if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
30878      if (IsStrict) {
30879        unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
30880                                : X86ISD::STRICT_CVTUI2P;
30881        SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
30882                                  {N->getOperand(0), Src});
30883        Results.push_back(Res);
30884        Results.push_back(Res.getValue(1));
30885      } else {
30886        unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
30887        Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
30888      }
30889      return;
30890    }
30891    if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
30892        Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
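      // Without native unsigned converts, handle lanes with the top bit set
      // by halving the input ((x >> 1) | (x & 1) preserves the rounding bit),
      // converting that as signed, and doubling the result with an FADD. The
      // final select uses the doubled value only for those "negative" lanes.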
30893      SDValue Zero = DAG.getConstant(0, dl, SrcVT);
30894      SDValue One  = DAG.getConstant(1, dl, SrcVT);
30895      SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
30896                                 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
30897                                 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
30898      SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
30899      SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
30900      SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
30901      for (int i = 0; i != 2; ++i) {
30902        SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
30903                                  SignSrc, DAG.getIntPtrConstant(i, dl));
30904        if (IsStrict)
30905          SignCvts[i] =
30906              DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
30907                          {N->getOperand(0), Elt});
30908        else
30909          SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
30910      };
30911      SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
30912      SDValue Slow, Chain;
30913      if (IsStrict) {
30914        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
30915                            SignCvts[0].getValue(1), SignCvts[1].getValue(1));
30916        Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
30917                           {Chain, SignCvt, SignCvt});
30918        Chain = Slow.getValue(1);
30919      } else {
30920        Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
30921      }
30922      IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
30923      IsNeg =
30924          DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
30925      SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
30926      Results.push_back(Cvt);
30927      if (IsStrict)
30928        Results.push_back(Chain);
30929      return;
30930    }
30931
30932    if (SrcVT != MVT::v2i32)
30933      return;
30934
30935    if (IsSigned || Subtarget.hasAVX512()) {
30936      if (!IsStrict)
30937        return;
30938
30939      // Custom widen strict v2i32->v2f32 to avoid scalarization.
30940      // FIXME: Should generic type legalizer do this?
30941      Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
30942                        DAG.getConstant(0, dl, MVT::v2i32));
30943      SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
30944                                {N->getOperand(0), Src});
30945      Results.push_back(Res);
30946      Results.push_back(Res.getValue(1));
30947      return;
30948    }
30949
30950    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
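    // Standard uint32 -> double trick: zero extend the 32-bit values into the
    // low half of doubles whose high bits encode 2^52 (0x4330000000000000),
    // then subtract that same bias. The difference is exactly the original
    // unsigned value, which VFPROUND then narrows to f32.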
30951    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
30952    SDValue VBias =
30953        DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
30954    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
30955                             DAG.getBitcast(MVT::v2i64, VBias));
30956    Or = DAG.getBitcast(MVT::v2f64, Or);
30957    if (IsStrict) {
30958      SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
30959                                {N->getOperand(0), Or, VBias});
30960      SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
30961                                {MVT::v4f32, MVT::Other},
30962                                {Sub.getValue(1), Sub});
30963      Results.push_back(Res);
30964      Results.push_back(Res.getValue(1));
30965    } else {
30966      // TODO: Are there any fast-math-flags to propagate here?
30967      SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
30968      Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
30969    }
30970    return;
30971  }
30972  case ISD::STRICT_FP_ROUND:
30973  case ISD::FP_ROUND: {
30974    bool IsStrict = N->isStrictFPOpcode();
30975    SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30976    if (!isTypeLegal(Src.getValueType()))
30977      return;
30978    SDValue V;
30979    if (IsStrict)
30980      V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
30981                      {N->getOperand(0), N->getOperand(1)});
30982    else
30983      V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
30984    Results.push_back(V);
30985    if (IsStrict)
30986      Results.push_back(V.getValue(1));
30987    return;
30988  }
30989  case ISD::FP_EXTEND:
30990  case ISD::STRICT_FP_EXTEND: {
30991    // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
30992    // No other ValueType for FP_EXTEND should reach this point.
30993    assert(N->getValueType(0) == MVT::v2f32 &&
30994           "Do not know how to legalize this Node");
30995    return;
30996  }
30997  case ISD::INTRINSIC_W_CHAIN: {
30998    unsigned IntNo = N->getConstantOperandVal(1);
30999    switch (IntNo) {
31000    default : llvm_unreachable("Do not know how to custom type "
31001                               "legalize this intrinsic operation!");
31002    case Intrinsic::x86_rdtsc:
31003      return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
31004                                     Results);
31005    case Intrinsic::x86_rdtscp:
31006      return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
31007                                     Results);
31008    case Intrinsic::x86_rdpmc:
31009      expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
31010                                  Results);
31011      return;
31012    case Intrinsic::x86_xgetbv:
31013      expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
31014                                  Results);
31015      return;
31016    }
31017  }
31018  case ISD::READCYCLECOUNTER: {
31019    return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
31020  }
31021  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
31022    EVT T = N->getValueType(0);
31023    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
31024    bool Regs64bit = T == MVT::i128;
31025    assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
31026           "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
31027    MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
31028    SDValue cpInL, cpInH;
31029    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31030                        DAG.getConstant(0, dl, HalfT));
31031    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31032                        DAG.getConstant(1, dl, HalfT));
31033    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
31034                             Regs64bit ? X86::RAX : X86::EAX,
31035                             cpInL, SDValue());
31036    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
31037                             Regs64bit ? X86::RDX : X86::EDX,
31038                             cpInH, cpInL.getValue(1));
31039    SDValue swapInL, swapInH;
31040    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31041                          DAG.getConstant(0, dl, HalfT));
31042    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31043                          DAG.getConstant(1, dl, HalfT));
31044    swapInH =
31045        DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
31046                         swapInH, cpInH.getValue(1));
31047
31048    // In 64-bit mode we might need the base pointer in RBX, but we can't know
31049    // until later. So we keep the RBX input in a vreg and use a custom
31050    // inserter.
    // Since RBX will be a reserved register, the register allocator will not
    // make sure its value is properly saved and restored around this
    // live-range.
31054    SDValue Result;
31055    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31056    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
31057    if (Regs64bit) {
31058      SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
31059                       swapInH.getValue(1)};
31060      Result =
31061          DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
31062    } else {
31063      swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
31064                                 swapInH.getValue(1));
31065      SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
31066                       swapInL.getValue(1)};
31067      Result =
31068          DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
31069    }
31070
31071    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
31072                                        Regs64bit ? X86::RAX : X86::EAX,
31073                                        HalfT, Result.getValue(1));
31074    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
31075                                        Regs64bit ? X86::RDX : X86::EDX,
31076                                        HalfT, cpOutL.getValue(2));
31077    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
31078
31079    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
31080                                        MVT::i32, cpOutH.getValue(2));
31081    SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
31082    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
31083
31084    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
31085    Results.push_back(Success);
31086    Results.push_back(EFLAGS.getValue(1));
31087    return;
31088  }
31089  case ISD::ATOMIC_LOAD: {
31090    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31091    bool NoImplicitFloatOps =
31092        DAG.getMachineFunction().getFunction().hasFnAttribute(
31093            Attribute::NoImplicitFloat);
31094    if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31095      auto *Node = cast<AtomicSDNode>(N);
31096      if (Subtarget.hasSSE1()) {
31097        // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
31098        // Then extract the lower 64-bits.
31099        MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31100        SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
31101        SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31102        SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31103                                             MVT::i64, Node->getMemOperand());
31104        if (Subtarget.hasSSE2()) {
31105          SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
31106                                    DAG.getIntPtrConstant(0, dl));
31107          Results.push_back(Res);
31108          Results.push_back(Ld.getValue(1));
31109          return;
31110        }
31111        // We use an alternative sequence for SSE1 that extracts as v2f32 and
31112        // then casts to i64. This avoids a 128-bit stack temporary being
31113        // created by type legalization if we were to cast v4f32->v2i64.
31114        SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
31115                                  DAG.getIntPtrConstant(0, dl));
31116        Res = DAG.getBitcast(MVT::i64, Res);
31117        Results.push_back(Res);
31118        Results.push_back(Ld.getValue(1));
31119        return;
31120      }
31121      if (Subtarget.hasX87()) {
31122        // First load this into an 80-bit X87 register. This will put the whole
31123        // integer into the significand.
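        // The f80 format has a 64-bit significand, so a 64-bit integer
        // round-trips through FILD/FIST without loss.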
31124        SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31125        SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31126        SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
31127                                                 dl, Tys, Ops, MVT::i64,
31128                                                 Node->getMemOperand());
31129        SDValue Chain = Result.getValue(1);
31130
31131        // Now store the X87 register to a stack temporary and convert to i64.
31132        // This store is not atomic and doesn't need to be.
31133        // FIXME: We don't need a stack temporary if the result of the load
31134        // is already being stored. We could just directly store there.
31135        SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31136        int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31137        MachinePointerInfo MPI =
31138            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31139        SDValue StoreOps[] = { Chain, Result, StackPtr };
31140        Chain = DAG.getMemIntrinsicNode(
31141            X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
31142            MPI, None /*Align*/, MachineMemOperand::MOStore);
31143
31144        // Finally load the value back from the stack temporary and return it.
31145        // This load is not atomic and doesn't need to be.
31146        // This load will be further type legalized.
31147        Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
31148        Results.push_back(Result);
31149        Results.push_back(Result.getValue(1));
31150        return;
31151      }
31152    }
31153    // TODO: Use MOVLPS when SSE1 is available?
31154    // Delegate to generic TypeLegalization. Situations we can really handle
31155    // should have already been dealt with by AtomicExpandPass.cpp.
31156    break;
31157  }
31158  case ISD::ATOMIC_SWAP:
31159  case ISD::ATOMIC_LOAD_ADD:
31160  case ISD::ATOMIC_LOAD_SUB:
31161  case ISD::ATOMIC_LOAD_AND:
31162  case ISD::ATOMIC_LOAD_OR:
31163  case ISD::ATOMIC_LOAD_XOR:
31164  case ISD::ATOMIC_LOAD_NAND:
31165  case ISD::ATOMIC_LOAD_MIN:
31166  case ISD::ATOMIC_LOAD_MAX:
31167  case ISD::ATOMIC_LOAD_UMIN:
31168  case ISD::ATOMIC_LOAD_UMAX:
31169    // Delegate to generic TypeLegalization. Situations we can really handle
31170    // should have already been dealt with by AtomicExpandPass.cpp.
31171    break;
31172
31173  case ISD::BITCAST: {
31174    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31175    EVT DstVT = N->getValueType(0);
31176    EVT SrcVT = N->getOperand(0).getValueType();
31177
    // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
31179    // we can split using the k-register rather than memory.
31180    if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
31181      assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31182      SDValue Lo, Hi;
31183      std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31184      Lo = DAG.getBitcast(MVT::i32, Lo);
31185      Hi = DAG.getBitcast(MVT::i32, Hi);
31186      SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
31187      Results.push_back(Res);
31188      return;
31189    }
31190
31191    if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
31192      // FIXME: Use v4f32 for SSE1?
31193      assert(Subtarget.hasSSE2() && "Requires SSE2");
31194      assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
31195             "Unexpected type action!");
31196      EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
31197      SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
31198                                N->getOperand(0));
31199      Res = DAG.getBitcast(WideVT, Res);
31200      Results.push_back(Res);
31201      return;
31202    }
31203
31204    return;
31205  }
31206  case ISD::MGATHER: {
31207    EVT VT = N->getValueType(0);
31208    if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
31209        (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
31210      auto *Gather = cast<MaskedGatherSDNode>(N);
31211      SDValue Index = Gather->getIndex();
31212      if (Index.getValueType() != MVT::v2i64)
31213        return;
31214      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31215             "Unexpected type action!");
31216      EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31217      SDValue Mask = Gather->getMask();
31218      assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31219      SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
31220                                     Gather->getPassThru(),
31221                                     DAG.getUNDEF(VT));
31222      if (!Subtarget.hasVLX()) {
31223        // We need to widen the mask, but the instruction will only use 2
31224        // of its elements. So we can use undef.
31225        Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
31226                           DAG.getUNDEF(MVT::v2i1));
31227        Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
31228      }
31229      SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
31230                        Gather->getBasePtr(), Index, Gather->getScale() };
31231      SDValue Res = DAG.getMemIntrinsicNode(
31232          X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
31233          Gather->getMemoryVT(), Gather->getMemOperand());
31234      Results.push_back(Res);
31235      Results.push_back(Res.getValue(1));
31236      return;
31237    }
31238    return;
31239  }
31240  case ISD::LOAD: {
31241    // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
    // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
31243    // cast since type legalization will try to use an i64 load.
31244    MVT VT = N->getSimpleValueType(0);
31245    assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
31246    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31247           "Unexpected type action!");
31248    if (!ISD::isNON_EXTLoad(N))
31249      return;
31250    auto *Ld = cast<LoadSDNode>(N);
31251    if (Subtarget.hasSSE2()) {
31252      MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
31253      SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
31254                                Ld->getPointerInfo(), Ld->getOriginalAlign(),
31255                                Ld->getMemOperand()->getFlags());
31256      SDValue Chain = Res.getValue(1);
31257      MVT VecVT = MVT::getVectorVT(LdVT, 2);
31258      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
31259      EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31260      Res = DAG.getBitcast(WideVT, Res);
31261      Results.push_back(Res);
31262      Results.push_back(Chain);
31263      return;
31264    }
31265    assert(Subtarget.hasSSE1() && "Expected SSE");
31266    SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
31267    SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
31268    SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31269                                          MVT::i64, Ld->getMemOperand());
31270    Results.push_back(Res);
31271    Results.push_back(Res.getValue(1));
31272    return;
31273  }
31274  case ISD::ADDRSPACECAST: {
31275    SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
31276    Results.push_back(V);
31277    return;
31278  }
31279  case ISD::BITREVERSE:
31280    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31281    assert(Subtarget.hasXOP() && "Expected XOP");
31282    // We can use VPPERM by copying to a vector register and back. We'll need
31283    // to move the scalar in two i32 pieces.
31284    Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
31285    return;
31286  }
31287}
31288
31289const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
31290  switch ((X86ISD::NodeType)Opcode) {
31291  case X86ISD::FIRST_NUMBER:       break;
31292#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
31293  NODE_NAME_CASE(BSF)
31294  NODE_NAME_CASE(BSR)
31295  NODE_NAME_CASE(FSHL)
31296  NODE_NAME_CASE(FSHR)
31297  NODE_NAME_CASE(FAND)
31298  NODE_NAME_CASE(FANDN)
31299  NODE_NAME_CASE(FOR)
31300  NODE_NAME_CASE(FXOR)
31301  NODE_NAME_CASE(FILD)
31302  NODE_NAME_CASE(FIST)
31303  NODE_NAME_CASE(FP_TO_INT_IN_MEM)
31304  NODE_NAME_CASE(FLD)
31305  NODE_NAME_CASE(FST)
31306  NODE_NAME_CASE(CALL)
31307  NODE_NAME_CASE(CALL_RVMARKER)
31308  NODE_NAME_CASE(BT)
31309  NODE_NAME_CASE(CMP)
31310  NODE_NAME_CASE(FCMP)
31311  NODE_NAME_CASE(STRICT_FCMP)
31312  NODE_NAME_CASE(STRICT_FCMPS)
31313  NODE_NAME_CASE(COMI)
31314  NODE_NAME_CASE(UCOMI)
31315  NODE_NAME_CASE(CMPM)
31316  NODE_NAME_CASE(CMPMM)
31317  NODE_NAME_CASE(STRICT_CMPM)
31318  NODE_NAME_CASE(CMPMM_SAE)
31319  NODE_NAME_CASE(SETCC)
31320  NODE_NAME_CASE(SETCC_CARRY)
31321  NODE_NAME_CASE(FSETCC)
31322  NODE_NAME_CASE(FSETCCM)
31323  NODE_NAME_CASE(FSETCCM_SAE)
31324  NODE_NAME_CASE(CMOV)
31325  NODE_NAME_CASE(BRCOND)
31326  NODE_NAME_CASE(RET_FLAG)
31327  NODE_NAME_CASE(IRET)
31328  NODE_NAME_CASE(REP_STOS)
31329  NODE_NAME_CASE(REP_MOVS)
31330  NODE_NAME_CASE(GlobalBaseReg)
31331  NODE_NAME_CASE(Wrapper)
31332  NODE_NAME_CASE(WrapperRIP)
31333  NODE_NAME_CASE(MOVQ2DQ)
31334  NODE_NAME_CASE(MOVDQ2Q)
31335  NODE_NAME_CASE(MMX_MOVD2W)
31336  NODE_NAME_CASE(MMX_MOVW2D)
31337  NODE_NAME_CASE(PEXTRB)
31338  NODE_NAME_CASE(PEXTRW)
31339  NODE_NAME_CASE(INSERTPS)
31340  NODE_NAME_CASE(PINSRB)
31341  NODE_NAME_CASE(PINSRW)
31342  NODE_NAME_CASE(PSHUFB)
31343  NODE_NAME_CASE(ANDNP)
31344  NODE_NAME_CASE(BLENDI)
31345  NODE_NAME_CASE(BLENDV)
31346  NODE_NAME_CASE(HADD)
31347  NODE_NAME_CASE(HSUB)
31348  NODE_NAME_CASE(FHADD)
31349  NODE_NAME_CASE(FHSUB)
31350  NODE_NAME_CASE(CONFLICT)
31351  NODE_NAME_CASE(FMAX)
31352  NODE_NAME_CASE(FMAXS)
31353  NODE_NAME_CASE(FMAX_SAE)
31354  NODE_NAME_CASE(FMAXS_SAE)
31355  NODE_NAME_CASE(FMIN)
31356  NODE_NAME_CASE(FMINS)
31357  NODE_NAME_CASE(FMIN_SAE)
31358  NODE_NAME_CASE(FMINS_SAE)
31359  NODE_NAME_CASE(FMAXC)
31360  NODE_NAME_CASE(FMINC)
31361  NODE_NAME_CASE(FRSQRT)
31362  NODE_NAME_CASE(FRCP)
31363  NODE_NAME_CASE(EXTRQI)
31364  NODE_NAME_CASE(INSERTQI)
31365  NODE_NAME_CASE(TLSADDR)
31366  NODE_NAME_CASE(TLSBASEADDR)
31367  NODE_NAME_CASE(TLSCALL)
31368  NODE_NAME_CASE(EH_SJLJ_SETJMP)
31369  NODE_NAME_CASE(EH_SJLJ_LONGJMP)
31370  NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
31371  NODE_NAME_CASE(EH_RETURN)
31372  NODE_NAME_CASE(TC_RETURN)
31373  NODE_NAME_CASE(FNSTCW16m)
31374  NODE_NAME_CASE(FLDCW16m)
31375  NODE_NAME_CASE(LCMPXCHG_DAG)
31376  NODE_NAME_CASE(LCMPXCHG8_DAG)
31377  NODE_NAME_CASE(LCMPXCHG16_DAG)
31378  NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
31379  NODE_NAME_CASE(LADD)
31380  NODE_NAME_CASE(LSUB)
31381  NODE_NAME_CASE(LOR)
31382  NODE_NAME_CASE(LXOR)
31383  NODE_NAME_CASE(LAND)
31384  NODE_NAME_CASE(VZEXT_MOVL)
31385  NODE_NAME_CASE(VZEXT_LOAD)
31386  NODE_NAME_CASE(VEXTRACT_STORE)
31387  NODE_NAME_CASE(VTRUNC)
31388  NODE_NAME_CASE(VTRUNCS)
31389  NODE_NAME_CASE(VTRUNCUS)
31390  NODE_NAME_CASE(VMTRUNC)
31391  NODE_NAME_CASE(VMTRUNCS)
31392  NODE_NAME_CASE(VMTRUNCUS)
31393  NODE_NAME_CASE(VTRUNCSTORES)
31394  NODE_NAME_CASE(VTRUNCSTOREUS)
31395  NODE_NAME_CASE(VMTRUNCSTORES)
31396  NODE_NAME_CASE(VMTRUNCSTOREUS)
31397  NODE_NAME_CASE(VFPEXT)
31398  NODE_NAME_CASE(STRICT_VFPEXT)
31399  NODE_NAME_CASE(VFPEXT_SAE)
31400  NODE_NAME_CASE(VFPEXTS)
31401  NODE_NAME_CASE(VFPEXTS_SAE)
31402  NODE_NAME_CASE(VFPROUND)
31403  NODE_NAME_CASE(STRICT_VFPROUND)
31404  NODE_NAME_CASE(VMFPROUND)
31405  NODE_NAME_CASE(VFPROUND_RND)
31406  NODE_NAME_CASE(VFPROUNDS)
31407  NODE_NAME_CASE(VFPROUNDS_RND)
31408  NODE_NAME_CASE(VSHLDQ)
31409  NODE_NAME_CASE(VSRLDQ)
31410  NODE_NAME_CASE(VSHL)
31411  NODE_NAME_CASE(VSRL)
31412  NODE_NAME_CASE(VSRA)
31413  NODE_NAME_CASE(VSHLI)
31414  NODE_NAME_CASE(VSRLI)
31415  NODE_NAME_CASE(VSRAI)
31416  NODE_NAME_CASE(VSHLV)
31417  NODE_NAME_CASE(VSRLV)
31418  NODE_NAME_CASE(VSRAV)
31419  NODE_NAME_CASE(VROTLI)
31420  NODE_NAME_CASE(VROTRI)
31421  NODE_NAME_CASE(VPPERM)
31422  NODE_NAME_CASE(CMPP)
31423  NODE_NAME_CASE(STRICT_CMPP)
31424  NODE_NAME_CASE(PCMPEQ)
31425  NODE_NAME_CASE(PCMPGT)
31426  NODE_NAME_CASE(PHMINPOS)
31427  NODE_NAME_CASE(ADD)
31428  NODE_NAME_CASE(SUB)
31429  NODE_NAME_CASE(ADC)
31430  NODE_NAME_CASE(SBB)
31431  NODE_NAME_CASE(SMUL)
31432  NODE_NAME_CASE(UMUL)
31433  NODE_NAME_CASE(OR)
31434  NODE_NAME_CASE(XOR)
31435  NODE_NAME_CASE(AND)
31436  NODE_NAME_CASE(BEXTR)
31437  NODE_NAME_CASE(BEXTRI)
31438  NODE_NAME_CASE(BZHI)
31439  NODE_NAME_CASE(PDEP)
31440  NODE_NAME_CASE(PEXT)
31441  NODE_NAME_CASE(MUL_IMM)
31442  NODE_NAME_CASE(MOVMSK)
31443  NODE_NAME_CASE(PTEST)
31444  NODE_NAME_CASE(TESTP)
31445  NODE_NAME_CASE(KORTEST)
31446  NODE_NAME_CASE(KTEST)
31447  NODE_NAME_CASE(KADD)
31448  NODE_NAME_CASE(KSHIFTL)
31449  NODE_NAME_CASE(KSHIFTR)
31450  NODE_NAME_CASE(PACKSS)
31451  NODE_NAME_CASE(PACKUS)
31452  NODE_NAME_CASE(PALIGNR)
31453  NODE_NAME_CASE(VALIGN)
31454  NODE_NAME_CASE(VSHLD)
31455  NODE_NAME_CASE(VSHRD)
31456  NODE_NAME_CASE(VSHLDV)
31457  NODE_NAME_CASE(VSHRDV)
31458  NODE_NAME_CASE(PSHUFD)
31459  NODE_NAME_CASE(PSHUFHW)
31460  NODE_NAME_CASE(PSHUFLW)
31461  NODE_NAME_CASE(SHUFP)
31462  NODE_NAME_CASE(SHUF128)
31463  NODE_NAME_CASE(MOVLHPS)
31464  NODE_NAME_CASE(MOVHLPS)
31465  NODE_NAME_CASE(MOVDDUP)
31466  NODE_NAME_CASE(MOVSHDUP)
31467  NODE_NAME_CASE(MOVSLDUP)
31468  NODE_NAME_CASE(MOVSD)
31469  NODE_NAME_CASE(MOVSS)
31470  NODE_NAME_CASE(UNPCKL)
31471  NODE_NAME_CASE(UNPCKH)
31472  NODE_NAME_CASE(VBROADCAST)
31473  NODE_NAME_CASE(VBROADCAST_LOAD)
31474  NODE_NAME_CASE(VBROADCASTM)
31475  NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
31476  NODE_NAME_CASE(VPERMILPV)
31477  NODE_NAME_CASE(VPERMILPI)
31478  NODE_NAME_CASE(VPERM2X128)
31479  NODE_NAME_CASE(VPERMV)
31480  NODE_NAME_CASE(VPERMV3)
31481  NODE_NAME_CASE(VPERMI)
31482  NODE_NAME_CASE(VPTERNLOG)
31483  NODE_NAME_CASE(VFIXUPIMM)
31484  NODE_NAME_CASE(VFIXUPIMM_SAE)
31485  NODE_NAME_CASE(VFIXUPIMMS)
31486  NODE_NAME_CASE(VFIXUPIMMS_SAE)
31487  NODE_NAME_CASE(VRANGE)
31488  NODE_NAME_CASE(VRANGE_SAE)
31489  NODE_NAME_CASE(VRANGES)
31490  NODE_NAME_CASE(VRANGES_SAE)
31491  NODE_NAME_CASE(PMULUDQ)
31492  NODE_NAME_CASE(PMULDQ)
31493  NODE_NAME_CASE(PSADBW)
31494  NODE_NAME_CASE(DBPSADBW)
31495  NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
31496  NODE_NAME_CASE(VAARG_64)
31497  NODE_NAME_CASE(VAARG_X32)
31498  NODE_NAME_CASE(WIN_ALLOCA)
31499  NODE_NAME_CASE(MEMBARRIER)
31500  NODE_NAME_CASE(MFENCE)
31501  NODE_NAME_CASE(SEG_ALLOCA)
31502  NODE_NAME_CASE(PROBED_ALLOCA)
31503  NODE_NAME_CASE(RDRAND)
31504  NODE_NAME_CASE(RDSEED)
31505  NODE_NAME_CASE(RDPKRU)
31506  NODE_NAME_CASE(WRPKRU)
31507  NODE_NAME_CASE(VPMADDUBSW)
31508  NODE_NAME_CASE(VPMADDWD)
31509  NODE_NAME_CASE(VPSHA)
31510  NODE_NAME_CASE(VPSHL)
31511  NODE_NAME_CASE(VPCOM)
31512  NODE_NAME_CASE(VPCOMU)
31513  NODE_NAME_CASE(VPERMIL2)
31514  NODE_NAME_CASE(FMSUB)
31515  NODE_NAME_CASE(STRICT_FMSUB)
31516  NODE_NAME_CASE(FNMADD)
31517  NODE_NAME_CASE(STRICT_FNMADD)
31518  NODE_NAME_CASE(FNMSUB)
31519  NODE_NAME_CASE(STRICT_FNMSUB)
31520  NODE_NAME_CASE(FMADDSUB)
31521  NODE_NAME_CASE(FMSUBADD)
31522  NODE_NAME_CASE(FMADD_RND)
31523  NODE_NAME_CASE(FNMADD_RND)
31524  NODE_NAME_CASE(FMSUB_RND)
31525  NODE_NAME_CASE(FNMSUB_RND)
31526  NODE_NAME_CASE(FMADDSUB_RND)
31527  NODE_NAME_CASE(FMSUBADD_RND)
31528  NODE_NAME_CASE(VPMADD52H)
31529  NODE_NAME_CASE(VPMADD52L)
31530  NODE_NAME_CASE(VRNDSCALE)
31531  NODE_NAME_CASE(STRICT_VRNDSCALE)
31532  NODE_NAME_CASE(VRNDSCALE_SAE)
31533  NODE_NAME_CASE(VRNDSCALES)
31534  NODE_NAME_CASE(VRNDSCALES_SAE)
31535  NODE_NAME_CASE(VREDUCE)
31536  NODE_NAME_CASE(VREDUCE_SAE)
31537  NODE_NAME_CASE(VREDUCES)
31538  NODE_NAME_CASE(VREDUCES_SAE)
31539  NODE_NAME_CASE(VGETMANT)
31540  NODE_NAME_CASE(VGETMANT_SAE)
31541  NODE_NAME_CASE(VGETMANTS)
31542  NODE_NAME_CASE(VGETMANTS_SAE)
31543  NODE_NAME_CASE(PCMPESTR)
31544  NODE_NAME_CASE(PCMPISTR)
31545  NODE_NAME_CASE(XTEST)
31546  NODE_NAME_CASE(COMPRESS)
31547  NODE_NAME_CASE(EXPAND)
31548  NODE_NAME_CASE(SELECTS)
31549  NODE_NAME_CASE(ADDSUB)
31550  NODE_NAME_CASE(RCP14)
31551  NODE_NAME_CASE(RCP14S)
31552  NODE_NAME_CASE(RCP28)
31553  NODE_NAME_CASE(RCP28_SAE)
31554  NODE_NAME_CASE(RCP28S)
31555  NODE_NAME_CASE(RCP28S_SAE)
31556  NODE_NAME_CASE(EXP2)
31557  NODE_NAME_CASE(EXP2_SAE)
31558  NODE_NAME_CASE(RSQRT14)
31559  NODE_NAME_CASE(RSQRT14S)
31560  NODE_NAME_CASE(RSQRT28)
31561  NODE_NAME_CASE(RSQRT28_SAE)
31562  NODE_NAME_CASE(RSQRT28S)
31563  NODE_NAME_CASE(RSQRT28S_SAE)
31564  NODE_NAME_CASE(FADD_RND)
31565  NODE_NAME_CASE(FADDS)
31566  NODE_NAME_CASE(FADDS_RND)
31567  NODE_NAME_CASE(FSUB_RND)
31568  NODE_NAME_CASE(FSUBS)
31569  NODE_NAME_CASE(FSUBS_RND)
31570  NODE_NAME_CASE(FMUL_RND)
31571  NODE_NAME_CASE(FMULS)
31572  NODE_NAME_CASE(FMULS_RND)
31573  NODE_NAME_CASE(FDIV_RND)
31574  NODE_NAME_CASE(FDIVS)
31575  NODE_NAME_CASE(FDIVS_RND)
31576  NODE_NAME_CASE(FSQRT_RND)
31577  NODE_NAME_CASE(FSQRTS)
31578  NODE_NAME_CASE(FSQRTS_RND)
31579  NODE_NAME_CASE(FGETEXP)
31580  NODE_NAME_CASE(FGETEXP_SAE)
31581  NODE_NAME_CASE(FGETEXPS)
31582  NODE_NAME_CASE(FGETEXPS_SAE)
31583  NODE_NAME_CASE(SCALEF)
31584  NODE_NAME_CASE(SCALEF_RND)
31585  NODE_NAME_CASE(SCALEFS)
31586  NODE_NAME_CASE(SCALEFS_RND)
31587  NODE_NAME_CASE(AVG)
31588  NODE_NAME_CASE(MULHRS)
31589  NODE_NAME_CASE(SINT_TO_FP_RND)
31590  NODE_NAME_CASE(UINT_TO_FP_RND)
31591  NODE_NAME_CASE(CVTTP2SI)
31592  NODE_NAME_CASE(CVTTP2UI)
31593  NODE_NAME_CASE(STRICT_CVTTP2SI)
31594  NODE_NAME_CASE(STRICT_CVTTP2UI)
31595  NODE_NAME_CASE(MCVTTP2SI)
31596  NODE_NAME_CASE(MCVTTP2UI)
31597  NODE_NAME_CASE(CVTTP2SI_SAE)
31598  NODE_NAME_CASE(CVTTP2UI_SAE)
31599  NODE_NAME_CASE(CVTTS2SI)
31600  NODE_NAME_CASE(CVTTS2UI)
31601  NODE_NAME_CASE(CVTTS2SI_SAE)
31602  NODE_NAME_CASE(CVTTS2UI_SAE)
31603  NODE_NAME_CASE(CVTSI2P)
31604  NODE_NAME_CASE(CVTUI2P)
31605  NODE_NAME_CASE(STRICT_CVTSI2P)
31606  NODE_NAME_CASE(STRICT_CVTUI2P)
31607  NODE_NAME_CASE(MCVTSI2P)
31608  NODE_NAME_CASE(MCVTUI2P)
31609  NODE_NAME_CASE(VFPCLASS)
31610  NODE_NAME_CASE(VFPCLASSS)
31611  NODE_NAME_CASE(MULTISHIFT)
31612  NODE_NAME_CASE(SCALAR_SINT_TO_FP)
31613  NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
31614  NODE_NAME_CASE(SCALAR_UINT_TO_FP)
31615  NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
31616  NODE_NAME_CASE(CVTPS2PH)
31617  NODE_NAME_CASE(STRICT_CVTPS2PH)
31618  NODE_NAME_CASE(MCVTPS2PH)
31619  NODE_NAME_CASE(CVTPH2PS)
31620  NODE_NAME_CASE(STRICT_CVTPH2PS)
31621  NODE_NAME_CASE(CVTPH2PS_SAE)
31622  NODE_NAME_CASE(CVTP2SI)
31623  NODE_NAME_CASE(CVTP2UI)
31624  NODE_NAME_CASE(MCVTP2SI)
31625  NODE_NAME_CASE(MCVTP2UI)
31626  NODE_NAME_CASE(CVTP2SI_RND)
31627  NODE_NAME_CASE(CVTP2UI_RND)
31628  NODE_NAME_CASE(CVTS2SI)
31629  NODE_NAME_CASE(CVTS2UI)
31630  NODE_NAME_CASE(CVTS2SI_RND)
31631  NODE_NAME_CASE(CVTS2UI_RND)
31632  NODE_NAME_CASE(CVTNE2PS2BF16)
31633  NODE_NAME_CASE(CVTNEPS2BF16)
31634  NODE_NAME_CASE(MCVTNEPS2BF16)
31635  NODE_NAME_CASE(DPBF16PS)
31636  NODE_NAME_CASE(LWPINS)
31637  NODE_NAME_CASE(MGATHER)
31638  NODE_NAME_CASE(MSCATTER)
31639  NODE_NAME_CASE(VPDPBUSD)
31640  NODE_NAME_CASE(VPDPBUSDS)
31641  NODE_NAME_CASE(VPDPWSSD)
31642  NODE_NAME_CASE(VPDPWSSDS)
31643  NODE_NAME_CASE(VPSHUFBITQMB)
31644  NODE_NAME_CASE(GF2P8MULB)
31645  NODE_NAME_CASE(GF2P8AFFINEQB)
31646  NODE_NAME_CASE(GF2P8AFFINEINVQB)
31647  NODE_NAME_CASE(NT_CALL)
31648  NODE_NAME_CASE(NT_BRIND)
31649  NODE_NAME_CASE(UMWAIT)
31650  NODE_NAME_CASE(TPAUSE)
31651  NODE_NAME_CASE(ENQCMD)
31652  NODE_NAME_CASE(ENQCMDS)
31653  NODE_NAME_CASE(VP2INTERSECT)
31654  NODE_NAME_CASE(AESENC128KL)
31655  NODE_NAME_CASE(AESDEC128KL)
31656  NODE_NAME_CASE(AESENC256KL)
31657  NODE_NAME_CASE(AESDEC256KL)
31658  NODE_NAME_CASE(AESENCWIDE128KL)
31659  NODE_NAME_CASE(AESDECWIDE128KL)
31660  NODE_NAME_CASE(AESENCWIDE256KL)
31661  NODE_NAME_CASE(AESDECWIDE256KL)
31662  NODE_NAME_CASE(TESTUI)
31663  }
31664  return nullptr;
31665#undef NODE_NAME_CASE
31666}
31667
31668/// Return true if the addressing mode represented by AM is legal for this
31669/// target, for a load/store of the specified type.
31670bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
31671                                              const AddrMode &AM, Type *Ty,
31672                                              unsigned AS,
31673                                              Instruction *I) const {
31674  // X86 supports extremely general addressing modes.
31675  CodeModel::Model M = getTargetMachine().getCodeModel();
31676
31677  // X86 allows a sign-extended 32-bit immediate field as a displacement.
31678  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
31679    return false;
31680
31681  if (AM.BaseGV) {
31682    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
31683
31684    // If a reference to this global requires an extra load, we can't fold it.
31685    if (isGlobalStubReference(GVFlags))
31686      return false;
31687
31688    // If BaseGV requires a register for the PIC base, we cannot also have a
31689    // BaseReg specified.
31690    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
31691      return false;
31692
31693    // If lower 4G is not available, then we must use rip-relative addressing.
31694    if ((M != CodeModel::Small || isPositionIndependent()) &&
31695        Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
31696      return false;
31697  }
31698
31699  switch (AM.Scale) {
31700  case 0:
31701  case 1:
31702  case 2:
31703  case 4:
31704  case 8:
31705    // These scales always work.
31706    break;
31707  case 3:
31708  case 5:
31709  case 9:
31710    // These scales are formed with basereg+scalereg.  Only accept if there is
31711    // no basereg yet.
31712    if (AM.HasBaseReg)
31713      return false;
31714    break;
31715  default:  // Other stuff never works.
31716    return false;
31717  }
31718
31719  return true;
31720}
31721
31722bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
31723  unsigned Bits = Ty->getScalarSizeInBits();
31724
  // 8-bit vector shifts are always expensive, and a uniform (scalar) shift
  // amount does not make them meaningfully cheaper than fully variable ones.
31727  if (Bits == 8)
31728    return false;
31729
31730  // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
31731  // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
31732  if (Subtarget.hasXOP() &&
31733      (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
31734    return false;
31735
31736  // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
31737  // shifts just as cheap as scalar ones.
31738  if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
31739    return false;
31740
31741  // AVX512BW has shifts such as vpsllvw.
  if (Subtarget.hasBWI() && Bits == 16)
    return false;
31744
31745  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
31746  // fully general vector.
31747  return true;
31748}
31749
31750bool X86TargetLowering::isBinOp(unsigned Opcode) const {
31751  switch (Opcode) {
31752  // These are non-commutative binops.
31753  // TODO: Add more X86ISD opcodes once we have test coverage.
31754  case X86ISD::ANDNP:
31755  case X86ISD::PCMPGT:
31756  case X86ISD::FMAX:
31757  case X86ISD::FMIN:
31758  case X86ISD::FANDN:
31759    return true;
31760  }
31761
31762  return TargetLoweringBase::isBinOp(Opcode);
31763}
31764
31765bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
31766  switch (Opcode) {
31767  // TODO: Add more X86ISD opcodes once we have test coverage.
31768  case X86ISD::PCMPEQ:
31769  case X86ISD::PMULDQ:
31770  case X86ISD::PMULUDQ:
31771  case X86ISD::FMAXC:
31772  case X86ISD::FMINC:
31773  case X86ISD::FAND:
31774  case X86ISD::FOR:
31775  case X86ISD::FXOR:
31776    return true;
31777  }
31778
31779  return TargetLoweringBase::isCommutativeBinOp(Opcode);
31780}
31781
31782bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
31783  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31784    return false;
31785  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
31786  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
31787  return NumBits1 > NumBits2;
31788}
31789
31790bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
31791  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31792    return false;
31793
31794  if (!isTypeLegal(EVT::getEVT(Ty1)))
31795    return false;
31796
31797  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
31798
31799  // Assuming the caller doesn't have a zeroext or signext return parameter,
31800  // truncation all the way down to i1 is valid.
31801  return true;
31802}
31803
31804bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
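  // CMP against a 64-bit operand can only encode a sign-extended 32-bit
  // immediate, so anything wider would need to be materialized in a register.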
31805  return isInt<32>(Imm);
31806}
31807
31808bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
31809  // Can also use sub to handle negated immediates.
31810  return isInt<32>(Imm);
31811}
31812
31813bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
31814  return isInt<32>(Imm);
31815}
31816
31817bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
31818  if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
31819    return false;
31820  unsigned NumBits1 = VT1.getSizeInBits();
31821  unsigned NumBits2 = VT2.getSizeInBits();
31822  return NumBits1 > NumBits2;
31823}
31824
31825bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
31826  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
31827  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
31828}
31829
31830bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
31831  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
31832  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
31833}
31834
31835bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
31836  EVT VT1 = Val.getValueType();
31837  if (isZExtFree(VT1, VT2))
31838    return true;
31839
31840  if (Val.getOpcode() != ISD::LOAD)
31841    return false;
31842
31843  if (!VT1.isSimple() || !VT1.isInteger() ||
31844      !VT2.isSimple() || !VT2.isInteger())
31845    return false;
31846
31847  switch (VT1.getSimpleVT().SimpleTy) {
31848  default: break;
31849  case MVT::i8:
31850  case MVT::i16:
31851  case MVT::i32:
31852    // X86 has 8, 16, and 32-bit zero-extending loads.
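    // (movzbl/movzwl, and a plain 32-bit mov implicitly zero-extends the
    // upper 32 bits in 64-bit mode.)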
31853    return true;
31854  }
31855
31856  return false;
31857}
31858
31859bool X86TargetLowering::shouldSinkOperands(Instruction *I,
31860                                           SmallVectorImpl<Use *> &Ops) const {
31861  // A uniform shift amount in a vector shift or funnel shift may be much
31862  // cheaper than a generic variable vector shift, so make that pattern visible
31863  // to SDAG by sinking the shuffle instruction next to the shift.
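  // For example, when the shift amount of a "shl <4 x i32>" is a splat
  // shuffle, sinking the shuffle here lets ISel see the splat and select the
  // shift-by-scalar form (PSLLD with the amount in the low bits of an XMM)
  // instead of a general variable shift.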
31864  int ShiftAmountOpNum = -1;
31865  if (I->isShift())
31866    ShiftAmountOpNum = 1;
31867  else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
31868    if (II->getIntrinsicID() == Intrinsic::fshl ||
31869        II->getIntrinsicID() == Intrinsic::fshr)
31870      ShiftAmountOpNum = 2;
31871  }
31872
31873  if (ShiftAmountOpNum == -1)
31874    return false;
31875
31876  auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
31877  if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
31878      isVectorShiftByScalarCheap(I->getType())) {
31879    Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
31880    return true;
31881  }
31882
31883  return false;
31884}
31885
31886bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
31887  if (!Subtarget.is64Bit())
31888    return false;
31889  return TargetLowering::shouldConvertPhiType(From, To);
31890}
31891
31892bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
31893  if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
31894    return false;
31895
31896  EVT SrcVT = ExtVal.getOperand(0).getValueType();
31897
31898  // There is no extending load for vXi1.
31899  if (SrcVT.getScalarType() == MVT::i1)
31900    return false;
31901
31902  return true;
31903}
31904
31905bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
31906                                                   EVT VT) const {
31907  if (!Subtarget.hasAnyFMA())
31908    return false;
31909
31910  VT = VT.getScalarType();
31911
31912  if (!VT.isSimple())
31913    return false;
31914
31915  switch (VT.getSimpleVT().SimpleTy) {
31916  case MVT::f32:
31917  case MVT::f64:
31918    return true;
31919  default:
31920    break;
31921  }
31922
31923  return false;
31924}
31925
31926bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
31927  // i16 instructions are longer (0x66 prefix) and potentially slower.
31928  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
31929}
31930
31931/// Targets can use this to indicate that they only support *some*
31932/// VECTOR_SHUFFLE operations, those with specific masks.
31933/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
31934/// are assumed to be legal.
31935bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
31936  if (!VT.isSimple())
31937    return false;
31938
31939  // Not for i1 vectors
31940  if (VT.getSimpleVT().getScalarType() == MVT::i1)
31941    return false;
31942
31943  // Very little shuffling can be done for 64-bit vectors right now.
31944  if (VT.getSimpleVT().getSizeInBits() == 64)
31945    return false;
31946
31947  // We only care that the types being shuffled are legal. The lowering can
31948  // handle any possible shuffle mask that results.
31949  return isTypeLegal(VT.getSimpleVT());
31950}
31951
31952bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
31953                                               EVT VT) const {
31954  // Don't convert an 'and' into a shuffle that we don't directly support.
31955  // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
31956  if (!Subtarget.hasAVX2())
31957    if (VT == MVT::v32i8 || VT == MVT::v16i16)
31958      return false;
31959
31960  // Just delegate to the generic legality, clear masks aren't special.
31961  return isShuffleMaskLegal(Mask, VT);
31962}
31963
31964bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
  // If the subtarget is using indirect thunk branches, we must not generate
  // jump tables.
31966  if (Subtarget.useIndirectThunkBranches())
31967    return false;
31968
31969  // Otherwise, fallback on the generic logic.
31970  return TargetLowering::areJTsAllowed(Fn);
31971}
31972
31973//===----------------------------------------------------------------------===//
31974//                           X86 Scheduler Hooks
31975//===----------------------------------------------------------------------===//
31976
// Returns true if EFLAGS is consumed after this iterator in the rest of the
31978// basic block or any successors of the basic block.
31979static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
31980                              MachineBasicBlock *BB) {
31981  // Scan forward through BB for a use/def of EFLAGS.
31982  for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
31983         miI != miE; ++miI) {
31984    const MachineInstr& mi = *miI;
31985    if (mi.readsRegister(X86::EFLAGS))
31986      return true;
31987    // If we found a def, we can stop searching.
31988    if (mi.definesRegister(X86::EFLAGS))
31989      return false;
31990  }
31991
31992  // If we hit the end of the block, check whether EFLAGS is live into a
31993  // successor.
31994  for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
31995                                        sEnd = BB->succ_end();
31996       sItr != sEnd; ++sItr) {
31997    MachineBasicBlock* succ = *sItr;
31998    if (succ->isLiveIn(X86::EFLAGS))
31999      return true;
32000  }
32001
32002  return false;
32003}
32004
32005/// Utility function to emit xbegin specifying the start of an RTM region.
32006static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
32007                                     const TargetInstrInfo *TII) {
32008  const DebugLoc &DL = MI.getDebugLoc();
32009
32010  const BasicBlock *BB = MBB->getBasicBlock();
32011  MachineFunction::iterator I = ++MBB->getIterator();
32012
  // For the v = xbegin(), we generate
  //
  // thisMBB:
  //  xbegin fallMBB
  //
  // mainMBB:
  //  s0 = -1
  //
  // fallMBB:
  //  eax = # XABORT_DEF
  //  s1 = eax
  //
  // sinkMBB:
  //  v = phi(s0/mainMBB, s1/fallMBB)
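  //
  // On a successful start, XBEGIN falls through to mainMBB and the pseudo
  // yields -1 (matching _XBEGIN_STARTED); on an abort, hardware resumes at
  // the fallback path with the abort status in EAX, which fallMBB copies out.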
32027
32028  MachineBasicBlock *thisMBB = MBB;
32029  MachineFunction *MF = MBB->getParent();
32030  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
32031  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
32032  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
32033  MF->insert(I, mainMBB);
32034  MF->insert(I, fallMBB);
32035  MF->insert(I, sinkMBB);
32036
32037  if (isEFLAGSLiveAfter(MI, MBB)) {
32038    mainMBB->addLiveIn(X86::EFLAGS);
32039    fallMBB->addLiveIn(X86::EFLAGS);
32040    sinkMBB->addLiveIn(X86::EFLAGS);
32041  }
32042
32043  // Transfer the remainder of BB and its successor edges to sinkMBB.
32044  sinkMBB->splice(sinkMBB->begin(), MBB,
32045                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32046  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32047
32048  MachineRegisterInfo &MRI = MF->getRegInfo();
32049  Register DstReg = MI.getOperand(0).getReg();
32050  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
32051  Register mainDstReg = MRI.createVirtualRegister(RC);
32052  Register fallDstReg = MRI.createVirtualRegister(RC);
32053
32054  // thisMBB:
32055  //  xbegin fallMBB
32056  //  # fallthrough to mainMBB
  //  # on abort, jump to fallMBB
32058  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
32059  thisMBB->addSuccessor(mainMBB);
32060  thisMBB->addSuccessor(fallMBB);
32061
32062  // mainMBB:
32063  //  mainDstReg := -1
32064  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
32065  BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
32066  mainMBB->addSuccessor(sinkMBB);
32067
32068  // fallMBB:
32069  //  ; pseudo instruction to model hardware's definition from XABORT
32070  //  EAX := XABORT_DEF
32071  //  fallDstReg := EAX
32072  BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
32073  BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
32074      .addReg(X86::EAX);
32075  fallMBB->addSuccessor(sinkMBB);
32076
32077  // sinkMBB:
32078  //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
32079  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
32080      .addReg(mainDstReg).addMBB(mainMBB)
32081      .addReg(fallDstReg).addMBB(fallMBB);
32082
32083  MI.eraseFromParent();
32084  return sinkMBB;
32085}
32086
32087MachineBasicBlock *
32088X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
32089                                               MachineBasicBlock *MBB) const {
32090  // Emit va_arg instruction on X86-64.
32091
32092  // Operands to this pseudo-instruction:
32093  // 0  ) Output        : destination address (reg)
32094  // 1-5) Input         : va_list address (addr, i64mem)
32095  // 6  ) ArgSize       : Size (in bytes) of vararg type
32096  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
32097  // 8  ) Align         : Alignment of type
32098  // 9  ) EFLAGS (implicit-def)
32099
32100  assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
32101  static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
32102
32103  Register DestReg = MI.getOperand(0).getReg();
32104  MachineOperand &Base = MI.getOperand(1);
32105  MachineOperand &Scale = MI.getOperand(2);
32106  MachineOperand &Index = MI.getOperand(3);
32107  MachineOperand &Disp = MI.getOperand(4);
32108  MachineOperand &Segment = MI.getOperand(5);
32109  unsigned ArgSize = MI.getOperand(6).getImm();
32110  unsigned ArgMode = MI.getOperand(7).getImm();
32111  Align Alignment = Align(MI.getOperand(8).getImm());
32112
32113  MachineFunction *MF = MBB->getParent();
32114
32115  // Memory Reference
32116  assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
32117
32118  MachineMemOperand *OldMMO = MI.memoperands().front();
32119
32120  // Clone the MMO into two separate MMOs for loading and storing
32121  MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
32122      OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
32123  MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
32124      OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
32125
32126  // Machine Information
32127  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32128  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
32129  const TargetRegisterClass *AddrRegClass =
32130      getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
32131  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
32132  const DebugLoc &DL = MI.getDebugLoc();
32133
32134  // struct va_list {
32135  //   i32   gp_offset
32136  //   i32   fp_offset
32137  //   i64   overflow_area (address)
32138  //   i64   reg_save_area (address)
32139  // }
32140  // sizeof(va_list) = 24
32141  // alignment(va_list) = 8
32142
32143  unsigned TotalNumIntRegs = 6;
32144  unsigned TotalNumXMMRegs = 8;
32145  bool UseGPOffset = (ArgMode == 1);
32146  bool UseFPOffset = (ArgMode == 2);
32147  unsigned MaxOffset = TotalNumIntRegs * 8 +
32148                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
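  // With the SysV reg_save_area layout this gives MaxOffset = 48 when pulling
  // from the 6 GP registers (6 * 8 bytes) and 176 when pulling from the 8 XMM
  // registers (48 + 8 * 16 bytes).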
32149
  // Align ArgSize to a multiple of 8.
32151  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
32152  bool NeedsAlign = (Alignment > 8);
32153
32154  MachineBasicBlock *thisMBB = MBB;
32155  MachineBasicBlock *overflowMBB;
32156  MachineBasicBlock *offsetMBB;
32157  MachineBasicBlock *endMBB;
32158
32159  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
32160  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
32161  unsigned OffsetReg = 0;
32162
32163  if (!UseGPOffset && !UseFPOffset) {
32164    // If we only pull from the overflow region, we don't create a branch.
32165    // We don't need to alter control flow.
32166    OffsetDestReg = 0; // unused
32167    OverflowDestReg = DestReg;
32168
32169    offsetMBB = nullptr;
32170    overflowMBB = thisMBB;
32171    endMBB = thisMBB;
32172  } else {
32173    // First emit code to check if gp_offset (or fp_offset) is below the bound.
32174    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
32175    // If not, pull from overflow_area. (branch to overflowMBB)
32176    //
32177    //       thisMBB
32178    //         |     .
32179    //         |        .
32180    //     offsetMBB   overflowMBB
32181    //         |        .
32182    //         |     .
32183    //        endMBB
32184
32185    // Registers for the PHI in endMBB
32186    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
32187    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
32188
32189    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32190    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32191    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32192    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32193
32194    MachineFunction::iterator MBBIter = ++MBB->getIterator();
32195
32196    // Insert the new basic blocks
32197    MF->insert(MBBIter, offsetMBB);
32198    MF->insert(MBBIter, overflowMBB);
32199    MF->insert(MBBIter, endMBB);
32200
32201    // Transfer the remainder of MBB and its successor edges to endMBB.
32202    endMBB->splice(endMBB->begin(), thisMBB,
32203                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
32204    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
32205
32206    // Make offsetMBB and overflowMBB successors of thisMBB
32207    thisMBB->addSuccessor(offsetMBB);
32208    thisMBB->addSuccessor(overflowMBB);
32209
32210    // endMBB is a successor of both offsetMBB and overflowMBB
32211    offsetMBB->addSuccessor(endMBB);
32212    overflowMBB->addSuccessor(endMBB);
32213
32214    // Load the offset value into a register
32215    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32216    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
32217        .add(Base)
32218        .add(Scale)
32219        .add(Index)
32220        .addDisp(Disp, UseFPOffset ? 4 : 0)
32221        .add(Segment)
32222        .setMemRefs(LoadOnlyMMO);
32223
32224    // Check if there is enough room left to pull this argument.
32225    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
32226      .addReg(OffsetReg)
32227      .addImm(MaxOffset + 8 - ArgSizeA8);
32228
32229    // Branch to "overflowMBB" if offset >= max
32230    // Fall through to "offsetMBB" otherwise
32231    BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
32232      .addMBB(overflowMBB).addImm(X86::COND_AE);
32233  }
32234
32235  // In offsetMBB, emit code to use the reg_save_area.
32236  if (offsetMBB) {
32237    assert(OffsetReg != 0);
32238
32239    // Read the reg_save_area address.
32240    Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
32241    BuildMI(
32242        offsetMBB, DL,
32243        TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32244        RegSaveReg)
32245        .add(Base)
32246        .add(Scale)
32247        .add(Index)
32248        .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
32249        .add(Segment)
32250        .setMemRefs(LoadOnlyMMO);
32251
32252    if (Subtarget.isTarget64BitLP64()) {
32253      // Zero-extend the offset
32254      Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
32255      BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
32256          .addImm(0)
32257          .addReg(OffsetReg)
32258          .addImm(X86::sub_32bit);
32259
32260      // Add the offset to the reg_save_area to get the final address.
32261      BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
32262          .addReg(OffsetReg64)
32263          .addReg(RegSaveReg);
32264    } else {
32265      // Add the offset to the reg_save_area to get the final address.
32266      BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
32267          .addReg(OffsetReg)
32268          .addReg(RegSaveReg);
32269    }
32270
32271    // Compute the offset for the next argument
32272    Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32273    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
32274      .addReg(OffsetReg)
32275      .addImm(UseFPOffset ? 16 : 8);
32276
32277    // Store it back into the va_list.
32278    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
32279        .add(Base)
32280        .add(Scale)
32281        .add(Index)
32282        .addDisp(Disp, UseFPOffset ? 4 : 0)
32283        .add(Segment)
32284        .addReg(NextOffsetReg)
32285        .setMemRefs(StoreOnlyMMO);
32286
32287    // Jump to endMBB
32288    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
32289      .addMBB(endMBB);
32290  }
32291
32292  //
32293  // Emit code to use overflow area
32294  //
32295
32296  // Load the overflow_area address into a register.
32297  Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
32298  BuildMI(overflowMBB, DL,
32299          TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32300          OverflowAddrReg)
32301      .add(Base)
32302      .add(Scale)
32303      .add(Index)
32304      .addDisp(Disp, 8)
32305      .add(Segment)
32306      .setMemRefs(LoadOnlyMMO);
32307
32308  // If we need to align it, do so. Otherwise, just copy the address
32309  // to OverflowDestReg.
32310  if (NeedsAlign) {
32311    // Align the overflow address
32312    Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
32313
32314    // aligned_addr = (addr + (align-1)) & ~(align-1)
32315    BuildMI(
32316        overflowMBB, DL,
32317        TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32318        TmpReg)
32319        .addReg(OverflowAddrReg)
32320        .addImm(Alignment.value() - 1);
32321
32322    BuildMI(
32323        overflowMBB, DL,
32324        TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
32325        OverflowDestReg)
32326        .addReg(TmpReg)
32327        .addImm(~(uint64_t)(Alignment.value() - 1));
32328  } else {
32329    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
32330      .addReg(OverflowAddrReg);
32331  }
32332
32333  // Compute the next overflow address after this argument.
32334  // (the overflow address should be kept 8-byte aligned)
32335  Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
32336  BuildMI(
32337      overflowMBB, DL,
32338      TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32339      NextAddrReg)
32340      .addReg(OverflowDestReg)
32341      .addImm(ArgSizeA8);
32342
32343  // Store the new overflow address.
32344  BuildMI(overflowMBB, DL,
32345          TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
32346      .add(Base)
32347      .add(Scale)
32348      .add(Index)
32349      .addDisp(Disp, 8)
32350      .add(Segment)
32351      .addReg(NextAddrReg)
32352      .setMemRefs(StoreOnlyMMO);
32353
32354  // If we branched, emit the PHI to the front of endMBB.
32355  if (offsetMBB) {
32356    BuildMI(*endMBB, endMBB->begin(), DL,
32357            TII->get(X86::PHI), DestReg)
32358      .addReg(OffsetDestReg).addMBB(offsetMBB)
32359      .addReg(OverflowDestReg).addMBB(overflowMBB);
32360  }
32361
32362  // Erase the pseudo instruction
32363  MI.eraseFromParent();
32364
32365  return endMBB;
32366}
32367
32368// The EFLAGS operand of SelectItr might be missing a kill marker
32369// because there were multiple uses of EFLAGS, and ISel didn't know
32370// which to mark. Figure out whether SelectItr should have had a
32371// kill marker, and set it if it should. Returns the correct kill
32372// marker value.
32373static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
32374                                     MachineBasicBlock* BB,
32375                                     const TargetRegisterInfo* TRI) {
32376  if (isEFLAGSLiveAfter(SelectItr, BB))
32377    return false;
32378
32379  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
32380  // out. SelectMI should have a kill flag on EFLAGS.
32381  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
32382  return true;
32383}
32384
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic block with a
// conditional jump around it.
32388static bool isCMOVPseudo(MachineInstr &MI) {
32389  switch (MI.getOpcode()) {
32390  case X86::CMOV_FR32:
32391  case X86::CMOV_FR32X:
32392  case X86::CMOV_FR64:
32393  case X86::CMOV_FR64X:
32394  case X86::CMOV_GR8:
32395  case X86::CMOV_GR16:
32396  case X86::CMOV_GR32:
32397  case X86::CMOV_RFP32:
32398  case X86::CMOV_RFP64:
32399  case X86::CMOV_RFP80:
32400  case X86::CMOV_VR64:
32401  case X86::CMOV_VR128:
32402  case X86::CMOV_VR128X:
32403  case X86::CMOV_VR256:
32404  case X86::CMOV_VR256X:
32405  case X86::CMOV_VR512:
32406  case X86::CMOV_VK1:
32407  case X86::CMOV_VK2:
32408  case X86::CMOV_VK4:
32409  case X86::CMOV_VK8:
32410  case X86::CMOV_VK16:
32411  case X86::CMOV_VK32:
32412  case X86::CMOV_VK64:
32413    return true;
32414
32415  default:
32416    return false;
32417  }
32418}
32419
32420// Helper function, which inserts PHI functions into SinkMBB:
32421//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
32422// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
32423// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
32424// the last PHI function inserted.
32425static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
32426    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
32427    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
32428    MachineBasicBlock *SinkMBB) {
32429  MachineFunction *MF = TrueMBB->getParent();
32430  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
32431  const DebugLoc &DL = MIItBegin->getDebugLoc();
32432
32433  X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
32434  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32435
32436  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
32437
32438  // As we are creating the PHIs, we have to be careful if there is more than
32439  // one.  Later CMOVs may reference the results of earlier CMOVs, but later
32440  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from each earlier PHI's
  // destination register to the registers that went into that PHI.
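  // For example, if a later CMOV uses the result of an earlier one, its PHI
  // in SinkMBB cannot reference that result directly; the table substitutes
  // the value the earlier PHI receives on the corresponding (FalseMBB or
  // TrueMBB) edge.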
32444  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
32445  MachineInstrBuilder MIB;
32446
32447  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
32448    Register DestReg = MIIt->getOperand(0).getReg();
32449    Register Op1Reg = MIIt->getOperand(1).getReg();
32450    Register Op2Reg = MIIt->getOperand(2).getReg();
32451
32452    // If this CMOV we are generating is the opposite condition from
32453    // the jump we generated, then we have to swap the operands for the
32454    // PHI that is going to be generated.
32455    if (MIIt->getOperand(3).getImm() == OppCC)
32456      std::swap(Op1Reg, Op2Reg);
32457
32458    if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
32459      Op1Reg = RegRewriteTable[Op1Reg].first;
32460
32461    if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
32462      Op2Reg = RegRewriteTable[Op2Reg].second;
32463
32464    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
32465              .addReg(Op1Reg)
32466              .addMBB(FalseMBB)
32467              .addReg(Op2Reg)
32468              .addMBB(TrueMBB);
32469
32470    // Add this PHI to the rewrite table.
32471    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
32472  }
32473
32474  return MIB;
32475}
32476
// Lower cascaded selects of the form
// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
32478MachineBasicBlock *
32479X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
32480                                             MachineInstr &SecondCascadedCMOV,
32481                                             MachineBasicBlock *ThisMBB) const {
32482  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32483  const DebugLoc &DL = FirstCMOV.getDebugLoc();
32484
32485  // We lower cascaded CMOVs such as
32486  //
32487  //   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
32488  //
32489  // to two successive branches.
32490  //
32491  // Without this, we would add a PHI between the two jumps, which ends up
32492  // creating a few copies all around. For instance, for
32493  //
32494  //    (sitofp (zext (fcmp une)))
32495  //
32496  // we would generate:
32497  //
32498  //         ucomiss %xmm1, %xmm0
32499  //         movss  <1.0f>, %xmm0
32500  //         movaps  %xmm0, %xmm1
32501  //         jne     .LBB5_2
32502  //         xorps   %xmm1, %xmm1
32503  // .LBB5_2:
32504  //         jp      .LBB5_4
32505  //         movaps  %xmm1, %xmm0
32506  // .LBB5_4:
32507  //         retq
32508  //
32509  // because this custom-inserter would have generated:
32510  //
32511  //   A
32512  //   | \
32513  //   |  B
32514  //   | /
32515  //   C
32516  //   | \
32517  //   |  D
32518  //   | /
32519  //   E
32520  //
32521  // A: X = ...; Y = ...
32522  // B: empty
32523  // C: Z = PHI [X, A], [Y, B]
32524  // D: empty
32525  // E: PHI [X, C], [Z, D]
32526  //
32527  // If we lower both CMOVs in a single step, we can instead generate:
32528  //
32529  //   A
32530  //   | \
32531  //   |  C
32532  //   | /|
32533  //   |/ |
32534  //   |  |
32535  //   |  D
32536  //   | /
32537  //   E
32538  //
32539  // A: X = ...; Y = ...
32540  // D: empty
32541  // E: PHI [X, A], [X, C], [Y, D]
32542  //
32543  // Which, in our sitofp/fcmp example, gives us something like:
32544  //
32545  //         ucomiss %xmm1, %xmm0
32546  //         movss  <1.0f>, %xmm0
32547  //         jne     .LBB5_4
32548  //         jp      .LBB5_4
32549  //         xorps   %xmm0, %xmm0
32550  // .LBB5_4:
32551  //         retq
32552  //
32553
32554  // We lower cascaded CMOV into two successive branches to the same block.
32555  // EFLAGS is used by both, so mark it as live in the second.
32556  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32557  MachineFunction *F = ThisMBB->getParent();
32558  MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32559  MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32560  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32561
32562  MachineFunction::iterator It = ++ThisMBB->getIterator();
32563  F->insert(It, FirstInsertedMBB);
32564  F->insert(It, SecondInsertedMBB);
32565  F->insert(It, SinkMBB);
32566
32567  // For a cascaded CMOV, we lower it to two successive branches to
32568  // the same block (SinkMBB).  EFLAGS is used by both, so mark it as live in
32569  // the FirstInsertedMBB.
32570  FirstInsertedMBB->addLiveIn(X86::EFLAGS);
32571
32572  // If the EFLAGS register isn't dead in the terminator, then claim that it's
32573  // live into the sink and copy blocks.
32574  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32575  if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
32576      !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
32577    SecondInsertedMBB->addLiveIn(X86::EFLAGS);
32578    SinkMBB->addLiveIn(X86::EFLAGS);
32579  }
32580
32581  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32582  SinkMBB->splice(SinkMBB->begin(), ThisMBB,
32583                  std::next(MachineBasicBlock::iterator(FirstCMOV)),
32584                  ThisMBB->end());
32585  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32586
32587  // Fallthrough block for ThisMBB.
32588  ThisMBB->addSuccessor(FirstInsertedMBB);
32589  // The true block target of the first branch is always SinkMBB.
32590  ThisMBB->addSuccessor(SinkMBB);
32591  // Fallthrough block for FirstInsertedMBB.
32592  FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
32593  // The true block for the branch of FirstInsertedMBB.
32594  FirstInsertedMBB->addSuccessor(SinkMBB);
32595  // This is fallthrough.
32596  SecondInsertedMBB->addSuccessor(SinkMBB);
32597
32598  // Create the conditional branch instructions.
32599  X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
32600  BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
32601
32602  X86::CondCode SecondCC =
32603      X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
32604  BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
32605
32606  //  SinkMBB:
32607  //   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
32608  Register DestReg = FirstCMOV.getOperand(0).getReg();
32609  Register Op1Reg = FirstCMOV.getOperand(1).getReg();
32610  Register Op2Reg = FirstCMOV.getOperand(2).getReg();
32611  MachineInstrBuilder MIB =
32612      BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
32613          .addReg(Op1Reg)
32614          .addMBB(SecondInsertedMBB)
32615          .addReg(Op2Reg)
32616          .addMBB(ThisMBB);
32617
32618  // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
32619  // (the True operand of the SELECT_CC/CMOV nodes).
32620  MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
32621  // Copy the PHI result to the register defined by the second CMOV.
32622  BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
32623          TII->get(TargetOpcode::COPY),
32624          SecondCascadedCMOV.getOperand(0).getReg())
32625      .addReg(FirstCMOV.getOperand(0).getReg());
32626
32627  // Now remove the CMOVs.
32628  FirstCMOV.eraseFromParent();
32629  SecondCascadedCMOV.eraseFromParent();
32630
32631  return SinkMBB;
32632}
32633
32634MachineBasicBlock *
32635X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
32636                                     MachineBasicBlock *ThisMBB) const {
32637  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32638  const DebugLoc &DL = MI.getDebugLoc();
32639
32640  // To "insert" a SELECT_CC instruction, we actually have to insert the
32641  // diamond control-flow pattern.  The incoming instruction knows the
32642  // destination vreg to set, the condition code register to branch on, the
32643  // true/false values to select between and a branch opcode to use.
32644
32645  //  ThisMBB:
32646  //  ...
32647  //   TrueVal = ...
32648  //   cmpTY ccX, r1, r2
32649  //   bCC copy1MBB
32650  //   fallthrough --> FalseMBB
32651
32652  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
32653  // as described above, by inserting a BB, and then making a PHI at the join
32654  // point to select the true and false operands of the CMOV in the PHI.
32655  //
32656  // The code also handles two different cases of multiple CMOV opcodes
32657  // in a row.
32658  //
32659  // Case 1:
32660  // In this case, there are multiple CMOVs in a row, all of which are based on
32661  // the same condition setting (or the exact opposite condition setting).
32662  // In this case we can lower all the CMOVs using a single inserted BB, and
32663  // then make a number of PHIs at the join point to model the CMOVs. The only
32664  // trickiness here is that in a case like:
32665  //
32666  // t2 = CMOV cond1 t1, f1
32667  // t3 = CMOV cond1 t2, f2
32668  //
32669  // when rewriting this into PHIs, we have to perform some renaming on the
32670  // temps since you cannot have a PHI operand refer to a PHI result earlier
32671  // in the same block.  The "simple" but wrong lowering would be:
32672  //
32673  // t2 = PHI t1(BB1), f1(BB2)
32674  // t3 = PHI t2(BB1), f2(BB2)
32675  //
32676  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
32677  // renaming is to note that on the path through BB1, t2 is really just a
32678  // copy of t1, and do that renaming, properly generating:
32679  //
32680  // t2 = PHI t1(BB1), f1(BB2)
32681  // t3 = PHI t1(BB1), f2(BB2)
32682  //
32683  // Case 2:
32684  // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
32685  // function - EmitLoweredCascadedSelect.
32686
32687  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
32688  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32689  MachineInstr *LastCMOV = &MI;
32690  MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
32691
32692  // Check for case 1, where there are multiple CMOVs with the same condition
32693  // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
32694  // number of jumps the most.
32695
32696  if (isCMOVPseudo(MI)) {
32697    // See if we have a string of CMOVs with the same condition. Skip over
32698    // intervening debug insts.
32699    while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
32700           (NextMIIt->getOperand(3).getImm() == CC ||
32701            NextMIIt->getOperand(3).getImm() == OppCC)) {
32702      LastCMOV = &*NextMIIt;
32703      NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
32704    }
32705  }
32706
32707  // Check for case 2, but only if we didn't already find case 1, as
32708  // indicated by LastCMOV == MI.
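  // That is: the second CMOV's operand 1 is the result of the first CMOV (and
  // that value dies there), and both CMOVs select the same operand 2, matching
  // the (CMOV (CMOV F, T, cc1), T, cc2) shape.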
32709  if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
32710      NextMIIt->getOpcode() == MI.getOpcode() &&
32711      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
32712      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
32713      NextMIIt->getOperand(1).isKill()) {
32714    return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
32715  }
32716
32717  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32718  MachineFunction *F = ThisMBB->getParent();
32719  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
32720  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32721
32722  MachineFunction::iterator It = ++ThisMBB->getIterator();
32723  F->insert(It, FalseMBB);
32724  F->insert(It, SinkMBB);
32725
32726  // If the EFLAGS register isn't dead in the terminator, then claim that it's
32727  // live into the sink and copy blocks.
32728  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32729  if (!LastCMOV->killsRegister(X86::EFLAGS) &&
32730      !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
32731    FalseMBB->addLiveIn(X86::EFLAGS);
32732    SinkMBB->addLiveIn(X86::EFLAGS);
32733  }
32734
32735  // Transfer any debug instructions inside the CMOV sequence to the sunk block.
32736  auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
32737  auto DbgIt = MachineBasicBlock::iterator(MI);
32738  while (DbgIt != DbgEnd) {
32739    auto Next = std::next(DbgIt);
32740    if (DbgIt->isDebugInstr())
32741      SinkMBB->push_back(DbgIt->removeFromParent());
32742    DbgIt = Next;
32743  }
32744
32745  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32746  SinkMBB->splice(SinkMBB->end(), ThisMBB,
32747                  std::next(MachineBasicBlock::iterator(LastCMOV)),
32748                  ThisMBB->end());
32749  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32750
32751  // Fallthrough block for ThisMBB.
32752  ThisMBB->addSuccessor(FalseMBB);
32753  // The true block target of the first (or only) branch is always SinkMBB.
32754  ThisMBB->addSuccessor(SinkMBB);
32755  // Fallthrough block for FalseMBB.
32756  FalseMBB->addSuccessor(SinkMBB);
32757
32758  // Create the conditional branch instruction.
32759  BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
32760
32761  //  SinkMBB:
32762  //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
32763  //  ...
32764  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
32765  MachineBasicBlock::iterator MIItEnd =
32766      std::next(MachineBasicBlock::iterator(LastCMOV));
32767  createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
32768
32769  // Now remove the CMOV(s).
32770  ThisMBB->erase(MIItBegin, MIItEnd);
32771
32772  return SinkMBB;
32773}
32774
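// Return the immediate-form SUB opcode to use for the given stack pointer
// width (64-bit for LP64, otherwise 32-bit) and immediate size (8-bit vs.
// 32-bit encoding).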
32775static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
32776  if (IsLP64) {
32777    if (isInt<8>(Imm))
32778      return X86::SUB64ri8;
32779    return X86::SUB64ri32;
32780  } else {
32781    if (isInt<8>(Imm))
32782      return X86::SUB32ri8;
32783    return X86::SUB32ri;
32784  }
32785}
32786
32787MachineBasicBlock *
32788X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
32789                                           MachineBasicBlock *MBB) const {
32790  MachineFunction *MF = MBB->getParent();
32791  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32792  const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
32793  const DebugLoc &DL = MI.getDebugLoc();
32794  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32795
32796  const unsigned ProbeSize = getStackProbeSize(*MF);
32797
32798  MachineRegisterInfo &MRI = MF->getRegInfo();
32799  MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32800  MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32801  MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32802
32803  MachineFunction::iterator MBBIter = ++MBB->getIterator();
32804  MF->insert(MBBIter, testMBB);
32805  MF->insert(MBBIter, blockMBB);
32806  MF->insert(MBBIter, tailMBB);
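  // The probing loop is structured as:
  //   testMBB:  exit to tailMBB once SP has reached the final value
  //   blockMBB: touch the current page, then bump SP down by ProbeSize
  //   tailMBB:  the remainder of the original block, which uses the result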
32807
32808  Register sizeVReg = MI.getOperand(1).getReg();
32809
32810  Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
32811
32812  Register TmpStackPtr = MRI.createVirtualRegister(
32813      TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
32814  Register FinalStackPtr = MRI.createVirtualRegister(
32815      TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
32816
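  // Compute the target stack pointer, FinalStackPtr = SP - size; the loop
  // below walks SP down to it ProbeSize bytes at a time.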
32817  BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
32818      .addReg(physSPReg);
32819  {
32820    const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
32821    BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
32822        .addReg(TmpStackPtr)
32823        .addReg(sizeVReg);
32824  }
32825
32826  // Loop test: exit once the stack pointer has reached (or passed) FinalStackPtr.
32827
32828  BuildMI(testMBB, DL,
32829          TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
32830      .addReg(FinalStackPtr)
32831      .addReg(physSPReg);
32832
32833  BuildMI(testMBB, DL, TII->get(X86::JCC_1))
32834      .addMBB(tailMBB)
32835      .addImm(X86::COND_GE);
32836  testMBB->addSuccessor(blockMBB);
32837  testMBB->addSuccessor(tailMBB);
32838
32839  // Touch the current page, then extend the stack. This is the opposite order
32840  // from a static probe, where we allocate and then touch; it avoids having to
32841  // probe the tail of the static alloca. Possible scenarios are:
32842  //
32843  //       + ---- <- ------------ <- ------------- <- ------------ +
32844  //       |                                                       |
32845  // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
32846  //                                                               |                                                               |
32847  //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
32848  //
32849  // The property we want to enforce is to never have more than [page alloc] between two probes.
32850
32851  const unsigned XORMIOpc =
32852      TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
32853  addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
32854      .addImm(0);
32855
32856  BuildMI(blockMBB, DL,
32857          TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
32858      .addReg(physSPReg)
32859      .addImm(ProbeSize);
32860
32862  BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
32863  blockMBB->addSuccessor(testMBB);
32864
32865  // Replace the original instruction's result with the final stack pointer.
32866  BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
32867      .addReg(FinalStackPtr);
32868
32869  tailMBB->splice(tailMBB->end(), MBB,
32870                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32871  tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
32872  MBB->addSuccessor(testMBB);
32873
32874  // Delete the original pseudo instruction.
32875  MI.eraseFromParent();
32876
32877  // And we're done.
32878  return tailMBB;
32879}
32880
32881MachineBasicBlock *
32882X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
32883                                        MachineBasicBlock *BB) const {
32884  MachineFunction *MF = BB->getParent();
32885  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32886  const DebugLoc &DL = MI.getDebugLoc();
32887  const BasicBlock *LLVM_BB = BB->getBasicBlock();
32888
32889  assert(MF->shouldSplitStack());
32890
32891  const bool Is64Bit = Subtarget.is64Bit();
32892  const bool IsLP64 = Subtarget.isTarget64BitLP64();
32893
32894  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
32895  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
32896
32897  // BB:
32898  //  ... [Till the alloca]
32899  // If stacklet is not large enough, jump to mallocMBB
32900  //
32901  // bumpMBB:
32902  //  Allocate by subtracting from RSP
32903  //  Jump to continueMBB
32904  //
32905  // mallocMBB:
32906  //  Allocate by call to runtime
32907  //
32908  // continueMBB:
32909  //  ...
32910  //  [rest of original BB]
32911  //
32912
32913  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32914  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32915  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32916
32917  MachineRegisterInfo &MRI = MF->getRegInfo();
32918  const TargetRegisterClass *AddrRegClass =
32919      getRegClassFor(getPointerTy(MF->getDataLayout()));
32920
32921  Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
32922           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
32923           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
32924           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
32925           sizeVReg = MI.getOperand(1).getReg(),
32926           physSPReg =
32927               IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
32928
32929  MachineFunction::iterator MBBIter = ++BB->getIterator();
32930
32931  MF->insert(MBBIter, bumpMBB);
32932  MF->insert(MBBIter, mallocMBB);
32933  MF->insert(MBBIter, continueMBB);
32934
32935  continueMBB->splice(continueMBB->begin(), BB,
32936                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
32937  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
32938
32939  // Add code to the main basic block to check if the stack limit has been hit,
32940  // and if so, jump to mallocMBB otherwise to bumpMBB.
32941  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
32942  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
32943    .addReg(tmpSPVReg).addReg(sizeVReg);
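  // Compare the would-be stack pointer against the stack limit held in
  // thread-local storage. The memory operand below is base=0, scale=1,
  // index=0, disp=TlsOffset, segment=TlsReg (i.e. %fs/%gs:TlsOffset).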
32944  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
32945    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
32946    .addReg(SPLimitVReg);
32947  BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
32948
32949  // bumpMBB simply decreases the stack pointer, since we know the current
32950  // stacklet has enough space.
32951  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
32952    .addReg(SPLimitVReg);
32953  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
32954    .addReg(SPLimitVReg);
32955  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
32956
32957  // Calls into a routine in libgcc to allocate more space from the heap.
32958  const uint32_t *RegMask =
32959      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
32960  if (IsLP64) {
32961    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
32962      .addReg(sizeVReg);
32963    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
32964      .addExternalSymbol("__morestack_allocate_stack_space")
32965      .addRegMask(RegMask)
32966      .addReg(X86::RDI, RegState::Implicit)
32967      .addReg(X86::RAX, RegState::ImplicitDefine);
32968  } else if (Is64Bit) {
32969    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
32970      .addReg(sizeVReg);
32971    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
32972      .addExternalSymbol("__morestack_allocate_stack_space")
32973      .addRegMask(RegMask)
32974      .addReg(X86::EDI, RegState::Implicit)
32975      .addReg(X86::EAX, RegState::ImplicitDefine);
32976  } else {
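    // 32-bit: the size is passed on the stack. The 12-byte adjustment plus the
    // 4-byte push presumably keep the outgoing stack 16-byte aligned for the
    // call; both are undone by the 16-byte ADD below.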
32977    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
32978      .addImm(12);
32979    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
32980    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
32981      .addExternalSymbol("__morestack_allocate_stack_space")
32982      .addRegMask(RegMask)
32983      .addReg(X86::EAX, RegState::ImplicitDefine);
32984  }
32985
32986  if (!Is64Bit)
32987    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
32988      .addImm(16);
32989
32990  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
32991    .addReg(IsLP64 ? X86::RAX : X86::EAX);
32992  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
32993
32994  // Set up the CFG correctly.
32995  BB->addSuccessor(bumpMBB);
32996  BB->addSuccessor(mallocMBB);
32997  mallocMBB->addSuccessor(continueMBB);
32998  bumpMBB->addSuccessor(continueMBB);
32999
33000  // Take care of the PHI nodes.
33001  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
33002          MI.getOperand(0).getReg())
33003      .addReg(mallocPtrVReg)
33004      .addMBB(mallocMBB)
33005      .addReg(bumpSPPtrVReg)
33006      .addMBB(bumpMBB);
33007
33008  // Delete the original pseudo instruction.
33009  MI.eraseFromParent();
33010
33011  // And we're done.
33012  return continueMBB;
33013}
33014
33015MachineBasicBlock *
33016X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
33017                                       MachineBasicBlock *BB) const {
33018  MachineFunction *MF = BB->getParent();
33019  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33020  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
33021  const DebugLoc &DL = MI.getDebugLoc();
33022
33023  assert(!isAsynchronousEHPersonality(
33024             classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
33025         "SEH does not use catchret!");
33026
33027  // Only 32-bit EH needs to worry about manually restoring stack pointers.
33028  if (!Subtarget.is32Bit())
33029    return BB;
33030
33031  // C++ EH creates a new target block to hold the restore code, and wires up
33032  // the new block to the return destination with a normal JMP_4.
33033  MachineBasicBlock *RestoreMBB =
33034      MF->CreateMachineBasicBlock(BB->getBasicBlock());
33035  assert(BB->succ_size() == 1);
33036  MF->insert(std::next(BB->getIterator()), RestoreMBB);
33037  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
33038  BB->addSuccessor(RestoreMBB);
33039  MI.getOperand(0).setMBB(RestoreMBB);
33040
33041  // Marking this as an EH pad but not a funclet entry block causes PEI to
33042  // restore stack pointers in the block.
33043  RestoreMBB->setIsEHPad(true);
33044
33045  auto RestoreMBBI = RestoreMBB->begin();
33046  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
33047  return BB;
33048}
33049
33050MachineBasicBlock *
33051X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
33052                                      MachineBasicBlock *BB) const {
33053  // So, here we replace TLSADDR with the sequence:
33054  // adjust_stackdown -> TLSADDR -> adjust_stackup.
33055  // We need this because TLSADDR is lowered into calls
33056  // inside MC, therefore without the two markers shrink-wrapping
33057  // may push the prologue/epilogue past them.
33058  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33059  const DebugLoc &DL = MI.getDebugLoc();
33060  MachineFunction &MF = *BB->getParent();
33061
33062  // Emit CALLSEQ_START right before the instruction.
33063  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
33064  MachineInstrBuilder CallseqStart =
33065    BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
33066  BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
33067
33068  // Emit CALLSEQ_END right after the instruction.
33069  // We don't call erase from parent because we want to keep the
33070  // original instruction around.
33071  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
33072  MachineInstrBuilder CallseqEnd =
33073    BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
33074  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
33075
33076  return BB;
33077}
33078
33079MachineBasicBlock *
33080X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
33081                                      MachineBasicBlock *BB) const {
33082  // This is pretty easy.  We're taking the value that we received from
33083  // our load from the relocation, sticking it in either RDI (x86-64)
33084  // or EAX and doing an indirect call.  The return value will then
33085  // be in the normal return register.
33086  MachineFunction *F = BB->getParent();
33087  const X86InstrInfo *TII = Subtarget.getInstrInfo();
33088  const DebugLoc &DL = MI.getDebugLoc();
33089
33090  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
33091  assert(MI.getOperand(3).isGlobal() && "This should be a global");
33092
33093  // Get a register mask for the lowered call.
33094  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
33095  // proper register mask.
33096  const uint32_t *RegMask =
33097      Subtarget.is64Bit() ?
33098      Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
33099      Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
33100  if (Subtarget.is64Bit()) {
33101    MachineInstrBuilder MIB =
33102        BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
33103            .addReg(X86::RIP)
33104            .addImm(0)
33105            .addReg(0)
33106            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33107                              MI.getOperand(3).getTargetFlags())
33108            .addReg(0);
33109    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
33110    addDirectMem(MIB, X86::RDI);
33111    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
33112  } else if (!isPositionIndependent()) {
33113    MachineInstrBuilder MIB =
33114        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33115            .addReg(0)
33116            .addImm(0)
33117            .addReg(0)
33118            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33119                              MI.getOperand(3).getTargetFlags())
33120            .addReg(0);
33121    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33122    addDirectMem(MIB, X86::EAX);
33123    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33124  } else {
33125    MachineInstrBuilder MIB =
33126        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33127            .addReg(TII->getGlobalBaseReg(F))
33128            .addImm(0)
33129            .addReg(0)
33130            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33131                              MI.getOperand(3).getTargetFlags())
33132            .addReg(0);
33133    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33134    addDirectMem(MIB, X86::EAX);
33135    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33136  }
33137
33138  MI.eraseFromParent(); // The pseudo instruction is gone now.
33139  return BB;
33140}
33141
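// Return the direct call / tail-call opcode corresponding to the given
// INDIRECT_THUNK_* pseudo opcode.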
33142static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
33143  switch (RPOpc) {
33144  case X86::INDIRECT_THUNK_CALL32:
33145    return X86::CALLpcrel32;
33146  case X86::INDIRECT_THUNK_CALL64:
33147    return X86::CALL64pcrel32;
33148  case X86::INDIRECT_THUNK_TCRETURN32:
33149    return X86::TCRETURNdi;
33150  case X86::INDIRECT_THUNK_TCRETURN64:
33151    return X86::TCRETURNdi64;
33152  }
33153  llvm_unreachable("not indirect thunk opcode");
33154}
33155
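// Return the name of the thunk to call for the given callee register,
// depending on which mitigation is in use: an external (GCC-compatible)
// retpoline thunk, an LLVM-emitted retpoline, or the LVI
// control-flow-integrity thunk.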
33156static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
33157                                          unsigned Reg) {
33158  if (Subtarget.useRetpolineExternalThunk()) {
33159    // When using an external thunk for retpolines, we pick names that match the
33160    // names GCC happens to use as well. This helps simplify the implementation
33161    // of the thunks for kernels where they have no easy ability to create
33162    // aliases and are doing non-trivial configuration of the thunk's body. For
33163    // example, the Linux kernel will do boot-time hot patching of the thunk
33164    // bodies and cannot easily export aliases of these to loaded modules.
33165    //
33166    // Note that at any point in the future, we may need to change the semantics
33167    // of how we implement retpolines and at that time will likely change the
33168    // name of the called thunk. Essentially, there is no hard guarantee that
33169    // LLVM will generate calls to specific thunks; we merely make a best-effort
33170    // attempt to help out kernels and other systems where duplicating the
33171    // thunks is costly.
33172    switch (Reg) {
33173    case X86::EAX:
33174      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33175      return "__x86_indirect_thunk_eax";
33176    case X86::ECX:
33177      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33178      return "__x86_indirect_thunk_ecx";
33179    case X86::EDX:
33180      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33181      return "__x86_indirect_thunk_edx";
33182    case X86::EDI:
33183      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33184      return "__x86_indirect_thunk_edi";
33185    case X86::R11:
33186      assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33187      return "__x86_indirect_thunk_r11";
33188    }
33189    llvm_unreachable("unexpected reg for external indirect thunk");
33190  }
33191
33192  if (Subtarget.useRetpolineIndirectCalls() ||
33193      Subtarget.useRetpolineIndirectBranches()) {
33194    // When targeting an internal COMDAT thunk use an LLVM-specific name.
33195    switch (Reg) {
33196    case X86::EAX:
33197      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33198      return "__llvm_retpoline_eax";
33199    case X86::ECX:
33200      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33201      return "__llvm_retpoline_ecx";
33202    case X86::EDX:
33203      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33204      return "__llvm_retpoline_edx";
33205    case X86::EDI:
33206      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33207      return "__llvm_retpoline_edi";
33208    case X86::R11:
33209      assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33210      return "__llvm_retpoline_r11";
33211    }
33212    llvm_unreachable("unexpected reg for retpoline");
33213  }
33214
33215  if (Subtarget.useLVIControlFlowIntegrity()) {
33216    assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33217    return "__llvm_lvi_thunk_r11";
33218  }
33219  llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
33220}
33221
33222MachineBasicBlock *
33223X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
33224                                            MachineBasicBlock *BB) const {
33225  // Copy the virtual register into the R11 physical register and
33226  // call the retpoline thunk.
33227  const DebugLoc &DL = MI.getDebugLoc();
33228  const X86InstrInfo *TII = Subtarget.getInstrInfo();
33229  Register CalleeVReg = MI.getOperand(0).getReg();
33230  unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
33231
33232  // Find an available scratch register to hold the callee. On 64-bit, we can
33233  // just use R11, but we scan for uses anyway to ensure we don't generate
33234  // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
33235  // already a register use operand to the call to hold the callee. If none
33236  // are available, use EDI instead. EDI is chosen because EBX is the PIC base
33237  // register and ESI is the base pointer to realigned stack frames with VLAs.
33238  SmallVector<unsigned, 3> AvailableRegs;
33239  if (Subtarget.is64Bit())
33240    AvailableRegs.push_back(X86::R11);
33241  else
33242    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
33243
33244  // Zero out any registers that are already used.
33245  for (const auto &MO : MI.operands()) {
33246    if (MO.isReg() && MO.isUse())
33247      for (unsigned &Reg : AvailableRegs)
33248        if (Reg == MO.getReg())
33249          Reg = 0;
33250  }
33251
33252  // Choose the first remaining non-zero available register.
33253  unsigned AvailableReg = 0;
33254  for (unsigned MaybeReg : AvailableRegs) {
33255    if (MaybeReg) {
33256      AvailableReg = MaybeReg;
33257      break;
33258    }
33259  }
33260  if (!AvailableReg)
33261    report_fatal_error("calling convention incompatible with retpoline, no "
33262                       "available registers");
33263
33264  const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
33265
33266  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
33267      .addReg(CalleeVReg);
33268  MI.getOperand(0).ChangeToES(Symbol);
33269  MI.setDesc(TII->get(Opc));
33270  MachineInstrBuilder(*BB->getParent(), &MI)
33271      .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
33272  return BB;
33273}
33274
33275/// SetJmp implies a future control-flow change upon calling the corresponding
33276/// LongJmp.
33277/// Instead of using the 'return' instruction, the long jump fixes the stack and
33278/// performs an indirect branch. To do so it uses the registers that were stored
33279/// in the jump buffer (when calling SetJmp).
33280/// In case the shadow stack is enabled, we need to fix it as well, because some
33281/// return addresses will be skipped.
33282/// The function will save the SSP for future fixing in the function
33283/// emitLongJmpShadowStackFix.
33284/// \sa emitLongJmpShadowStackFix
33285/// \param [in] MI The temporary Machine Instruction for the builtin.
33286/// \param [in] MBB The Machine Basic Block that will be modified.
33287void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
33288                                                 MachineBasicBlock *MBB) const {
33289  const DebugLoc &DL = MI.getDebugLoc();
33290  MachineFunction *MF = MBB->getParent();
33291  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33292  MachineRegisterInfo &MRI = MF->getRegInfo();
33293  MachineInstrBuilder MIB;
33294
33295  // Memory Reference.
33296  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33297                                           MI.memoperands_end());
33298
33299  // Initialize a register with zero.
33300  MVT PVT = getPointerTy(MF->getDataLayout());
33301  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33302  Register ZReg = MRI.createVirtualRegister(PtrRC);
33303  unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
33304  BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
33305      .addDef(ZReg)
33306      .addReg(ZReg, RegState::Undef)
33307      .addReg(ZReg, RegState::Undef);
33308
33309  // Read the current SSP Register value to the zeroed register.
33310  Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33311  unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33312  BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33313
33314  // Write the SSP register value to offset 3 in input memory buffer.
33315  unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33316  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
33317  const int64_t SSPOffset = 3 * PVT.getStoreSize();
33318  const unsigned MemOpndSlot = 1;
33319  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33320    if (i == X86::AddrDisp)
33321      MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
33322    else
33323      MIB.add(MI.getOperand(MemOpndSlot + i));
33324  }
33325  MIB.addReg(SSPCopyReg);
33326  MIB.setMemRefs(MMOs);
33327}
33328
33329MachineBasicBlock *
33330X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
33331                                    MachineBasicBlock *MBB) const {
33332  const DebugLoc &DL = MI.getDebugLoc();
33333  MachineFunction *MF = MBB->getParent();
33334  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33335  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
33336  MachineRegisterInfo &MRI = MF->getRegInfo();
33337
33338  const BasicBlock *BB = MBB->getBasicBlock();
33339  MachineFunction::iterator I = ++MBB->getIterator();
33340
33341  // Memory Reference
33342  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33343                                           MI.memoperands_end());
33344
33345  unsigned DstReg;
33346  unsigned MemOpndSlot = 0;
33347
33348  unsigned CurOp = 0;
33349
33350  DstReg = MI.getOperand(CurOp++).getReg();
33351  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33352  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
33353  (void)TRI;
33354  Register mainDstReg = MRI.createVirtualRegister(RC);
33355  Register restoreDstReg = MRI.createVirtualRegister(RC);
33356
33357  MemOpndSlot = CurOp;
33358
33359  MVT PVT = getPointerTy(MF->getDataLayout());
33360  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
33361         "Invalid Pointer Size!");
33362
33363  // For v = setjmp(buf), we generate
33364  //
33365  // thisMBB:
33366  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
33367  //  SjLjSetup restoreMBB
33368  //
33369  // mainMBB:
33370  //  v_main = 0
33371  //
33372  // sinkMBB:
33373  //  v = phi(main, restore)
33374  //
33375  // restoreMBB:
33376  //  if base pointer being used, load it from frame
33377  //  v_restore = 1
33378
33379  MachineBasicBlock *thisMBB = MBB;
33380  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33381  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33382  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
33383  MF->insert(I, mainMBB);
33384  MF->insert(I, sinkMBB);
33385  MF->push_back(restoreMBB);
33386  restoreMBB->setHasAddressTaken();
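  // restoreMBB is the longjmp target: its address is stored in the jump
  // buffer, so it is marked address-taken and appended at the end of the
  // function rather than inserted after thisMBB.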
33387
33388  MachineInstrBuilder MIB;
33389
33390  // Transfer the remainder of BB and its successor edges to sinkMBB.
33391  sinkMBB->splice(sinkMBB->begin(), MBB,
33392                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33393  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33394
33395  // thisMBB:
33396  unsigned PtrStoreOpc = 0;
33397  unsigned LabelReg = 0;
33398  const int64_t LabelOffset = 1 * PVT.getStoreSize();
33399  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33400                     !isPositionIndependent();
33401
33402  // Prepare IP either in reg or imm.
33403  if (!UseImmLabel) {
33404    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33405    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33406    LabelReg = MRI.createVirtualRegister(PtrRC);
33407    if (Subtarget.is64Bit()) {
33408      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
33409              .addReg(X86::RIP)
33410              .addImm(0)
33411              .addReg(0)
33412              .addMBB(restoreMBB)
33413              .addReg(0);
33414    } else {
33415      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
33416      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
33417              .addReg(XII->getGlobalBaseReg(MF))
33418              .addImm(0)
33419              .addReg(0)
33420              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
33421              .addReg(0);
33422    }
33423  } else
33424    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33425  // Store IP
33426  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
33427  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33428    if (i == X86::AddrDisp)
33429      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
33430    else
33431      MIB.add(MI.getOperand(MemOpndSlot + i));
33432  }
33433  if (!UseImmLabel)
33434    MIB.addReg(LabelReg);
33435  else
33436    MIB.addMBB(restoreMBB);
33437  MIB.setMemRefs(MMOs);
33438
33439  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33440    emitSetJmpShadowStackFix(MI, thisMBB);
33441  }
33442
33443  // Setup
33444  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
33445          .addMBB(restoreMBB);
33446
33447  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33448  MIB.addRegMask(RegInfo->getNoPreservedMask());
33449  thisMBB->addSuccessor(mainMBB);
33450  thisMBB->addSuccessor(restoreMBB);
33451
33452  // mainMBB:
33453  //  EAX = 0
33454  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
33455  mainMBB->addSuccessor(sinkMBB);
33456
33457  // sinkMBB:
33458  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
33459          TII->get(X86::PHI), DstReg)
33460    .addReg(mainDstReg).addMBB(mainMBB)
33461    .addReg(restoreDstReg).addMBB(restoreMBB);
33462
33463  // restoreMBB:
33464  if (RegInfo->hasBasePointer(*MF)) {
33465    const bool Uses64BitFramePtr =
33466        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
33467    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
33468    X86FI->setRestoreBasePointer(MF);
33469    Register FramePtr = RegInfo->getFrameRegister(*MF);
33470    Register BasePtr = RegInfo->getBaseRegister();
33471    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
33472    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
33473                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
33474      .setMIFlag(MachineInstr::FrameSetup);
33475  }
33476  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
33477  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33478  restoreMBB->addSuccessor(sinkMBB);
33479
33480  MI.eraseFromParent();
33481  return sinkMBB;
33482}
33483
33484/// Fix the shadow stack using the previously saved SSP pointer.
33485/// \sa emitSetJmpShadowStackFix
33486/// \param [in] MI The temporary Machine Instruction for the builtin.
33487/// \param [in] MBB The Machine Basic Block that will be modified.
33488/// \return The sink MBB that will perform the future indirect branch.
33489MachineBasicBlock *
33490X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
33491                                             MachineBasicBlock *MBB) const {
33492  const DebugLoc &DL = MI.getDebugLoc();
33493  MachineFunction *MF = MBB->getParent();
33494  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33495  MachineRegisterInfo &MRI = MF->getRegInfo();
33496
33497  // Memory Reference
33498  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33499                                           MI.memoperands_end());
33500
33501  MVT PVT = getPointerTy(MF->getDataLayout());
33502  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33503
33504  // checkSspMBB:
33505  //         xor vreg1, vreg1
33506  //         rdssp vreg1
33507  //         test vreg1, vreg1
33508  //         je sinkMBB   # Jump if Shadow Stack is not supported
33509  // fallMBB:
33510  //         mov buf+24/12(%rip), vreg2
33511  //         sub vreg1, vreg2
33512  //         jbe sinkMBB  # No need to fix the Shadow Stack
33513  // fixShadowMBB:
33514  //         shr 3/2, vreg2
33515  //         incssp vreg2  # fix the SSP according to the lower 8 bits
33516  //         shr 8, vreg2
33517  //         je sinkMBB
33518  // fixShadowLoopPrepareMBB:
33519  //         shl vreg2
33520  //         mov 128, vreg3
33521  // fixShadowLoopMBB:
33522  //         incssp vreg3
33523  //         dec vreg2
33524  //         jne fixShadowLoopMBB # Iterate until you finish fixing
33525  //                              # the Shadow Stack
33526  // sinkMBB:
33527
33528  MachineFunction::iterator I = ++MBB->getIterator();
33529  const BasicBlock *BB = MBB->getBasicBlock();
33530
33531  MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
33532  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33533  MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
33534  MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
33535  MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
33536  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33537  MF->insert(I, checkSspMBB);
33538  MF->insert(I, fallMBB);
33539  MF->insert(I, fixShadowMBB);
33540  MF->insert(I, fixShadowLoopPrepareMBB);
33541  MF->insert(I, fixShadowLoopMBB);
33542  MF->insert(I, sinkMBB);
33543
33544  // Transfer the remainder of BB and its successor edges to sinkMBB.
33545  sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
33546                  MBB->end());
33547  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33548
33549  MBB->addSuccessor(checkSspMBB);
33550
33551  // Initialize a register with zero.
33552  Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
33553  BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
33554
33555  if (PVT == MVT::i64) {
33556    Register TmpZReg = MRI.createVirtualRegister(PtrRC);
33557    BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
33558      .addImm(0)
33559      .addReg(ZReg)
33560      .addImm(X86::sub_32bit);
33561    ZReg = TmpZReg;
33562  }
33563
33564  // Read the current SSP Register value to the zeroed register.
33565  Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33566  unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33567  BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33568
33569  // Check whether the result of the SSP register is zero and jump directly
33570  // to the sink.
33571  unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
33572  BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
33573      .addReg(SSPCopyReg)
33574      .addReg(SSPCopyReg);
33575  BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33576  checkSspMBB->addSuccessor(sinkMBB);
33577  checkSspMBB->addSuccessor(fallMBB);
33578
33579  // Reload the previously saved SSP register value.
33580  Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
33581  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33582  const int64_t SPPOffset = 3 * PVT.getStoreSize();
33583  MachineInstrBuilder MIB =
33584      BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
33585  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33586    const MachineOperand &MO = MI.getOperand(i);
33587    if (i == X86::AddrDisp)
33588      MIB.addDisp(MO, SPPOffset);
33589    else if (MO.isReg()) // Don't add the whole operand, we don't want to
33590                         // preserve kill flags.
33591      MIB.addReg(MO.getReg());
33592    else
33593      MIB.add(MO);
33594  }
33595  MIB.setMemRefs(MMOs);
33596
33597  // Subtract the current SSP from the previous SSP.
33598  Register SspSubReg = MRI.createVirtualRegister(PtrRC);
33599  unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
33600  BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
33601      .addReg(PrevSSPReg)
33602      .addReg(SSPCopyReg);
33603
33604  // Jump to sink in case PrevSSPReg <= SSPCopyReg.
33605  BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
33606  fallMBB->addSuccessor(sinkMBB);
33607  fallMBB->addSuccessor(fixShadowMBB);
33608
33609  // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
33610  unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
33611  unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
33612  Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
33613  BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
33614      .addReg(SspSubReg)
33615      .addImm(Offset);
33616
33617  // Increase the SSP, looking only at the lower 8 bits of the delta.
33618  unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
33619  BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
33620
33621  // Reset the lower 8 bits.
33622  Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
33623  BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
33624      .addReg(SspFirstShrReg)
33625      .addImm(8);
33626
33627  // Jump if the result of the shift is zero.
33628  BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33629  fixShadowMBB->addSuccessor(sinkMBB);
33630  fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
33631
33632  // Do a single shift left.
33633  unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
33634  Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
33635  BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
33636      .addReg(SspSecondShrReg);
33637
33638  // Save the value 128 to a register (will be used next with incssp).
33639  Register Value128InReg = MRI.createVirtualRegister(PtrRC);
33640  unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
33641  BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
33642      .addImm(128);
33643  fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
33644
33645  // Since incssp only looks at the lower 8 bits, we might need to do several
33646  // iterations of incssp until we finish fixing the shadow stack.
33647  Register DecReg = MRI.createVirtualRegister(PtrRC);
33648  Register CounterReg = MRI.createVirtualRegister(PtrRC);
33649  BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
33650      .addReg(SspAfterShlReg)
33651      .addMBB(fixShadowLoopPrepareMBB)
33652      .addReg(DecReg)
33653      .addMBB(fixShadowLoopMBB);
33654
33655  // Every iteration we increase the SSP by 128.
33656  BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
33657
33658  // Every iteration we decrement the counter by 1.
33659  unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
33660  BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
33661
33662  // Jump if the counter is not zero yet.
33663  BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
33664  fixShadowLoopMBB->addSuccessor(sinkMBB);
33665  fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
33666
33667  return sinkMBB;
33668}
33669
33670MachineBasicBlock *
33671X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
33672                                     MachineBasicBlock *MBB) const {
33673  const DebugLoc &DL = MI.getDebugLoc();
33674  MachineFunction *MF = MBB->getParent();
33675  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33676  MachineRegisterInfo &MRI = MF->getRegInfo();
33677
33678  // Memory Reference
33679  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33680                                           MI.memoperands_end());
33681
33682  MVT PVT = getPointerTy(MF->getDataLayout());
33683  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
33684         "Invalid Pointer Size!");
33685
33686  const TargetRegisterClass *RC =
33687    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33688  Register Tmp = MRI.createVirtualRegister(RC);
33689  // Since FP is only updated here but NOT referenced, it's treated as GPR.
33690  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33691  Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
33692  Register SP = RegInfo->getStackRegister();
33693
33694  MachineInstrBuilder MIB;
33695
33696  const int64_t LabelOffset = 1 * PVT.getStoreSize();
33697  const int64_t SPOffset = 2 * PVT.getStoreSize();
33698
33699  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33700  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
33701
33702  MachineBasicBlock *thisMBB = MBB;
33703
33704  // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
33705  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33706    thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
33707  }
33708
33709  // Reload FP
33710  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
33711  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33712    const MachineOperand &MO = MI.getOperand(i);
33713    if (MO.isReg()) // Don't add the whole operand, we don't want to
33714                    // preserve kill flags.
33715      MIB.addReg(MO.getReg());
33716    else
33717      MIB.add(MO);
33718  }
33719  MIB.setMemRefs(MMOs);
33720
33721  // Reload IP
33722  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
33723  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33724    const MachineOperand &MO = MI.getOperand(i);
33725    if (i == X86::AddrDisp)
33726      MIB.addDisp(MO, LabelOffset);
33727    else if (MO.isReg()) // Don't add the whole operand, we don't want to
33728                         // preserve kill flags.
33729      MIB.addReg(MO.getReg());
33730    else
33731      MIB.add(MO);
33732  }
33733  MIB.setMemRefs(MMOs);
33734
33735  // Reload SP
33736  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
33737  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33738    if (i == X86::AddrDisp)
33739      MIB.addDisp(MI.getOperand(i), SPOffset);
33740    else
33741      MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
33742                                 // the last instruction of the expansion.
33743  }
33744  MIB.setMemRefs(MMOs);
33745
33746  // Jump
33747  BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
33748
33749  MI.eraseFromParent();
33750  return thisMBB;
33751}
33752
33753void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
33754                                               MachineBasicBlock *MBB,
33755                                               MachineBasicBlock *DispatchBB,
33756                                               int FI) const {
33757  const DebugLoc &DL = MI.getDebugLoc();
33758  MachineFunction *MF = MBB->getParent();
33759  MachineRegisterInfo *MRI = &MF->getRegInfo();
33760  const X86InstrInfo *TII = Subtarget.getInstrInfo();
33761
33762  MVT PVT = getPointerTy(MF->getDataLayout());
33763  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
33764
33765  unsigned Op = 0;
33766  unsigned VR = 0;
33767
33768  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33769                     !isPositionIndependent();
33770
33771  if (UseImmLabel) {
33772    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33773  } else {
33774    const TargetRegisterClass *TRC =
33775        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33776    VR = MRI->createVirtualRegister(TRC);
33777    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33778
33779    if (Subtarget.is64Bit())
33780      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
33781          .addReg(X86::RIP)
33782          .addImm(1)
33783          .addReg(0)
33784          .addMBB(DispatchBB)
33785          .addReg(0);
33786    else
33787      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
33788          .addReg(0) /* TII->getGlobalBaseReg(MF) */
33789          .addImm(1)
33790          .addReg(0)
33791          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
33792          .addReg(0);
33793  }
33794
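  // Store the dispatch block's address into the function context (frame index
  // FI, offset 56 on 64-bit or 36 on 32-bit), either as an immediate block
  // address or from the register computed above.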
33795  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
33796  addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
33797  if (UseImmLabel)
33798    MIB.addMBB(DispatchBB);
33799  else
33800    MIB.addReg(VR);
33801}
33802
33803MachineBasicBlock *
33804X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
33805                                         MachineBasicBlock *BB) const {
33806  const DebugLoc &DL = MI.getDebugLoc();
33807  MachineFunction *MF = BB->getParent();
33808  MachineRegisterInfo *MRI = &MF->getRegInfo();
33809  const X86InstrInfo *TII = Subtarget.getInstrInfo();
33810  int FI = MF->getFrameInfo().getFunctionContextIndex();
33811
33812  // Get a mapping of the call site numbers to all of the landing pads they're
33813  // associated with.
33814  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
33815  unsigned MaxCSNum = 0;
33816  for (auto &MBB : *MF) {
33817    if (!MBB.isEHPad())
33818      continue;
33819
33820    MCSymbol *Sym = nullptr;
33821    for (const auto &MI : MBB) {
33822      if (MI.isDebugInstr())
33823        continue;
33824
33825      assert(MI.isEHLabel() && "expected EH_LABEL");
33826      Sym = MI.getOperand(0).getMCSymbol();
33827      break;
33828    }
33829
33830    if (!MF->hasCallSiteLandingPad(Sym))
33831      continue;
33832
33833    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
33834      CallSiteNumToLPad[CSI].push_back(&MBB);
33835      MaxCSNum = std::max(MaxCSNum, CSI);
33836    }
33837  }
33838
33839  // Get an ordered list of the machine basic blocks for the jump table.
33840  std::vector<MachineBasicBlock *> LPadList;
33841  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
33842  LPadList.reserve(CallSiteNumToLPad.size());
33843
33844  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
33845    for (auto &LP : CallSiteNumToLPad[CSI]) {
33846      LPadList.push_back(LP);
33847      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
33848    }
33849  }
33850
33851  assert(!LPadList.empty() &&
33852         "No landing pad destinations for the dispatch jump table!");
33853
33854  // Create the MBBs for the dispatch code.
33855
33856  // Shove the dispatch's address into the return slot in the function context.
33857  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
33858  DispatchBB->setIsEHPad(true);
33859
33860  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
33861  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
33862  DispatchBB->addSuccessor(TrapBB);
33863
33864  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
33865  DispatchBB->addSuccessor(DispContBB);
33866
33867  // Insert MBBs.
33868  MF->push_back(DispatchBB);
33869  MF->push_back(DispContBB);
33870  MF->push_back(TrapBB);
33871
33872  // Insert code into the entry block that creates and registers the function
33873  // context.
33874  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
33875
33876  // Create the jump table and associated information
33877  unsigned JTE = getJumpTableEncoding();
33878  MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
33879  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
33880
33881  const X86RegisterInfo &RI = TII->getRegisterInfo();
33882  // Add a register mask with no preserved registers.  This results in all
33883  // registers being marked as clobbered.
33884  if (RI.hasBasePointer(*MF)) {
33885    const bool FPIs64Bit =
33886        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
33887    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
33888    MFI->setRestoreBasePointer(MF);
33889
33890    Register FP = RI.getFrameRegister(*MF);
33891    Register BP = RI.getBaseRegister();
33892    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
33893    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
33894                 MFI->getRestoreBasePointerOffset())
33895        .addRegMask(RI.getNoPreservedMask());
33896  } else {
33897    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
33898        .addRegMask(RI.getNoPreservedMask());
33899  }
33900
33901  // IReg is used as an index in a memory operand and therefore can't be SP
33902  Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
33903  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
33904                    Subtarget.is64Bit() ? 8 : 4);
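  // Bounds-check the call site index loaded from the function context; any
  // out-of-range value branches to the trap block.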
33905  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
33906      .addReg(IReg)
33907      .addImm(LPadList.size());
33908  BuildMI(DispatchBB, DL, TII->get(X86::JCC_1))
      .addMBB(TrapBB)
      .addImm(X86::COND_AE);
33909
33910  if (Subtarget.is64Bit()) {
33911    Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
33912    Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
33913
33914    // leaq .LJTI0_0(%rip), BReg
33915    BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
33916        .addReg(X86::RIP)
33917        .addImm(1)
33918        .addReg(0)
33919        .addJumpTableIndex(MJTI)
33920        .addReg(0);
33921    // movzx IReg64, IReg
33922    BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
33923        .addImm(0)
33924        .addReg(IReg)
33925        .addImm(X86::sub_32bit);
33926
33927    switch (JTE) {
33928    case MachineJumpTableInfo::EK_BlockAddress:
33929      // jmpq *(BReg,IReg64,8)
33930      BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
33931          .addReg(BReg)
33932          .addImm(8)
33933          .addReg(IReg64)
33934          .addImm(0)
33935          .addReg(0);
33936      break;
33937    case MachineJumpTableInfo::EK_LabelDifference32: {
33938      Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
33939      Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
33940      Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
33941
33942      // movl (BReg,IReg64,4), OReg
33943      BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
33944          .addReg(BReg)
33945          .addImm(4)
33946          .addReg(IReg64)
33947          .addImm(0)
33948          .addReg(0);
33949      // movsx OReg64, OReg
33950      BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
33951      // addq BReg, OReg64, TReg
33952      BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
33953          .addReg(OReg64)
33954          .addReg(BReg);
33955      // jmpq *TReg
33956      BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
33957      break;
33958    }
33959    default:
33960      llvm_unreachable("Unexpected jump table encoding");
33961    }
33962  } else {
33963    // jmpl *.LJTI0_0(,IReg,4)
33964    BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
33965        .addReg(0)
33966        .addImm(4)
33967        .addReg(IReg)
33968        .addJumpTableIndex(MJTI)
33969        .addReg(0);
33970  }
33971
33972  // Add the jump table entries as successors to the MBB.
33973  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
33974  for (auto &LP : LPadList)
33975    if (SeenMBBs.insert(LP).second)
33976      DispContBB->addSuccessor(LP);
33977
33978  // N.B. the order the invoke BBs are processed in doesn't matter here.
33979  SmallVector<MachineBasicBlock *, 64> MBBLPads;
33980  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
33981  for (MachineBasicBlock *MBB : InvokeBBs) {
33982    // Remove the landing pad successor from the invoke block and replace it
33983    // with the new dispatch block.
33984    // Keep a copy of Successors since it's modified inside the loop.
33985    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
33986                                                   MBB->succ_rend());
33987    // FIXME: Avoid quadratic complexity.
33988    for (auto MBBS : Successors) {
33989      if (MBBS->isEHPad()) {
33990        MBB->removeSuccessor(MBBS);
33991        MBBLPads.push_back(MBBS);
33992      }
33993    }
33994
33995    MBB->addSuccessor(DispatchBB);
33996
33997    // Find the invoke call and mark all of the callee-saved registers as
33998    // 'implicit defined' so that they're spilled.  This prevents code from
33999    // moving instructions to before the EH block, where they will never be
34000    // executed.
34001    for (auto &II : reverse(*MBB)) {
34002      if (!II.isCall())
34003        continue;
34004
34005      DenseMap<unsigned, bool> DefRegs;
34006      for (auto &MOp : II.operands())
34007        if (MOp.isReg())
34008          DefRegs[MOp.getReg()] = true;
34009
34010      MachineInstrBuilder MIB(*MF, &II);
34011      for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
34012        unsigned Reg = SavedRegs[RegIdx];
34013        if (!DefRegs[Reg])
34014          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
34015      }
34016
34017      break;
34018    }
34019  }
34020
34021  // Mark all former landing pads as non-landing pads.  The dispatch is the only
34022  // landing pad now.
34023  for (auto &LP : MBBLPads)
34024    LP->setIsEHPad(false);
34025
34026  // The instruction is gone now.
34027  MI.eraseFromParent();
34028  return BB;
34029}
34030
34031MachineBasicBlock *
34032X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
34033                                               MachineBasicBlock *BB) const {
34034  MachineFunction *MF = BB->getParent();
34035  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34036  const DebugLoc &DL = MI.getDebugLoc();
34037
34038  auto TMMImmToTMMReg = [](unsigned Imm) {
34039    assert(Imm < 8 && "Illegal tmm index");
34040    return X86::TMM0 + Imm;
34041  };
34042  switch (MI.getOpcode()) {
34043  default: llvm_unreachable("Unexpected instr type to insert");
34044  case X86::TLS_addr32:
34045  case X86::TLS_addr64:
34046  case X86::TLS_addrX32:
34047  case X86::TLS_base_addr32:
34048  case X86::TLS_base_addr64:
34049  case X86::TLS_base_addrX32:
34050    return EmitLoweredTLSAddr(MI, BB);
34051  case X86::INDIRECT_THUNK_CALL32:
34052  case X86::INDIRECT_THUNK_CALL64:
34053  case X86::INDIRECT_THUNK_TCRETURN32:
34054  case X86::INDIRECT_THUNK_TCRETURN64:
34055    return EmitLoweredIndirectThunk(MI, BB);
34056  case X86::CATCHRET:
34057    return EmitLoweredCatchRet(MI, BB);
34058  case X86::SEG_ALLOCA_32:
34059  case X86::SEG_ALLOCA_64:
34060    return EmitLoweredSegAlloca(MI, BB);
34061  case X86::PROBED_ALLOCA_32:
34062  case X86::PROBED_ALLOCA_64:
34063    return EmitLoweredProbedAlloca(MI, BB);
34064  case X86::TLSCall_32:
34065  case X86::TLSCall_64:
34066    return EmitLoweredTLSCall(MI, BB);
34067  case X86::CMOV_FR32:
34068  case X86::CMOV_FR32X:
34069  case X86::CMOV_FR64:
34070  case X86::CMOV_FR64X:
34071  case X86::CMOV_GR8:
34072  case X86::CMOV_GR16:
34073  case X86::CMOV_GR32:
34074  case X86::CMOV_RFP32:
34075  case X86::CMOV_RFP64:
34076  case X86::CMOV_RFP80:
34077  case X86::CMOV_VR64:
34078  case X86::CMOV_VR128:
34079  case X86::CMOV_VR128X:
34080  case X86::CMOV_VR256:
34081  case X86::CMOV_VR256X:
34082  case X86::CMOV_VR512:
34083  case X86::CMOV_VK1:
34084  case X86::CMOV_VK2:
34085  case X86::CMOV_VK4:
34086  case X86::CMOV_VK8:
34087  case X86::CMOV_VK16:
34088  case X86::CMOV_VK32:
34089  case X86::CMOV_VK64:
34090    return EmitLoweredSelect(MI, BB);
34091
34092  case X86::RDFLAGS32:
34093  case X86::RDFLAGS64: {
34094    unsigned PushF =
34095        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
34096    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
34097    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
34098    // Permit reads of the EFLAGS and DF registers without them being defined.
34099    // This intrinsic exists to read external processor state in flags, such as
34100    // the trap flag, interrupt flag, and direction flag, none of which are
34101    // modeled by the backend.
34102    assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
34103           "Unexpected register in operand!");
34104    Push->getOperand(2).setIsUndef();
34105    assert(Push->getOperand(3).getReg() == X86::DF &&
34106           "Unexpected register in operand!");
34107    Push->getOperand(3).setIsUndef();
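    // Pop the saved flags value off the stack into the destination register.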
34108    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
34109
34110    MI.eraseFromParent(); // The pseudo is gone now.
34111    return BB;
34112  }
34113
34114  case X86::WRFLAGS32:
34115  case X86::WRFLAGS64: {
34116    unsigned Push =
34117        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
34118    unsigned PopF =
34119        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
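    // Push the source register onto the stack and pop it into the flags
    // register.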
34120    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
34121    BuildMI(*BB, MI, DL, TII->get(PopF));
34122
34123    MI.eraseFromParent(); // The pseudo is gone now.
34124    return BB;
34125  }
34126
34127  case X86::FP32_TO_INT16_IN_MEM:
34128  case X86::FP32_TO_INT32_IN_MEM:
34129  case X86::FP32_TO_INT64_IN_MEM:
34130  case X86::FP64_TO_INT16_IN_MEM:
34131  case X86::FP64_TO_INT32_IN_MEM:
34132  case X86::FP64_TO_INT64_IN_MEM:
34133  case X86::FP80_TO_INT16_IN_MEM:
34134  case X86::FP80_TO_INT32_IN_MEM:
34135  case X86::FP80_TO_INT64_IN_MEM: {
34136    // Change the floating point control register to use "round towards zero"
34137    // mode when truncating to an integer value.
34138    int OrigCWFrameIdx =
34139        MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34140    addFrameReference(BuildMI(*BB, MI, DL,
34141                              TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
34142
34143    // Load the old value of the control word...
34144    Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34145    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
34146                      OrigCWFrameIdx);
34147
34148    // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
34149    Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34150    BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
34151      .addReg(OldCW, RegState::Kill).addImm(0xC00);
34152
34153    // Extract to 16 bits.
34154    Register NewCW16 =
34155        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
34156    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
34157      .addReg(NewCW, RegState::Kill, X86::sub_16bit);
34158
34159    // Prepare memory for FLDCW.
34160    int NewCWFrameIdx =
34161        MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34162    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
34163                      NewCWFrameIdx)
34164      .addReg(NewCW16, RegState::Kill);
34165
34166    // Reload the modified control word now...
34167    addFrameReference(BuildMI(*BB, MI, DL,
34168                              TII->get(X86::FLDCW16m)), NewCWFrameIdx);
34169
34170    // Get the X86 opcode to use.
34171    unsigned Opc;
34172    switch (MI.getOpcode()) {
34173    default: llvm_unreachable("illegal opcode!");
34174    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
34175    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
34176    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
34177    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
34178    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
34179    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
34180    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
34181    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
34182    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
34183    }
34184
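    // Emit the FP-to-integer store through the pseudo's memory operands while
    // the round-toward-zero control word is in effect.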
34185    X86AddressMode AM = getAddressFromInstr(&MI, 0);
34186    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
34187        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
34188
34189    // Reload the original control word now.
34190    addFrameReference(BuildMI(*BB, MI, DL,
34191                              TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
34192
34193    MI.eraseFromParent(); // The pseudo instruction is gone now.
34194    return BB;
34195  }
34196
34197  // xbegin
34198  case X86::XBEGIN:
34199    return emitXBegin(MI, BB, Subtarget.getInstrInfo());
34200
34201  case X86::VAARG_64:
34202  case X86::VAARG_X32:
34203    return EmitVAARGWithCustomInserter(MI, BB);
34204
34205  case X86::EH_SjLj_SetJmp32:
34206  case X86::EH_SjLj_SetJmp64:
34207    return emitEHSjLjSetJmp(MI, BB);
34208
34209  case X86::EH_SjLj_LongJmp32:
34210  case X86::EH_SjLj_LongJmp64:
34211    return emitEHSjLjLongJmp(MI, BB);
34212
34213  case X86::Int_eh_sjlj_setup_dispatch:
34214    return EmitSjLjDispatchBlock(MI, BB);
34215
34216  case TargetOpcode::STATEPOINT:
34217    // As an implementation detail, STATEPOINT shares the STACKMAP format at
34218    // this point in the process.  We diverge later.
34219    return emitPatchPoint(MI, BB);
34220
34221  case TargetOpcode::STACKMAP:
34222  case TargetOpcode::PATCHPOINT:
34223    return emitPatchPoint(MI, BB);
34224
34225  case TargetOpcode::PATCHABLE_EVENT_CALL:
34226  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
34227    return BB;
34228
34229  case X86::LCMPXCHG8B: {
34230    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34231    // In addition to the four E[ABCD] registers implied by its encoding,
34232    // CMPXCHG8B requires a memory operand. If the current target is i686 and
34233    // the current function needs a base pointer - which is ESI on i686 - the
34234    // register allocator would not be able to allocate registers for an
34235    // address of the form X(%reg, %reg, Y): there would never be enough
34236    // unreserved registers during regalloc (without the base pointer the only
34237    // option would be X(%edi, %esi, Y)).
34238    // We give the register allocator a hand by precomputing the address in a
34239    // new vreg using LEA.
34240
34241    // If it is not i686 or there is no base pointer - nothing to do here.
34242    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
34243      return BB;
34244
34245    // Even though this code does not necessarily need the base pointer to
34246    // be ESI, we check for that anyway. The reason: if this assert fails,
34247    // something has changed in the compiler's base pointer handling, and it
34248    // most probably has to be addressed here as well.
34249    assert(TRI->getBaseRegister() == X86::ESI &&
34250           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
34251           "base pointer in mind");
34252
34253    MachineRegisterInfo &MRI = MF->getRegInfo();
34254    MVT SPTy = getPointerTy(MF->getDataLayout());
34255    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
34256    Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
34257
34258    X86AddressMode AM = getAddressFromInstr(&MI, 0);
34259    // Regalloc does not need any help when the memory operand of CMPXCHG8B
34260    // does not use an index register.
34261    if (AM.IndexReg == X86::NoRegister)
34262      return BB;
34263
34264    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
34265    // four operand definitions that are E[ABCD] registers. We skip them and
34266    // then insert the LEA.
34267    MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
34268    while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
34269                                   RMBBI->definesRegister(X86::EBX) ||
34270                                   RMBBI->definesRegister(X86::ECX) ||
34271                                   RMBBI->definesRegister(X86::EDX))) {
34272      ++RMBBI;
34273    }
34274    MachineBasicBlock::iterator MBBI(RMBBI);
34275    addFullAddress(
34276        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
34277
34278    setDirectAddressInInstr(&MI, 0, computedAddrVReg);
34279
34280    return BB;
34281  }
34282  case X86::LCMPXCHG16B_NO_RBX: {
34283    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34284    Register BasePtr = TRI->getBaseRegister();
34285    if (TRI->hasBasePointer(*MF) &&
34286        (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
34287      if (!BB->isLiveIn(BasePtr))
34288        BB->addLiveIn(BasePtr);
34289      // Save RBX into a virtual register.
34290      Register SaveRBX =
34291          MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34292      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34293          .addReg(X86::RBX);
34294      Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34295      MachineInstrBuilder MIB =
34296          BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
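      // Copy the address operands from the pseudo, then the value destined for
      // RBX, and finally the saved RBX.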
34297      for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34298        MIB.add(MI.getOperand(Idx));
34299      MIB.add(MI.getOperand(X86::AddrNumOperands));
34300      MIB.addReg(SaveRBX);
34301    } else {
34302      // Simple case, just copy the virtual register to RBX.
34303      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
34304          .add(MI.getOperand(X86::AddrNumOperands));
34305      MachineInstrBuilder MIB =
34306          BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
34307      for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34308        MIB.add(MI.getOperand(Idx));
34309    }
34310    MI.eraseFromParent();
34311    return BB;
34312  }
34313  case X86::MWAITX: {
34314    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34315    Register BasePtr = TRI->getBaseRegister();
34316    bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
34317    // If there is no need to save the base pointer, we generate MWAITXrrr;
34318    // otherwise we generate the pseudo MWAITX_SAVE_RBX.
34319    if (!IsRBX || !TRI->hasBasePointer(*MF)) {
34320      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34321          .addReg(MI.getOperand(0).getReg());
34322      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34323          .addReg(MI.getOperand(1).getReg());
34324      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
34325          .addReg(MI.getOperand(2).getReg());
34326      BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
34327      MI.eraseFromParent();
34328    } else {
34329      if (!BB->isLiveIn(BasePtr)) {
34330        BB->addLiveIn(BasePtr);
34331      }
34332      // Parameters can be copied into ECX and EAX but not EBX yet.
34333      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34334          .addReg(MI.getOperand(0).getReg());
34335      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34336          .addReg(MI.getOperand(1).getReg());
34337      assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
34338      // Save RBX into a virtual register.
34339      Register SaveRBX =
34340          MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34341      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34342          .addReg(X86::RBX);
34343      // Generate mwaitx pseudo.
34344      Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34345      BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
34346          .addDef(Dst) // Destination tied in with SaveRBX.
34347          .addReg(MI.getOperand(2).getReg()) // input value of EBX.
34348          .addUse(SaveRBX);                  // Save of base pointer.
34349      MI.eraseFromParent();
34350    }
34351    return BB;
34352  }
34353  case TargetOpcode::PREALLOCATED_SETUP: {
34354    assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
34355    auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34356    MFI->setHasPreallocatedCall(true);
34357    int64_t PreallocatedId = MI.getOperand(0).getImm();
34358    size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
34359    assert(StackAdjustment != 0 && "0 stack adjustment");
34360    LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
34361                      << StackAdjustment << "\n");
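    // Carve out the preallocated argument area by adjusting ESP.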
34362    BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
34363        .addReg(X86::ESP)
34364        .addImm(StackAdjustment);
34365    MI.eraseFromParent();
34366    return BB;
34367  }
34368  case TargetOpcode::PREALLOCATED_ARG: {
34369    assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
34370    int64_t PreallocatedId = MI.getOperand(1).getImm();
34371    int64_t ArgIdx = MI.getOperand(2).getImm();
34372    auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34373    size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
34374    LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
34375                      << ", arg offset " << ArgOffset << "\n");
34376    // stack pointer + offset
34377    addRegOffset(
34378        BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
34379        X86::ESP, false, ArgOffset);
34380    MI.eraseFromParent();
34381    return BB;
34382  }
34383  case X86::PTDPBSSD:
34384  case X86::PTDPBSUD:
34385  case X86::PTDPBUSD:
34386  case X86::PTDPBUUD:
34387  case X86::PTDPBF16PS: {
34388    unsigned Opc;
34389    switch (MI.getOpcode()) {
34390    case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
34391    case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
34392    case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
34393    case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
34394    case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
34395    }
34396
34397    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
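    // The tile index in operand 0 is both the destination and the accumulator
    // source; operands 1 and 2 supply the two multiplicand tiles.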
34398    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
34399    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
34400    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
34401    MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
34402
34403    MI.eraseFromParent(); // The pseudo is gone now.
34404    return BB;
34405  }
34406  case X86::PTILEZERO: {
34407    unsigned Imm = MI.getOperand(0).getImm();
34408    BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
34409    MI.eraseFromParent(); // The pseudo is gone now.
34410    return BB;
34411  }
34412  case X86::PTILELOADD:
34413  case X86::PTILELOADDT1:
34414  case X86::PTILESTORED: {
34415    unsigned Opc;
34416    switch (MI.getOpcode()) {
34417    case X86::PTILELOADD:   Opc = X86::TILELOADD;   break;
34418    case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
34419    case X86::PTILESTORED:  Opc = X86::TILESTORED;  break;
34420    }
34421
34422    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
34423    unsigned CurOp = 0;
34424    if (Opc != X86::TILESTORED)
34425      MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34426                 RegState::Define);
34427
34428    MIB.add(MI.getOperand(CurOp++)); // base
34429    MIB.add(MI.getOperand(CurOp++)); // scale
34430    MIB.add(MI.getOperand(CurOp++)); // index -- stride
34431    MIB.add(MI.getOperand(CurOp++)); // displacement
34432    MIB.add(MI.getOperand(CurOp++)); // segment
34433
34434    if (Opc == X86::TILESTORED)
34435      MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34436                 RegState::Undef);
34437
34438    MI.eraseFromParent(); // The pseudo is gone now.
34439    return BB;
34440  }
34441  }
34442}
34443
34444//===----------------------------------------------------------------------===//
34445//                           X86 Optimization Hooks
34446//===----------------------------------------------------------------------===//
34447
34448bool
34449X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
34450                                                const APInt &DemandedBits,
34451                                                const APInt &DemandedElts,
34452                                                TargetLoweringOpt &TLO) const {
34453  EVT VT = Op.getValueType();
34454  unsigned Opcode = Op.getOpcode();
34455  unsigned EltSize = VT.getScalarSizeInBits();
34456
34457  if (VT.isVector()) {
34458    // If the constant is all sign bits within the active bits, then we should
34459    // extend it to the entire constant to allow it to act as a boolean constant
34460    // vector.
34461    auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
34462      if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
34463        return false;
34464      for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
34465        if (!DemandedElts[i] || V.getOperand(i).isUndef())
34466          continue;
34467        const APInt &Val = V.getConstantOperandAPInt(i);
34468        if (Val.getBitWidth() > Val.getNumSignBits() &&
34469            Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
34470          return true;
34471      }
34472      return false;
34473    };
34474    // For vectors - if we have a constant, then try to sign extend.
34475    // TODO: Handle AND/ANDN cases.
34476    unsigned ActiveBits = DemandedBits.getActiveBits();
34477    if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
34478        (Opcode == ISD::OR || Opcode == ISD::XOR) &&
34479        NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
34480      EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
34481      EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
34482                                    VT.getVectorNumElements());
34483      SDValue NewC =
34484          TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
34485                          Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
34486      SDValue NewOp =
34487          TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
34488      return TLO.CombineTo(Op, NewOp);
34489    }
34490    return false;
34491  }
34492
34493  // Only optimize Ands to prevent shrinking a constant that could be
34494  // matched by movzx.
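  // For example, if only the low 12 bits of (and X, 0xffff) are demanded, keep
  // the 0xffff mask (rather than shrinking it to 0xfff) so that it can still
  // be matched by movzwl.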
34495  if (Opcode != ISD::AND)
34496    return false;
34497
34498  // Make sure the RHS really is a constant.
34499  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
34500  if (!C)
34501    return false;
34502
34503  const APInt &Mask = C->getAPIntValue();
34504
34505  // Clear all non-demanded bits initially.
34506  APInt ShrunkMask = Mask & DemandedBits;
34507
34508  // Find the width of the shrunk mask.
34509  unsigned Width = ShrunkMask.getActiveBits();
34510
34511  // If the mask is all 0s there's nothing to do here.
34512  if (Width == 0)
34513    return false;
34514
34515  // Find the next power of 2 width, rounding up to a byte.
34516  Width = PowerOf2Ceil(std::max(Width, 8U));
34517  // Truncate the width to size to handle illegal types.
34518  Width = std::min(Width, EltSize);
34519
34520  // Calculate a possible zero extend mask for this constant.
34521  APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
34522
34523  // If we aren't changing the mask, just return true to keep it and prevent
34524  // the caller from optimizing.
34525  if (ZeroExtendMask == Mask)
34526    return true;
34527
34528  // Make sure the new mask can be represented by a combination of mask bits
34529  // and non-demanded bits.
34530  if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
34531    return false;
34532
34533  // Replace the constant with the zero extend mask.
34534  SDLoc DL(Op);
34535  SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
34536  SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
34537  return TLO.CombineTo(Op, NewOp);
34538}
34539
34540void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
34541                                                      KnownBits &Known,
34542                                                      const APInt &DemandedElts,
34543                                                      const SelectionDAG &DAG,
34544                                                      unsigned Depth) const {
34545  unsigned BitWidth = Known.getBitWidth();
34546  unsigned NumElts = DemandedElts.getBitWidth();
34547  unsigned Opc = Op.getOpcode();
34548  EVT VT = Op.getValueType();
34549  assert((Opc >= ISD::BUILTIN_OP_END ||
34550          Opc == ISD::INTRINSIC_WO_CHAIN ||
34551          Opc == ISD::INTRINSIC_W_CHAIN ||
34552          Opc == ISD::INTRINSIC_VOID) &&
34553         "Should use MaskedValueIsZero if you don't know whether Op"
34554         " is a target node!");
34555
34556  Known.resetAll();
34557  switch (Opc) {
34558  default: break;
34559  case X86ISD::SETCC:
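    // SETCC produces a 0/1 result, so all bits above bit 0 are known zero.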
34560    Known.Zero.setBitsFrom(1);
34561    break;
34562  case X86ISD::MOVMSK: {
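    // MOVMSK packs one bit per source element into the low bits of the result;
    // everything above that is known zero.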
34563    unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
34564    Known.Zero.setBitsFrom(NumLoBits);
34565    break;
34566  }
34567  case X86ISD::PEXTRB:
34568  case X86ISD::PEXTRW: {
34569    SDValue Src = Op.getOperand(0);
34570    EVT SrcVT = Src.getValueType();
34571    APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
34572                                            Op.getConstantOperandVal(1));
34573    Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
34574    Known = Known.anyextOrTrunc(BitWidth);
34575    Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
34576    break;
34577  }
34578  case X86ISD::VSRAI:
34579  case X86ISD::VSHLI:
34580  case X86ISD::VSRLI: {
34581    unsigned ShAmt = Op.getConstantOperandVal(1);
34582    if (ShAmt >= VT.getScalarSizeInBits()) {
34583      Known.setAllZero();
34584      break;
34585    }
34586
34587    Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34588    if (Opc == X86ISD::VSHLI) {
34589      Known.Zero <<= ShAmt;
34590      Known.One <<= ShAmt;
34591      // Low bits are known zero.
34592      Known.Zero.setLowBits(ShAmt);
34593    } else if (Opc == X86ISD::VSRLI) {
34594      Known.Zero.lshrInPlace(ShAmt);
34595      Known.One.lshrInPlace(ShAmt);
34596      // High bits are known zero.
34597      Known.Zero.setHighBits(ShAmt);
34598    } else {
34599      Known.Zero.ashrInPlace(ShAmt);
34600      Known.One.ashrInPlace(ShAmt);
34601    }
34602    break;
34603  }
34604  case X86ISD::PACKUS: {
34605    // PACKUS is just a truncation if the upper half is zero.
34606    APInt DemandedLHS, DemandedRHS;
34607    getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
34608
34609    Known.One = APInt::getAllOnesValue(BitWidth * 2);
34610    Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
34611
34612    KnownBits Known2;
34613    if (!!DemandedLHS) {
34614      Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
34615      Known = KnownBits::commonBits(Known, Known2);
34616    }
34617    if (!!DemandedRHS) {
34618      Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
34619      Known = KnownBits::commonBits(Known, Known2);
34620    }
34621
34622    if (Known.countMinLeadingZeros() < BitWidth)
34623      Known.resetAll();
34624    Known = Known.trunc(BitWidth);
34625    break;
34626  }
34627  case X86ISD::VBROADCAST: {
34628    SDValue Src = Op.getOperand(0);
34629    if (!Src.getSimpleValueType().isVector()) {
34630      Known = DAG.computeKnownBits(Src, Depth + 1);
34631      return;
34632    }
34633    break;
34634  }
34635  case X86ISD::ANDNP: {
34636    KnownBits Known2;
34637    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34638    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34639
34640    // ANDNP = (~X & Y);
34641    Known.One &= Known2.Zero;
34642    Known.Zero |= Known2.One;
34643    break;
34644  }
34645  case X86ISD::FOR: {
34646    KnownBits Known2;
34647    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34648    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34649
34650    Known |= Known2;
34651    break;
34652  }
34653  case X86ISD::PSADBW: {
34654    assert(VT.getScalarType() == MVT::i64 &&
34655           Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
34656           "Unexpected PSADBW types");
34657
34658    // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
34659    Known.Zero.setBitsFrom(16);
34660    break;
34661  }
34662  case X86ISD::PMULUDQ: {
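    // PMULUDQ multiplies the low 32 bits of each 64-bit element, so model the
    // operands as their low halves zero-extended back to the full width.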
34663    KnownBits Known2;
34664    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34665    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34666
34667    Known = Known.trunc(BitWidth / 2).zext(BitWidth);
34668    Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
34669    Known = KnownBits::mul(Known, Known2);
34670    break;
34671  }
34672  case X86ISD::CMOV: {
34673    Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
34674    // If we don't know any bits, early out.
34675    if (Known.isUnknown())
34676      break;
34677    KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
34678
34679    // Only known if known in both the LHS and RHS.
34680    Known = KnownBits::commonBits(Known, Known2);
34681    break;
34682  }
34683  case X86ISD::BEXTR:
34684  case X86ISD::BEXTRI: {
34685    SDValue Op0 = Op.getOperand(0);
34686    SDValue Op1 = Op.getOperand(1);
34687
34688    if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
34689      unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
34690      unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
34691
34692      // If the length is 0, the result is 0.
34693      if (Length == 0) {
34694        Known.setAllZero();
34695        break;
34696      }
34697
34698      if ((Shift + Length) <= BitWidth) {
34699        Known = DAG.computeKnownBits(Op0, Depth + 1);
34700        Known = Known.extractBits(Length, Shift);
34701        Known = Known.zextOrTrunc(BitWidth);
34702      }
34703    }
34704    break;
34705  }
34706  case X86ISD::PDEP: {
34707    KnownBits Known2;
34708    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34709    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34710    // Zeros are retained from the mask operand, but ones are not.
34711    Known.One.clearAllBits();
34712    // The result will have at least as many trailing zeros as the non-mask
34713    // operand since bits can only map to the same or higher bit position.
34714    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
34715    break;
34716  }
34717  case X86ISD::PEXT: {
34718    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34719    // The result has at least as many leading zeros as the mask has zero bits.
34720    unsigned Count = Known.Zero.countPopulation();
34721    Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
34722    Known.One.clearAllBits();
34723    break;
34724  }
34725  case X86ISD::VTRUNC:
34726  case X86ISD::VTRUNCS:
34727  case X86ISD::VTRUNCUS:
34728  case X86ISD::CVTSI2P:
34729  case X86ISD::CVTUI2P:
34730  case X86ISD::CVTP2SI:
34731  case X86ISD::CVTP2UI:
34732  case X86ISD::MCVTP2SI:
34733  case X86ISD::MCVTP2UI:
34734  case X86ISD::CVTTP2SI:
34735  case X86ISD::CVTTP2UI:
34736  case X86ISD::MCVTTP2SI:
34737  case X86ISD::MCVTTP2UI:
34738  case X86ISD::MCVTSI2P:
34739  case X86ISD::MCVTUI2P:
34740  case X86ISD::VFPROUND:
34741  case X86ISD::VMFPROUND:
34742  case X86ISD::CVTPS2PH:
34743  case X86ISD::MCVTPS2PH: {
34744    // Truncations/Conversions - upper elements are known zero.
34745    EVT SrcVT = Op.getOperand(0).getValueType();
34746    if (SrcVT.isVector()) {
34747      unsigned NumSrcElts = SrcVT.getVectorNumElements();
34748      if (NumElts > NumSrcElts &&
34749          DemandedElts.countTrailingZeros() >= NumSrcElts)
34750        Known.setAllZero();
34751    }
34752    break;
34753  }
34754  case X86ISD::STRICT_CVTTP2SI:
34755  case X86ISD::STRICT_CVTTP2UI:
34756  case X86ISD::STRICT_CVTSI2P:
34757  case X86ISD::STRICT_CVTUI2P:
34758  case X86ISD::STRICT_VFPROUND:
34759  case X86ISD::STRICT_CVTPS2PH: {
34760    // Strict Conversions - upper elements are known zero.
34761    EVT SrcVT = Op.getOperand(1).getValueType();
34762    if (SrcVT.isVector()) {
34763      unsigned NumSrcElts = SrcVT.getVectorNumElements();
34764      if (NumElts > NumSrcElts &&
34765          DemandedElts.countTrailingZeros() >= NumSrcElts)
34766        Known.setAllZero();
34767    }
34768    break;
34769  }
34770  case X86ISD::MOVQ2DQ: {
34771    // Move from MMX to XMM. Upper half of XMM should be 0.
34772    if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
34773      Known.setAllZero();
34774    break;
34775  }
34776  }
34777
34778  // Handle target shuffles.
34779  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
34780  if (isTargetShuffle(Opc)) {
34781    SmallVector<int, 64> Mask;
34782    SmallVector<SDValue, 2> Ops;
34783    if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
34784      unsigned NumOps = Ops.size();
34785      unsigned NumElts = VT.getVectorNumElements();
34786      if (Mask.size() == NumElts) {
34787        SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
34788        Known.Zero.setAllBits(); Known.One.setAllBits();
34789        for (unsigned i = 0; i != NumElts; ++i) {
34790          if (!DemandedElts[i])
34791            continue;
34792          int M = Mask[i];
34793          if (M == SM_SentinelUndef) {
34794            // For UNDEF elements, we don't know anything about the common state
34795            // of the shuffle result.
34796            Known.resetAll();
34797            break;
34798          }
34799          if (M == SM_SentinelZero) {
34800            Known.One.clearAllBits();
34801            continue;
34802          }
34803          assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
34804                 "Shuffle index out of range");
34805
34806          unsigned OpIdx = (unsigned)M / NumElts;
34807          unsigned EltIdx = (unsigned)M % NumElts;
34808          if (Ops[OpIdx].getValueType() != VT) {
34809            // TODO - handle target shuffle ops with different value types.
34810            Known.resetAll();
34811            break;
34812          }
34813          DemandedOps[OpIdx].setBit(EltIdx);
34814        }
34815        // Known bits are the values that are shared by every demanded element.
34816        for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
34817          if (!DemandedOps[i])
34818            continue;
34819          KnownBits Known2 =
34820              DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
34821          Known = KnownBits::commonBits(Known, Known2);
34822        }
34823      }
34824    }
34825  }
34826}
34827
34828unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
34829    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
34830    unsigned Depth) const {
34831  EVT VT = Op.getValueType();
34832  unsigned VTBits = VT.getScalarSizeInBits();
34833  unsigned Opcode = Op.getOpcode();
34834  switch (Opcode) {
34835  case X86ISD::SETCC_CARRY:
34836    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
34837    return VTBits;
34838
34839  case X86ISD::VTRUNC: {
34840    SDValue Src = Op.getOperand(0);
34841    MVT SrcVT = Src.getSimpleValueType();
34842    unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
34843    assert(VTBits < NumSrcBits && "Illegal truncation input type");
34844    APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
34845    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
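    // Sign bits in excess of the bits removed by the truncation survive into
    // the narrower result.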
34846    if (Tmp > (NumSrcBits - VTBits))
34847      return Tmp - (NumSrcBits - VTBits);
34848    return 1;
34849  }
34850
34851  case X86ISD::PACKSS: {
34852    // PACKSS is just a truncation if the sign bits extend to the packed size.
34853    APInt DemandedLHS, DemandedRHS;
34854    getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
34855                        DemandedRHS);
34856
34857    unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
34858    unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
34859    if (!!DemandedLHS)
34860      Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
34861    if (!!DemandedRHS)
34862      Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
34863    unsigned Tmp = std::min(Tmp0, Tmp1);
34864    if (Tmp > (SrcBits - VTBits))
34865      return Tmp - (SrcBits - VTBits);
34866    return 1;
34867  }
34868
34869  case X86ISD::VBROADCAST: {
34870    SDValue Src = Op.getOperand(0);
34871    if (!Src.getSimpleValueType().isVector())
34872      return DAG.ComputeNumSignBits(Src, Depth + 1);
34873    break;
34874  }
34875
34876  case X86ISD::VSHLI: {
34877    SDValue Src = Op.getOperand(0);
34878    const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
34879    if (ShiftVal.uge(VTBits))
34880      return VTBits; // Shifted all bits out --> zero.
34881    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
34882    if (ShiftVal.uge(Tmp))
34883      return 1; // Shifted all sign bits out --> unknown.
34884    return Tmp - ShiftVal.getZExtValue();
34885  }
34886
34887  case X86ISD::VSRAI: {
34888    SDValue Src = Op.getOperand(0);
34889    APInt ShiftVal = Op.getConstantOperandAPInt(1);
34890    if (ShiftVal.uge(VTBits - 1))
34891      return VTBits; // Sign splat.
34892    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
34893    ShiftVal += Tmp;
34894    return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
34895  }
34896
34897  case X86ISD::PCMPGT:
34898  case X86ISD::PCMPEQ:
34899  case X86ISD::CMPP:
34900  case X86ISD::VPCOM:
34901  case X86ISD::VPCOMU:
34902    // Vector compares return zero/all-bits result values.
34903    return VTBits;
34904
34905  case X86ISD::ANDNP: {
34906    unsigned Tmp0 =
34907        DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
34908    if (Tmp0 == 1) return 1; // Early out.
34909    unsigned Tmp1 =
34910        DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
34911    return std::min(Tmp0, Tmp1);
34912  }
34913
34914  case X86ISD::CMOV: {
34915    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
34916    if (Tmp0 == 1) return 1;  // Early out.
34917    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
34918    return std::min(Tmp0, Tmp1);
34919  }
34920  }
34921
34922  // Handle target shuffles.
34923  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
34924  if (isTargetShuffle(Opcode)) {
34925    SmallVector<int, 64> Mask;
34926    SmallVector<SDValue, 2> Ops;
34927    if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
34928      unsigned NumOps = Ops.size();
34929      unsigned NumElts = VT.getVectorNumElements();
34930      if (Mask.size() == NumElts) {
34931        SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
34932        for (unsigned i = 0; i != NumElts; ++i) {
34933          if (!DemandedElts[i])
34934            continue;
34935          int M = Mask[i];
34936          if (M == SM_SentinelUndef) {
34937            // For UNDEF elements, we don't know anything about the common state
34938            // of the shuffle result.
34939            return 1;
34940          } else if (M == SM_SentinelZero) {
34941            // Zero = all sign bits.
34942            continue;
34943          }
34944          assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
34945                 "Shuffle index out of range");
34946
34947          unsigned OpIdx = (unsigned)M / NumElts;
34948          unsigned EltIdx = (unsigned)M % NumElts;
34949          if (Ops[OpIdx].getValueType() != VT) {
34950            // TODO - handle target shuffle ops with different value types.
34951            return 1;
34952          }
34953          DemandedOps[OpIdx].setBit(EltIdx);
34954        }
34955        unsigned Tmp0 = VTBits;
34956        for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
34957          if (!DemandedOps[i])
34958            continue;
34959          unsigned Tmp1 =
34960              DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
34961          Tmp0 = std::min(Tmp0, Tmp1);
34962        }
34963        return Tmp0;
34964      }
34965    }
34966  }
34967
34968  // Fallback case.
34969  return 1;
34970}
34971
34972SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
34973  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
34974    return N->getOperand(0);
34975  return N;
34976}
34977
34978// Helper to look for a normal load that can be narrowed into a vzload with the
34979// specified VT and memory VT. Returns SDValue() on failure.
34980static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
34981                                  SelectionDAG &DAG) {
34982  // Can't if the load is volatile or atomic.
34983  if (!LN->isSimple())
34984    return SDValue();
34985
34986  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
34987  SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
34988  return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
34989                                 LN->getPointerInfo(), LN->getOriginalAlign(),
34990                                 LN->getMemOperand()->getFlags());
34991}
34992
34993// Attempt to match a combined shuffle mask against supported unary shuffle
34994// instructions.
34995// TODO: Investigate sharing more of this with shuffle lowering.
34996static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
34997                              bool AllowFloatDomain, bool AllowIntDomain,
34998                              SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
34999                              const X86Subtarget &Subtarget, unsigned &Shuffle,
35000                              MVT &SrcVT, MVT &DstVT) {
35001  unsigned NumMaskElts = Mask.size();
35002  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
35003
35004  // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
35005  if (MaskEltSize == 32 && Mask[0] == 0) {
35006    if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
35007      Shuffle = X86ISD::VZEXT_MOVL;
35008      SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35009      return true;
35010    }
35011    if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35012        isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35013      Shuffle = X86ISD::VZEXT_MOVL;
35014      SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35015      return true;
35016    }
35017  }
35018
35019  // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
35020  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
35021  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
35022                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
35023    unsigned MaxScale = 64 / MaskEltSize;
35024    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
35025      bool MatchAny = true;
35026      bool MatchZero = true;
35027      unsigned NumDstElts = NumMaskElts / Scale;
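      // For a Scale-factor extension, every Scale-th element must stay in
      // place and the elements in between must be undef (any-extend) or
      // undef/zero (zero-extend).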
35028      for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
35029        if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
35030          MatchAny = MatchZero = false;
35031          break;
35032        }
35033        MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
35034        MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
35035      }
35036      if (MatchAny || MatchZero) {
35037        assert(MatchZero && "Failed to match zext but matched aext?");
35038        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
35039        MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
35040                                            MVT::getIntegerVT(MaskEltSize);
35041        SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
35042
35043        if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
35044          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
35045
35046        Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
35047        if (SrcVT.getVectorNumElements() != NumDstElts)
35048          Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
35049
35050        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
35051        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
35052        return true;
35053      }
35054    }
35055  }
35056
35057  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
35058  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
35059      isUndefOrEqual(Mask[0], 0) &&
35060      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35061    Shuffle = X86ISD::VZEXT_MOVL;
35062    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35063    return true;
35064  }
35065
35066  // Check if we have SSE3 which will let us use MOVDDUP etc. These
35067  // instructions are no slower than UNPCKLPD but have the option to
35068  // fold the input operand into even an unaligned memory load.
35069  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
35070    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
35071      Shuffle = X86ISD::MOVDDUP;
35072      SrcVT = DstVT = MVT::v2f64;
35073      return true;
35074    }
35075    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35076      Shuffle = X86ISD::MOVSLDUP;
35077      SrcVT = DstVT = MVT::v4f32;
35078      return true;
35079    }
35080    if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
35081      Shuffle = X86ISD::MOVSHDUP;
35082      SrcVT = DstVT = MVT::v4f32;
35083      return true;
35084    }
35085  }
35086
35087  if (MaskVT.is256BitVector() && AllowFloatDomain) {
35088    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
35089    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35090      Shuffle = X86ISD::MOVDDUP;
35091      SrcVT = DstVT = MVT::v4f64;
35092      return true;
35093    }
35094    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35095      Shuffle = X86ISD::MOVSLDUP;
35096      SrcVT = DstVT = MVT::v8f32;
35097      return true;
35098    }
35099    if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
35100      Shuffle = X86ISD::MOVSHDUP;
35101      SrcVT = DstVT = MVT::v8f32;
35102      return true;
35103    }
35104  }
35105
35106  if (MaskVT.is512BitVector() && AllowFloatDomain) {
35107    assert(Subtarget.hasAVX512() &&
35108           "AVX512 required for 512-bit vector shuffles");
35109    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35110      Shuffle = X86ISD::MOVDDUP;
35111      SrcVT = DstVT = MVT::v8f64;
35112      return true;
35113    }
35114    if (isTargetShuffleEquivalent(
35115            MaskVT, Mask,
35116            {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
35117      Shuffle = X86ISD::MOVSLDUP;
35118      SrcVT = DstVT = MVT::v16f32;
35119      return true;
35120    }
35121    if (isTargetShuffleEquivalent(
35122            MaskVT, Mask,
35123            {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
35124      Shuffle = X86ISD::MOVSHDUP;
35125      SrcVT = DstVT = MVT::v16f32;
35126      return true;
35127    }
35128  }
35129
35130  return false;
35131}
35132
35133// Attempt to match a combined shuffle mask against supported unary immediate
35134// permute instructions.
35135// TODO: Investigate sharing more of this with shuffle lowering.
35136static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
35137                                     const APInt &Zeroable,
35138                                     bool AllowFloatDomain, bool AllowIntDomain,
35139                                     const X86Subtarget &Subtarget,
35140                                     unsigned &Shuffle, MVT &ShuffleVT,
35141                                     unsigned &PermuteImm) {
35142  unsigned NumMaskElts = Mask.size();
35143  unsigned InputSizeInBits = MaskVT.getSizeInBits();
35144  unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
35145  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
35146  bool ContainsZeros = isAnyZero(Mask);
35147
35148  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
35149  if (!ContainsZeros && MaskScalarSizeInBits == 64) {
35150    // Check for lane crossing permutes.
35151    if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
35152      // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
35153      if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
35154        Shuffle = X86ISD::VPERMI;
35155        ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
35156        PermuteImm = getV4X86ShuffleImm(Mask);
35157        return true;
35158      }
35159      if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
35160        SmallVector<int, 4> RepeatedMask;
35161        if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
35162          Shuffle = X86ISD::VPERMI;
35163          ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
35164          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
35165          return true;
35166        }
35167      }
35168    } else if (AllowFloatDomain && Subtarget.hasAVX()) {
35169      // VPERMILPD can permute with a non-repeating shuffle.
35170      Shuffle = X86ISD::VPERMILPI;
35171      ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
35172      PermuteImm = 0;
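      // Each 64-bit element stays within its 128-bit lane, so only the low bit
      // of each mask index contributes to the VPERMILPD immediate.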
35173      for (int i = 0, e = Mask.size(); i != e; ++i) {
35174        int M = Mask[i];
35175        if (M == SM_SentinelUndef)
35176          continue;
35177        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
35178        PermuteImm |= (M & 1) << i;
35179      }
35180      return true;
35181    }
35182  }
35183
35184  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
35185  // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
35186  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
35187  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
35188      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
35189    SmallVector<int, 4> RepeatedMask;
35190    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35191      // Narrow the repeated mask to create 32-bit element permutes.
35192      SmallVector<int, 4> WordMask = RepeatedMask;
35193      if (MaskScalarSizeInBits == 64)
35194        narrowShuffleMaskElts(2, RepeatedMask, WordMask);
35195
35196      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
35197      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
35198      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
35199      PermuteImm = getV4X86ShuffleImm(WordMask);
35200      return true;
35201    }
35202  }
35203
35204  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
35205  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
35206      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35207       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35208       (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35209    SmallVector<int, 4> RepeatedMask;
35210    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35211      ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
35212      ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
35213
35214      // PSHUFLW: permute lower 4 elements only.
35215      if (isUndefOrInRange(LoMask, 0, 4) &&
35216          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
35217        Shuffle = X86ISD::PSHUFLW;
35218        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35219        PermuteImm = getV4X86ShuffleImm(LoMask);
35220        return true;
35221      }
35222
35223      // PSHUFHW: permute upper 4 elements only.
35224      if (isUndefOrInRange(HiMask, 4, 8) &&
35225          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
35226        // Offset the HiMask so that we can create the shuffle immediate.
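        // E.g. a HiMask of {5,4,7,6} becomes {1,0,3,2} before encoding.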
35227        int OffsetHiMask[4];
35228        for (int i = 0; i != 4; ++i)
35229          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
35230
35231        Shuffle = X86ISD::PSHUFHW;
35232        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35233        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
35234        return true;
35235      }
35236    }
35237  }
35238
35239  // Attempt to match against byte/bit shifts.
35240  if (AllowIntDomain &&
35241      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35242       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35243       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35244    int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
35245                                       Mask, 0, Zeroable, Subtarget);
35246    if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
35247                         32 <= ShuffleVT.getScalarSizeInBits())) {
35248      PermuteImm = (unsigned)ShiftAmt;
35249      return true;
35250    }
35251  }
35252
35253  // Attempt to match against bit rotates.
35254  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
35255      ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
35256       Subtarget.hasAVX512())) {
35257    int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
35258                                            Subtarget, Mask);
35259    if (0 < RotateAmt) {
35260      Shuffle = X86ISD::VROTLI;
35261      PermuteImm = (unsigned)RotateAmt;
35262      return true;
35263    }
35264  }
35265
35266  return false;
35267}
35268
35269// Attempt to match a combined unary shuffle mask against supported binary
35270// shuffle instructions.
35271// TODO: Investigate sharing more of this with shuffle lowering.
35272static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35273                               bool AllowFloatDomain, bool AllowIntDomain,
35274                               SDValue &V1, SDValue &V2, const SDLoc &DL,
35275                               SelectionDAG &DAG, const X86Subtarget &Subtarget,
35276                               unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
35277                               bool IsUnary) {
35278  unsigned NumMaskElts = Mask.size();
35279  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35280
35281  if (MaskVT.is128BitVector()) {
35282    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
35283      V2 = V1;
35284      V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
35285      Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
35286      SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35287      return true;
35288    }
35289    if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
35290      V2 = V1;
35291      Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
35292      SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35293      return true;
35294    }
35295    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
35296        Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
35297      std::swap(V1, V2);
35298      Shuffle = X86ISD::MOVSD;
35299      SrcVT = DstVT = MVT::v2f64;
35300      return true;
35301    }
35302    if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
35303        (AllowFloatDomain || !Subtarget.hasSSE41())) {
35304      Shuffle = X86ISD::MOVSS;
35305      SrcVT = DstVT = MVT::v4f32;
35306      return true;
35307    }
35308  }
35309
  // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
35311  if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
35312      ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
35313      ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
35314    if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
35315                             Subtarget)) {
35316      DstVT = MaskVT;
35317      return true;
35318    }
35319  }
35320
35321  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
35322  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
35323      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35324      (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
35325      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35326      (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
35327    if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
35328                              Subtarget)) {
35329      SrcVT = DstVT = MaskVT;
35330      if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
35331        SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
35332      return true;
35333    }
35334  }
35335
  // Attempt to match against an OR if we're performing a blend shuffle and the
  // non-blended source elements are zero in each case.
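  // For each lane taken from one source, the corresponding elements of the
  // other source must be known to be zero so that a plain OR reproduces the
  // blend.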
35338  if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35339      (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
35340    bool IsBlend = true;
35341    unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
35342    unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
35343    unsigned Scale1 = NumV1Elts / NumMaskElts;
35344    unsigned Scale2 = NumV2Elts / NumMaskElts;
35345    APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
35346    APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
35347    for (unsigned i = 0; i != NumMaskElts; ++i) {
35348      int M = Mask[i];
35349      if (M == SM_SentinelUndef)
35350        continue;
35351      if (M == SM_SentinelZero) {
35352        DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35353        DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35354        continue;
35355      }
35356      if (M == (int)i) {
35357        DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35358        continue;
35359      }
35360      if (M == (int)(i + NumMaskElts)) {
35361        DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35362        continue;
35363      }
35364      IsBlend = false;
35365      break;
35366    }
35367    if (IsBlend &&
35368        DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
35369        DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
35370      Shuffle = ISD::OR;
35371      SrcVT = DstVT = MaskVT.changeTypeToInteger();
35372      return true;
35373    }
35374  }
35375
35376  return false;
35377}
35378
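// Attempt to match a combined shuffle mask against supported binary vector
// shuffle instructions that take an immediate control operand (e.g. VALIGN,
// PALIGNR, BLENDI, INSERTPS, SHUFP).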
35379static bool matchBinaryPermuteShuffle(
35380    MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
35381    bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
35382    const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
35383    unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
35384  unsigned NumMaskElts = Mask.size();
35385  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35386
35387  // Attempt to match against VALIGND/VALIGNQ rotate.
35388  if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
35389      ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
35390       (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
35391       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35392    if (!isAnyZero(Mask)) {
35393      int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
35394      if (0 < Rotation) {
35395        Shuffle = X86ISD::VALIGN;
35396        if (EltSizeInBits == 64)
35397          ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
35398        else
35399          ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
35400        PermuteImm = Rotation;
35401        return true;
35402      }
35403    }
35404  }
35405
35406  // Attempt to match against PALIGNR byte rotate.
35407  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
35408                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35409                         (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35410    int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
35411    if (0 < ByteRotation) {
35412      Shuffle = X86ISD::PALIGNR;
35413      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
35414      PermuteImm = ByteRotation;
35415      return true;
35416    }
35417  }
35418
35419  // Attempt to combine to X86ISD::BLENDI.
35420  if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
35421                            (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
35422      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
35423    uint64_t BlendMask = 0;
35424    bool ForceV1Zero = false, ForceV2Zero = false;
35425    SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
35426    if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
35427                            ForceV2Zero, BlendMask)) {
35428      if (MaskVT == MVT::v16i16) {
35429        // We can only use v16i16 PBLENDW if the lanes are repeated.
35430        SmallVector<int, 8> RepeatedMask;
35431        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
35432                                        RepeatedMask)) {
35433          assert(RepeatedMask.size() == 8 &&
35434                 "Repeated mask size doesn't match!");
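          // Build the PBLENDW immediate from the repeated lane mask: bit i
          // selects V2 for element i of every 128-bit lane.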
35435          PermuteImm = 0;
35436          for (int i = 0; i < 8; ++i)
35437            if (RepeatedMask[i] >= 8)
35438              PermuteImm |= 1 << i;
35439          V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35440          V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35441          Shuffle = X86ISD::BLENDI;
35442          ShuffleVT = MaskVT;
35443          return true;
35444        }
35445      } else {
35446        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35447        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35448        PermuteImm = (unsigned)BlendMask;
35449        Shuffle = X86ISD::BLENDI;
35450        ShuffleVT = MaskVT;
35451        return true;
35452      }
35453    }
35454  }
35455
  // Attempt to combine to INSERTPS, but only if the shuffle has elements that
  // need to be set to zero.
35458  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35459      MaskVT.is128BitVector() && isAnyZero(Mask) &&
35460      matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35461    Shuffle = X86ISD::INSERTPS;
35462    ShuffleVT = MVT::v4f32;
35463    return true;
35464  }
35465
35466  // Attempt to combine to SHUFPD.
35467  if (AllowFloatDomain && EltSizeInBits == 64 &&
35468      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35469       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35470       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35471    bool ForceV1Zero = false, ForceV2Zero = false;
35472    if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
35473                               PermuteImm, Mask, Zeroable)) {
35474      V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35475      V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35476      Shuffle = X86ISD::SHUFP;
35477      ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
35478      return true;
35479    }
35480  }
35481
35482  // Attempt to combine to SHUFPS.
35483  if (AllowFloatDomain && EltSizeInBits == 32 &&
35484      ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
35485       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35486       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35487    SmallVector<int, 4> RepeatedMask;
35488    if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
      // Match each half of the repeated mask to determine whether it's just
      // referencing one of the vectors, is zeroable, or is entirely undef.
35491      auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
35492        int M0 = RepeatedMask[Offset];
35493        int M1 = RepeatedMask[Offset + 1];
35494
35495        if (isUndefInRange(RepeatedMask, Offset, 2)) {
35496          return DAG.getUNDEF(MaskVT);
35497        } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
35498          S0 = (SM_SentinelUndef == M0 ? -1 : 0);
35499          S1 = (SM_SentinelUndef == M1 ? -1 : 1);
35500          return getZeroVector(MaskVT, Subtarget, DAG, DL);
35501        } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
35502          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35503          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35504          return V1;
35505        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
35506          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35507          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35508          return V2;
35509        }
35510
35511        return SDValue();
35512      };
35513
35514      int ShufMask[4] = {-1, -1, -1, -1};
35515      SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
35516      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
35517
35518      if (Lo && Hi) {
35519        V1 = Lo;
35520        V2 = Hi;
35521        Shuffle = X86ISD::SHUFP;
35522        ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
35523        PermuteImm = getV4X86ShuffleImm(ShufMask);
35524        return true;
35525      }
35526    }
35527  }
35528
35529  // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
35530  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35531      MaskVT.is128BitVector() &&
35532      matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35533    Shuffle = X86ISD::INSERTPS;
35534    ShuffleVT = MVT::v4f32;
35535    return true;
35536  }
35537
35538  return false;
35539}
35540
35541static SDValue combineX86ShuffleChainWithExtract(
35542    ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
35543    bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
35544    const X86Subtarget &Subtarget);
35545
35546/// Combine an arbitrary chain of shuffles into a single instruction if
35547/// possible.
35548///
35549/// This is the leaf of the recursive combine below. When we have found some
35550/// chain of single-use x86 shuffle instructions and accumulated the combined
35551/// shuffle mask represented by them, this will try to pattern match that mask
35552/// into either a single instruction if there is a special purpose instruction
35553/// for this operation, or into a PSHUFB instruction which is a fully general
35554/// instruction but should only be used to replace chains over a certain depth.
35555static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
35556                                      ArrayRef<int> BaseMask, int Depth,
35557                                      bool HasVariableMask,
35558                                      bool AllowVariableMask, SelectionDAG &DAG,
35559                                      const X86Subtarget &Subtarget) {
35560  assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
35561  assert((Inputs.size() == 1 || Inputs.size() == 2) &&
35562         "Unexpected number of shuffle inputs!");
35563
35564  MVT RootVT = Root.getSimpleValueType();
35565  unsigned RootSizeInBits = RootVT.getSizeInBits();
35566  unsigned NumRootElts = RootVT.getVectorNumElements();
35567
35568  // Canonicalize shuffle input op to the requested type.
35569  // TODO: Support cases where Op is smaller than VT.
35570  auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
35571    return DAG.getBitcast(VT, Op);
35572  };
35573
  // Find the inputs that enter the chain. Note that multiple uses are OK
  // here; we're not going to remove the operands we find.
35576  bool UnaryShuffle = (Inputs.size() == 1);
35577  SDValue V1 = peekThroughBitcasts(Inputs[0]);
35578  SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
35579                             : peekThroughBitcasts(Inputs[1]));
35580
35581  MVT VT1 = V1.getSimpleValueType();
35582  MVT VT2 = V2.getSimpleValueType();
35583  assert(VT1.getSizeInBits() == RootSizeInBits &&
35584         VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
35585
35586  SDLoc DL(Root);
35587  SDValue Res;
35588
35589  unsigned NumBaseMaskElts = BaseMask.size();
35590  if (NumBaseMaskElts == 1) {
35591    assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
35592    return CanonicalizeShuffleInput(RootVT, V1);
35593  }
35594
35595  bool OptForSize = DAG.shouldOptForSize();
35596  unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
35597  bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
35598                     (RootVT.isFloatingPoint() && Depth >= 1) ||
35599                     (RootVT.is256BitVector() && !Subtarget.hasAVX2());
35600
  // Don't combine if we are an AVX512/EVEX target and the mask element size
35602  // is different from the root element size - this would prevent writemasks
35603  // from being reused.
35604  bool IsMaskedShuffle = false;
35605  if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
35606    if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
35607        Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
35608      IsMaskedShuffle = true;
35609    }
35610  }
35611
  // If we are shuffling a broadcast (and not introducing zeros) then
  // we can just use the broadcast directly. This works for smaller broadcast
  // elements as well, as they already repeat across each mask element.
35615  if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
35616      (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35617      V1.getValueSizeInBits() >= RootSizeInBits) {
35618    return CanonicalizeShuffleInput(RootVT, V1);
35619  }
35620
35621  // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
35622  // etc. can be simplified.
35623  if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
35624    SmallVector<int> ScaledMask, IdentityMask;
35625    unsigned NumElts = VT1.getVectorNumElements();
35626    if (BaseMask.size() <= NumElts &&
35627        scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
35628      for (unsigned i = 0; i != NumElts; ++i)
35629        IdentityMask.push_back(i);
35630      if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
35631        return CanonicalizeShuffleInput(RootVT, V1);
35632    }
35633  }
35634
35635  // Handle 128/256-bit lane shuffles of 512-bit vectors.
35636  if (RootVT.is512BitVector() &&
35637      (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
35638    // If the upper subvectors are zeroable, then an extract+insert is more
35639    // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
35640    // to zero the upper subvectors.
35641    if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
35642      if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35643        return SDValue(); // Nothing to do!
35644      assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
35645             "Unexpected lane shuffle");
35646      Res = CanonicalizeShuffleInput(RootVT, V1);
35647      unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
35648      bool UseZero = isAnyZero(BaseMask);
35649      Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
35650      return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
35651    }
35652
35653    // Narrow shuffle mask to v4x128.
35654    SmallVector<int, 4> Mask;
35655    assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
35656    narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
35657
35658    // Try to lower to vshuf64x2/vshuf32x4.
35659    auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
35660                            SDValue V1, SDValue V2, SelectionDAG &DAG) {
35661      unsigned PermMask = 0;
      // Ensure elements came from the same Op.
35663      SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
35664      for (int i = 0; i < 4; ++i) {
35665        assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
35666        if (Mask[i] < 0)
35667          continue;
35668
35669        SDValue Op = Mask[i] >= 4 ? V2 : V1;
35670        unsigned OpIndex = i / 2;
35671        if (Ops[OpIndex].isUndef())
35672          Ops[OpIndex] = Op;
35673        else if (Ops[OpIndex] != Op)
35674          return SDValue();
35675
35676        // Convert the 128-bit shuffle mask selection values into 128-bit
35677        // selection bits defined by a vshuf64x2 instruction's immediate control
35678        // byte.
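        // E.g. a v4x128 mask of {0,1,3,2} encodes as
        // (0 << 0) | (1 << 2) | (3 << 4) | (2 << 6) = 0xB4.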
35679        PermMask |= (Mask[i] % 4) << (i * 2);
35680      }
35681
35682      return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
35683                         CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
35684                         CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
35685                         DAG.getTargetConstant(PermMask, DL, MVT::i8));
35686    };
35687
35688    // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
35689    // doesn't work because our mask is for 128 bits and we don't have an MVT
35690    // to match that.
35691    bool PreferPERMQ =
35692        UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
35693        isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
35694        isUndefOrInRange(Mask[3], 2, 4) &&
35695        (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
35696        (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
35697
35698    if (!isAnyZero(Mask) && !PreferPERMQ) {
35699      if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35700        return SDValue(); // Nothing to do!
35701      MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
35702      if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
35703        return DAG.getBitcast(RootVT, V);
35704    }
35705  }
35706
35707  // Handle 128-bit lane shuffles of 256-bit vectors.
35708  if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
35709    // If the upper half is zeroable, then an extract+insert is more optimal
35710    // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
35711    // zero the upper half.
35712    if (isUndefOrZero(BaseMask[1])) {
35713      if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35714        return SDValue(); // Nothing to do!
35715      assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
35716      Res = CanonicalizeShuffleInput(RootVT, V1);
35717      Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
35718      return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
35719                            DL, 256);
35720    }
35721
35722    if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
35723      return SDValue(); // Nothing to do!
35724
35725    // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
35726    // we need to use the zeroing feature.
35727    // Prefer blends for sequential shuffles unless we are optimizing for size.
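    // For the unary form, the immediate selects the source 128-bit lane for
    // each half (bits[1:0] and bits[5:4]) while bits 3 and 7 zero the
    // respective half, e.g. a BaseMask of {1, SM_SentinelZero} encodes as 0x81.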
35728    if (UnaryShuffle &&
35729        !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
35730        (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
35731      unsigned PermMask = 0;
35732      PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
35733      PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
35734      return DAG.getNode(
35735          X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
35736          DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
35737    }
35738
35739    if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35740      return SDValue(); // Nothing to do!
35741
35742    // TODO - handle AVX512VL cases with X86ISD::SHUF128.
35743    if (!UnaryShuffle && !IsMaskedShuffle) {
35744      assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
35745             "Unexpected shuffle sentinel value");
35746      // Prefer blends to X86ISD::VPERM2X128.
35747      if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
35748            (BaseMask[0] == 2 && BaseMask[1] == 1))) {
35749        unsigned PermMask = 0;
35750        PermMask |= ((BaseMask[0] & 3) << 0);
35751        PermMask |= ((BaseMask[1] & 3) << 4);
35752        SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
35753        SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
35754        return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
35755                          CanonicalizeShuffleInput(RootVT, LHS),
35756                          CanonicalizeShuffleInput(RootVT, RHS),
35757                          DAG.getTargetConstant(PermMask, DL, MVT::i8));
35758      }
35759    }
35760  }
35761
35762  // For masks that have been widened to 128-bit elements or more,
35763  // narrow back down to 64-bit elements.
35764  SmallVector<int, 64> Mask;
35765  if (BaseMaskEltSizeInBits > 64) {
35766    assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
35767    int MaskScale = BaseMaskEltSizeInBits / 64;
35768    narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
35769  } else {
35770    Mask.assign(BaseMask.begin(), BaseMask.end());
35771  }
35772
  // For masked shuffles, we're trying to match the root width for better
  // writemask folding; attempt to scale the mask accordingly.
35775  // TODO - variable shuffles might need this to be widened again.
35776  if (IsMaskedShuffle && NumRootElts > Mask.size()) {
35777    assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
35778    int MaskScale = NumRootElts / Mask.size();
35779    SmallVector<int, 64> ScaledMask;
35780    narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
35781    Mask = std::move(ScaledMask);
35782  }
35783
35784  unsigned NumMaskElts = Mask.size();
35785  unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
35786
35787  // Determine the effective mask value type.
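  // There are no sub-32-bit floating point shuffles, so fall back to the
  // integer domain for narrower elements.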
35788  FloatDomain &= (32 <= MaskEltSizeInBits);
35789  MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
35790                           : MVT::getIntegerVT(MaskEltSizeInBits);
35791  MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
35792
35793  // Only allow legal mask types.
35794  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
35795    return SDValue();
35796
35797  // Attempt to match the mask against known shuffle patterns.
35798  MVT ShuffleSrcVT, ShuffleVT;
35799  unsigned Shuffle, PermuteImm;
35800
35801  // Which shuffle domains are permitted?
35802  // Permit domain crossing at higher combine depths.
35803  // TODO: Should we indicate which domain is preferred if both are allowed?
35804  bool AllowFloatDomain = FloatDomain || (Depth >= 3);
35805  bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
35806                        (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
35807
35808  // Determine zeroable mask elements.
35809  APInt KnownUndef, KnownZero;
35810  resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
35811  APInt Zeroable = KnownUndef | KnownZero;
35812
35813  if (UnaryShuffle) {
35814    // Attempt to match against broadcast-from-vector.
35815    // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
35816    if ((Subtarget.hasAVX2() ||
35817         (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
35818        (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
35819      if (isUndefOrEqual(Mask, 0)) {
35820        if (V1.getValueType() == MaskVT &&
35821            V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35822            MayFoldLoad(V1.getOperand(0))) {
35823          if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
35824            return SDValue(); // Nothing to do!
35825          Res = V1.getOperand(0);
35826          Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
35827          return DAG.getBitcast(RootVT, Res);
35828        }
35829        if (Subtarget.hasAVX2()) {
35830          if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
35831            return SDValue(); // Nothing to do!
35832          Res = CanonicalizeShuffleInput(MaskVT, V1);
35833          Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
35834          return DAG.getBitcast(RootVT, Res);
35835        }
35836      }
35837    }
35838
    // See if this is a blend with zero, in which case check whether the
    // zeroed elements are already zero.
35841    if (isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0)) {
35842      assert(!KnownZero.isNullValue() && "Shuffle has no zero elements");
35843      SDValue NewV1 = CanonicalizeShuffleInput(MaskVT, V1);
35844      if (DAG.MaskedElementsAreZero(NewV1, KnownZero))
35845        return DAG.getBitcast(RootVT, NewV1);
35846    }
35847
35848    SDValue NewV1 = V1; // Save operand in case early exit happens.
35849    if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
35850                          DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
35851                          ShuffleVT) &&
35852        (!IsMaskedShuffle ||
35853         (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35854      if (Depth == 0 && Root.getOpcode() == Shuffle)
35855        return SDValue(); // Nothing to do!
35856      Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
35857      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
35858      return DAG.getBitcast(RootVT, Res);
35859    }
35860
35861    if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
35862                                 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
35863                                 PermuteImm) &&
35864        (!IsMaskedShuffle ||
35865         (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35866      if (Depth == 0 && Root.getOpcode() == Shuffle)
35867        return SDValue(); // Nothing to do!
35868      Res = CanonicalizeShuffleInput(ShuffleVT, V1);
35869      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
35870                        DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35871      return DAG.getBitcast(RootVT, Res);
35872    }
35873  }
35874
35875  // Attempt to combine to INSERTPS, but only if the inserted element has come
35876  // from a scalar.
35877  // TODO: Handle other insertions here as well?
35878  if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
35879      Subtarget.hasSSE41() &&
35880      !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
35881    if (MaskEltSizeInBits == 32) {
35882      SDValue SrcV1 = V1, SrcV2 = V2;
35883      if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
35884                                 DAG) &&
35885          SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
35886        if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
35887          return SDValue(); // Nothing to do!
35888        Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
35889                          CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
35890                          CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
35891                          DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35892        return DAG.getBitcast(RootVT, Res);
35893      }
35894    }
35895    if (MaskEltSizeInBits == 64 &&
35896        isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
35897        V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35898        V2.getScalarValueSizeInBits() <= 32) {
35899      if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
35900        return SDValue(); // Nothing to do!
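      // INSERTPS immediate: bits[5:4] select the destination element and
      // bits[3:0] are the zero mask; the source element is 0 here so the
      // source-select bits can stay clear.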
35901      PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
35902      Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
35903                        CanonicalizeShuffleInput(MVT::v4f32, V1),
35904                        CanonicalizeShuffleInput(MVT::v4f32, V2),
35905                        DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35906      return DAG.getBitcast(RootVT, Res);
35907    }
35908  }
35909
35910  SDValue NewV1 = V1; // Save operands in case early exit happens.
35911  SDValue NewV2 = V2;
35912  if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
35913                         NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
35914                         ShuffleVT, UnaryShuffle) &&
35915      (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35916    if (Depth == 0 && Root.getOpcode() == Shuffle)
35917      return SDValue(); // Nothing to do!
35918    NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
35919    NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
35920    Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
35921    return DAG.getBitcast(RootVT, Res);
35922  }
35923
35924  NewV1 = V1; // Save operands in case early exit happens.
35925  NewV2 = V2;
35926  if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
35927                                AllowIntDomain, NewV1, NewV2, DL, DAG,
35928                                Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
35929      (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35930    if (Depth == 0 && Root.getOpcode() == Shuffle)
35931      return SDValue(); // Nothing to do!
35932    NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
35933    NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
35934    Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
35935                      DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35936    return DAG.getBitcast(RootVT, Res);
35937  }
35938
35939  // Typically from here on, we need an integer version of MaskVT.
35940  MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
35941  IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
35942
35943  // Annoyingly, SSE4A instructions don't map into the above match helpers.
35944  if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
35945    uint64_t BitLen, BitIdx;
35946    if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
35947                            Zeroable)) {
35948      if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
35949        return SDValue(); // Nothing to do!
35950      V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
35951      Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
35952                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
35953                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
35954      return DAG.getBitcast(RootVT, Res);
35955    }
35956
35957    if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
35958      if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
35959        return SDValue(); // Nothing to do!
35960      V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
35961      V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
35962      Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
35963                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
35964                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
35965      return DAG.getBitcast(RootVT, Res);
35966    }
35967  }
35968
35969  // Match shuffle against TRUNCATE patterns.
35970  if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
35971    // Match against a VTRUNC instruction, accounting for src/dst sizes.
35972    if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
35973                             Subtarget)) {
35974      bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
35975                        ShuffleSrcVT.getVectorNumElements();
35976      unsigned Opc =
35977          IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
35978      if (Depth == 0 && Root.getOpcode() == Opc)
35979        return SDValue(); // Nothing to do!
35980      V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
35981      Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
35982      if (ShuffleVT.getSizeInBits() < RootSizeInBits)
35983        Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
35984      return DAG.getBitcast(RootVT, Res);
35985    }
35986
35987    // Do we need a more general binary truncation pattern?
35988    if (RootSizeInBits < 512 &&
35989        ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
35990         (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
35991        (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
35992        isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
35993      if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
35994        return SDValue(); // Nothing to do!
35995      ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
35996      ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
35997      V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
35998      V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
35999      ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36000      ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
36001      Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
36002      Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
36003      return DAG.getBitcast(RootVT, Res);
36004    }
36005  }
36006
36007  // Don't try to re-form single instruction chains under any circumstances now
36008  // that we've done encoding canonicalization for them.
36009  if (Depth < 1)
36010    return SDValue();
36011
36012  // Depth threshold above which we can efficiently use variable mask shuffles.
36013  int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
36014  AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
36015  // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
36016  // higher depth before combining them.
36017  bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
36018
36019  bool MaskContainsZeros = isAnyZero(Mask);
36020
36021  if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
36022    // If we have a single input lane-crossing shuffle then lower to VPERMV.
36023    if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) {
36024      if (Subtarget.hasAVX2() &&
36025          (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
36026        SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
36027        Res = CanonicalizeShuffleInput(MaskVT, V1);
36028        Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
36029        return DAG.getBitcast(RootVT, Res);
36030      }
36031      // AVX512 variants (non-VLX will pad to 512-bit shuffles).
36032      if ((Subtarget.hasAVX512() &&
36033           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36034            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36035          (Subtarget.hasBWI() &&
36036           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36037          (Subtarget.hasVBMI() &&
36038           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
36039        V1 = CanonicalizeShuffleInput(MaskVT, V1);
36040        V2 = DAG.getUNDEF(MaskVT);
36041        Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36042        return DAG.getBitcast(RootVT, Res);
36043      }
36044    }
36045
36046    // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
36047    // vector as the second source (non-VLX will pad to 512-bit shuffles).
36048    if (UnaryShuffle && AllowVariableMask &&
36049        ((Subtarget.hasAVX512() &&
36050          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36051           MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36052           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
36053           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36054         (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36055          (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36056         (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36057          (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36058      // Adjust shuffle mask - replace SM_SentinelZero with second source index.
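      // E.g. for a v8i64 mask, a zero at element 3 becomes index 11, selecting
      // element 3 of the all-zeros V2.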
36059      for (unsigned i = 0; i != NumMaskElts; ++i)
36060        if (Mask[i] == SM_SentinelZero)
36061          Mask[i] = NumMaskElts + i;
36062      V1 = CanonicalizeShuffleInput(MaskVT, V1);
36063      V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
36064      Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36065      return DAG.getBitcast(RootVT, Res);
36066    }
36067
36068    // If that failed and either input is extracted then try to combine as a
36069    // shuffle with the larger type.
36070    if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36071            Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
36072            DAG, Subtarget))
36073      return WideShuffle;
36074
36075    // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
36076    // (non-VLX will pad to 512-bit shuffles).
36077    if (AllowVariableMask && !MaskContainsZeros &&
36078        ((Subtarget.hasAVX512() &&
36079          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36080           MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36081           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
36082           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
36083         (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36084          (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36085         (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36086          (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36087      V1 = CanonicalizeShuffleInput(MaskVT, V1);
36088      V2 = CanonicalizeShuffleInput(MaskVT, V2);
36089      Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36090      return DAG.getBitcast(RootVT, Res);
36091    }
36092    return SDValue();
36093  }
36094
36095  // See if we can combine a single input shuffle with zeros to a bit-mask,
36096  // which is much simpler than any shuffle.
36097  if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
36098      isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
36099      DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
36100    APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
36101    APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
36102    APInt UndefElts(NumMaskElts, 0);
36103    SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
36104    for (unsigned i = 0; i != NumMaskElts; ++i) {
36105      int M = Mask[i];
36106      if (M == SM_SentinelUndef) {
36107        UndefElts.setBit(i);
36108        continue;
36109      }
36110      if (M == SM_SentinelZero)
36111        continue;
36112      EltBits[i] = AllOnes;
36113    }
36114    SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
36115    Res = CanonicalizeShuffleInput(MaskVT, V1);
36116    unsigned AndOpcode =
36117        MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
36118    Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
36119    return DAG.getBitcast(RootVT, Res);
36120  }
36121
  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes, use a variable-mask VPERMILPS.
  // TODO: Combine other mask types at higher depths.
36125  if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
36126      ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
36127       (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
36128    SmallVector<SDValue, 16> VPermIdx;
36129    for (int M : Mask) {
36130      SDValue Idx =
36131          M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
36132      VPermIdx.push_back(Idx);
36133    }
36134    SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
36135    Res = CanonicalizeShuffleInput(MaskVT, V1);
36136    Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
36137    return DAG.getBitcast(RootVT, Res);
36138  }
36139
36140  // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
36141  // to VPERMIL2PD/VPERMIL2PS.
36142  if (AllowVariableMask && Subtarget.hasXOP() &&
36143      (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
36144       MaskVT == MVT::v8f32)) {
36145    // VPERMIL2 Operation.
36146    // Bits[3] - Match Bit.
36147    // Bits[2:1] - (Per Lane) PD Shuffle Mask.
36148    // Bits[2:0] - (Per Lane) PS Shuffle Mask.
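    // A selector value of 8 (match bit set) is zeroed when M2ZImm == 2; bit 2
    // chooses between V1 and V2 and the low bits (bit 1 for PD, bits[1:0] for
    // PS) pick the element within its 128-bit lane.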
36149    unsigned NumLanes = MaskVT.getSizeInBits() / 128;
36150    unsigned NumEltsPerLane = NumMaskElts / NumLanes;
36151    SmallVector<int, 8> VPerm2Idx;
36152    unsigned M2ZImm = 0;
36153    for (int M : Mask) {
36154      if (M == SM_SentinelUndef) {
36155        VPerm2Idx.push_back(-1);
36156        continue;
36157      }
36158      if (M == SM_SentinelZero) {
36159        M2ZImm = 2;
36160        VPerm2Idx.push_back(8);
36161        continue;
36162      }
36163      int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
36164      Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
36165      VPerm2Idx.push_back(Index);
36166    }
36167    V1 = CanonicalizeShuffleInput(MaskVT, V1);
36168    V2 = CanonicalizeShuffleInput(MaskVT, V2);
36169    SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
36170    Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
36171                      DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
36172    return DAG.getBitcast(RootVT, Res);
36173  }
36174
36175  // If we have 3 or more shuffle instructions or a chain involving a variable
36176  // mask, we can replace them with a single PSHUFB instruction profitably.
  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
  // instructions, but in practice PSHUFB tends to be *very* fast so we're
  // more aggressive.
36180  if (UnaryShuffle && AllowVariableMask &&
36181      ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
36182       (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
36183       (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
36184    SmallVector<SDValue, 16> PSHUFBMask;
36185    int NumBytes = RootVT.getSizeInBits() / 8;
36186    int Ratio = NumBytes / NumMaskElts;
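    // Scale each mask element to Ratio control bytes; a control byte with the
    // top bit set (0x80) zeroes that destination byte.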
36187    for (int i = 0; i < NumBytes; ++i) {
36188      int M = Mask[i / Ratio];
36189      if (M == SM_SentinelUndef) {
36190        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
36191        continue;
36192      }
36193      if (M == SM_SentinelZero) {
36194        PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36195        continue;
36196      }
36197      M = Ratio * M + i % Ratio;
36198      assert((M / 16) == (i / 16) && "Lane crossing detected");
36199      PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36200    }
36201    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
36202    Res = CanonicalizeShuffleInput(ByteVT, V1);
36203    SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
36204    Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
36205    return DAG.getBitcast(RootVT, Res);
36206  }
36207
36208  // With XOP, if we have a 128-bit binary input shuffle we can always combine
36209  // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
36210  // slower than PSHUFB on targets that support both.
36211  if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
36212    // VPPERM Mask Operation
36213    // Bits[4:0] - Byte Index (0 - 31)
36214    // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
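    // A control byte of 0x80 (permute op 4) zeroes the destination byte;
    // values 0-31 index into the 32-byte concatenation of the two sources.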
36215    SmallVector<SDValue, 16> VPPERMMask;
36216    int NumBytes = 16;
36217    int Ratio = NumBytes / NumMaskElts;
36218    for (int i = 0; i < NumBytes; ++i) {
36219      int M = Mask[i / Ratio];
36220      if (M == SM_SentinelUndef) {
36221        VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
36222        continue;
36223      }
36224      if (M == SM_SentinelZero) {
36225        VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36226        continue;
36227      }
36228      M = Ratio * M + i % Ratio;
36229      VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36230    }
36231    MVT ByteVT = MVT::v16i8;
36232    V1 = CanonicalizeShuffleInput(ByteVT, V1);
36233    V2 = CanonicalizeShuffleInput(ByteVT, V2);
36234    SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
36235    Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
36236    return DAG.getBitcast(RootVT, Res);
36237  }
36238
36239  // If that failed and either input is extracted then try to combine as a
36240  // shuffle with the larger type.
36241  if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36242          Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
36243          DAG, Subtarget))
36244    return WideShuffle;
36245
  // If we have a dual input shuffle then lower to VPERMV3
  // (non-VLX will pad to 512-bit shuffles).
36248  if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
36249      ((Subtarget.hasAVX512() &&
36250        (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
36251         MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
36252         MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
36253         MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
36254         MaskVT == MVT::v16i32)) ||
36255       (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36256        (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36257       (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36258        (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36259    V1 = CanonicalizeShuffleInput(MaskVT, V1);
36260    V2 = CanonicalizeShuffleInput(MaskVT, V2);
36261    Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36262    return DAG.getBitcast(RootVT, Res);
36263  }
36264
36265  // Failed to find any combines.
36266  return SDValue();
36267}
36268
36269// Combine an arbitrary chain of shuffles + extract_subvectors into a single
36270// instruction if possible.
36271//
36272// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
36273// type size to attempt to combine:
36274// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
36275// -->
36276// extract_subvector(shuffle(x,y,m2),0)
36277static SDValue combineX86ShuffleChainWithExtract(
36278    ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
36279    bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
36280    const X86Subtarget &Subtarget) {
36281  unsigned NumMaskElts = BaseMask.size();
36282  unsigned NumInputs = Inputs.size();
36283  if (NumInputs == 0)
36284    return SDValue();
36285
36286  EVT RootVT = Root.getValueType();
36287  unsigned RootSizeInBits = RootVT.getSizeInBits();
36288  assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
36289
36290  SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
36291  SmallVector<unsigned, 4> Offsets(NumInputs, 0);
36292
36293  // Peek through subvectors.
36294  // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
36295  unsigned WideSizeInBits = RootSizeInBits;
36296  for (unsigned i = 0; i != NumInputs; ++i) {
36297    SDValue &Src = WideInputs[i];
36298    unsigned &Offset = Offsets[i];
36299    Src = peekThroughBitcasts(Src);
36300    EVT BaseVT = Src.getValueType();
36301    while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
36302      Offset += Src.getConstantOperandVal(1);
36303      Src = Src.getOperand(0);
36304    }
36305    WideSizeInBits = std::max(WideSizeInBits,
36306                              (unsigned)Src.getValueSizeInBits());
36307    assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
36308           "Unexpected subvector extraction");
36309    Offset /= BaseVT.getVectorNumElements();
36310    Offset *= NumMaskElts;
36311  }
36312
  // Bail if we're always extracting from the lowest subvectors;
  // combineX86ShuffleChain should match this for the current width.
36315  if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
36316    return SDValue();
36317
36318  unsigned Scale = WideSizeInBits / RootSizeInBits;
36319  assert((WideSizeInBits % RootSizeInBits) == 0 &&
36320         "Unexpected subvector extraction");
36321
36322  // If the src vector types aren't the same, see if we can extend
36323  // them to match each other.
36324  // TODO: Support different scalar types?
36325  EVT WideSVT = WideInputs[0].getValueType().getScalarType();
36326  if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
36327        return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
36328               Op.getValueType().getScalarType() != WideSVT;
36329      }))
36330    return SDValue();
36331
36332  for (SDValue &NewInput : WideInputs) {
36333    assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
36334           "Shuffle vector size mismatch");
36335    if (WideSizeInBits > NewInput.getValueSizeInBits())
36336      NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
36337                                SDLoc(NewInput), WideSizeInBits);
36338    assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
36339           "Unexpected subvector extraction");
36340  }
36341
36342  // Create new mask for larger type.
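  // Input i's elements occupy the slot starting at i * Scale * NumMaskElts in
  // the widened shuffle; Offsets[i] already accounts for the position of the
  // extracted subvector within that input.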
36343  for (unsigned i = 1; i != NumInputs; ++i)
36344    Offsets[i] += i * Scale * NumMaskElts;
36345
36346  SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
36347  for (int &M : WideMask) {
36348    if (M < 0)
36349      continue;
36350    M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
36351  }
36352  WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
36353
36354  // Remove unused/repeated shuffle source ops.
36355  resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
36356  assert(!WideInputs.empty() && "Shuffle with no inputs detected");
36357
36358  if (WideInputs.size() > 2)
36359    return SDValue();
36360
36361  // Increase depth for every upper subvector we've peeked through.
36362  Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
36363
36364  // Attempt to combine wider chain.
36365  // TODO: Can we use a better Root?
36366  SDValue WideRoot = WideInputs[0];
36367  if (SDValue WideShuffle = combineX86ShuffleChain(
36368          WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
36369          AllowVariableMask, DAG, Subtarget)) {
36370    WideShuffle =
36371        extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
36372    return DAG.getBitcast(RootVT, WideShuffle);
36373  }
36374  return SDValue();
36375}
36376
36377// Canonicalize the combined shuffle mask chain with horizontal ops.
36378// NOTE: This may update the Ops and Mask.
36379static SDValue canonicalizeShuffleMaskWithHorizOp(
36380    MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
36381    unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
36382    const X86Subtarget &Subtarget) {
36383  if (Mask.empty() || Ops.empty())
36384    return SDValue();
36385
36386  SmallVector<SDValue> BC;
36387  for (SDValue Op : Ops)
36388    BC.push_back(peekThroughBitcasts(Op));
36389
36390  // All ops must be the same horizop + type.
36391  SDValue BC0 = BC[0];
36392  EVT VT0 = BC0.getValueType();
36393  unsigned Opcode0 = BC0.getOpcode();
36394  if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
36395        return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
36396      }))
36397    return SDValue();
36398
36399  bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
36400                  Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
36401  bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
36402  if (!isHoriz && !isPack)
36403    return SDValue();
36404
36405  // Do all ops have a single use?
36406  bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
36407    return Op.hasOneUse() &&
36408           peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
36409  });
36410
36411  int NumElts = VT0.getVectorNumElements();
36412  int NumLanes = VT0.getSizeInBits() / 128;
36413  int NumEltsPerLane = NumElts / NumLanes;
36414  int NumHalfEltsPerLane = NumEltsPerLane / 2;
36415  MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
36416  unsigned EltSizeInBits = RootSizeInBits / Mask.size();
36417
36418  if (NumEltsPerLane >= 4 &&
36419      (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
36420    SmallVector<int> LaneMask, ScaledMask;
36421    if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
36422        scaleShuffleElements(LaneMask, 4, ScaledMask)) {
36423      // See if we can remove the shuffle by resorting the HOP chain so that
36424      // the HOP args are pre-shuffled.
36425      // TODO: Generalize to any sized/depth chain.
36426      // TODO: Add support for PACKSS/PACKUS.
36427      if (isHoriz) {
36428        // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
36429        auto GetHOpSrc = [&](int M) {
36430          if (M == SM_SentinelUndef)
36431            return DAG.getUNDEF(VT0);
36432          if (M == SM_SentinelZero)
36433            return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
36434          SDValue Src0 = BC[M / 4];
36435          SDValue Src1 = Src0.getOperand((M % 4) >= 2);
36436          if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
36437            return Src1.getOperand(M % 2);
36438          return SDValue();
36439        };
36440        SDValue M0 = GetHOpSrc(ScaledMask[0]);
36441        SDValue M1 = GetHOpSrc(ScaledMask[1]);
36442        SDValue M2 = GetHOpSrc(ScaledMask[2]);
36443        SDValue M3 = GetHOpSrc(ScaledMask[3]);
36444        if (M0 && M1 && M2 && M3) {
36445          SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
36446          SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
36447          return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36448        }
36449      }
36450      // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
36451      if (Ops.size() >= 2) {
36452        SDValue LHS, RHS;
36453        auto GetHOpSrc = [&](int M, int &OutM) {
36454          // TODO: Support SM_SentinelZero
36455          if (M < 0)
36456            return M == SM_SentinelUndef;
36457          SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
36458          if (!LHS || LHS == Src) {
36459            LHS = Src;
36460            OutM = (M % 2);
36461            return true;
36462          }
36463          if (!RHS || RHS == Src) {
36464            RHS = Src;
36465            OutM = (M % 2) + 2;
36466            return true;
36467          }
36468          return false;
36469        };
36470        int PostMask[4] = {-1, -1, -1, -1};
36471        if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
36472            GetHOpSrc(ScaledMask[1], PostMask[1]) &&
36473            GetHOpSrc(ScaledMask[2], PostMask[2]) &&
36474            GetHOpSrc(ScaledMask[3], PostMask[3])) {
36475          LHS = DAG.getBitcast(SrcVT, LHS);
36476          RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
36477          SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36478          // Use SHUFPS for the permute so this will work on SSE3 targets,
36479          // shuffle combining and domain handling will simplify this later on.
36480          MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
36481          Res = DAG.getBitcast(ShuffleVT, Res);
36482          return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
36483                             getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
36484        }
36485      }
36486    }
36487  }
36488
36489  if (2 < Ops.size())
36490    return SDValue();
36491
36492  SDValue BC1 = BC[BC.size() - 1];
36493  if (Mask.size() == VT0.getVectorNumElements()) {
    // Canonicalize binary shuffles of horizontal ops that use the
    // same sources to a unary shuffle.
36496    // TODO: Try to perform this fold even if the shuffle remains.
36497    if (Ops.size() == 2) {
36498      auto ContainsOps = [](SDValue HOp, SDValue Op) {
36499        return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
36500      };
36501      // Commute if all BC0's ops are contained in BC1.
36502      if (ContainsOps(BC1, BC0.getOperand(0)) &&
36503          ContainsOps(BC1, BC0.getOperand(1))) {
36504        ShuffleVectorSDNode::commuteMask(Mask);
36505        std::swap(Ops[0], Ops[1]);
36506        std::swap(BC0, BC1);
36507      }
36508
36509      // If BC1 can be represented by BC0, then convert to unary shuffle.
36510      if (ContainsOps(BC0, BC1.getOperand(0)) &&
36511          ContainsOps(BC0, BC1.getOperand(1))) {
36512        for (int &M : Mask) {
36513          if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
36514            continue;
36515          int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
36516          M -= NumElts + (SubLane * NumHalfEltsPerLane);
36517          if (BC1.getOperand(SubLane) != BC0.getOperand(0))
36518            M += NumHalfEltsPerLane;
36519        }
36520      }
36521    }
36522
36523    // Canonicalize unary horizontal ops to only refer to lower halves.
36524    for (int i = 0; i != NumElts; ++i) {
36525      int &M = Mask[i];
36526      if (isUndefOrZero(M))
36527        continue;
36528      if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
36529          (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36530        M -= NumHalfEltsPerLane;
36531      if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
36532          (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36533        M -= NumHalfEltsPerLane;
36534    }
36535  }
36536
36537  // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
36538  // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
36539  // represents the LHS/RHS inputs for the lower/upper halves.
36540  SmallVector<int, 16> TargetMask128, WideMask128;
36541  if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
36542      scaleShuffleElements(TargetMask128, 2, WideMask128)) {
36543    assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
36544    bool SingleOp = (Ops.size() == 1);
36545    if (isPack || OneUseOps ||
36546        shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
36547      SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
36548      SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
36549      Lo = Lo.getOperand(WideMask128[0] & 1);
36550      Hi = Hi.getOperand(WideMask128[1] & 1);
36551      if (SingleOp) {
36552        SDValue Undef = DAG.getUNDEF(SrcVT);
36553        SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
36554        Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
36555        Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
36556        Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
36557        Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
36558      }
36559      return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
36560    }
36561  }
36562
36563  return SDValue();
36564}
36565
36566// Attempt to constant fold all of the constant source ops.
36567// Returns true if the entire shuffle is folded to a constant.
36568// TODO: Extend this to merge multiple constant Ops and update the mask.
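// For example (illustrative): given constant sources <1,2,3,4> and <5,6,7,8>
// and mask <0,5,2,7>, the whole shuffle folds to the constant <1,6,3,8>.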
36569static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
36570                                           ArrayRef<int> Mask, SDValue Root,
36571                                           bool HasVariableMask,
36572                                           SelectionDAG &DAG,
36573                                           const X86Subtarget &Subtarget) {
36574  MVT VT = Root.getSimpleValueType();
36575
36576  unsigned SizeInBits = VT.getSizeInBits();
36577  unsigned NumMaskElts = Mask.size();
36578  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
36579  unsigned NumOps = Ops.size();
36580
36581  // Extract constant bits from each source op.
36582  bool OneUseConstantOp = false;
36583  SmallVector<APInt, 16> UndefEltsOps(NumOps);
36584  SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
36585  for (unsigned i = 0; i != NumOps; ++i) {
36586    SDValue SrcOp = Ops[i];
36587    OneUseConstantOp |= SrcOp.hasOneUse();
36588    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
36589                                       RawBitsOps[i]))
36590      return SDValue();
36591  }
36592
  // Only fold if at least one of the constants is only used once or
  // the combined shuffle has included a variable mask shuffle; this
  // avoids constant pool bloat.
36596  if (!OneUseConstantOp && !HasVariableMask)
36597    return SDValue();
36598
36599  // Shuffle the constant bits according to the mask.
36600  SDLoc DL(Root);
36601  APInt UndefElts(NumMaskElts, 0);
36602  APInt ZeroElts(NumMaskElts, 0);
36603  APInt ConstantElts(NumMaskElts, 0);
36604  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
36605                                        APInt::getNullValue(MaskSizeInBits));
36606  for (unsigned i = 0; i != NumMaskElts; ++i) {
36607    int M = Mask[i];
36608    if (M == SM_SentinelUndef) {
36609      UndefElts.setBit(i);
36610      continue;
36611    } else if (M == SM_SentinelZero) {
36612      ZeroElts.setBit(i);
36613      continue;
36614    }
36615    assert(0 <= M && M < (int)(NumMaskElts * NumOps));
36616
36617    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
36618    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
36619
36620    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
36621    if (SrcUndefElts[SrcMaskIdx]) {
36622      UndefElts.setBit(i);
36623      continue;
36624    }
36625
36626    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
36627    APInt &Bits = SrcEltBits[SrcMaskIdx];
36628    if (!Bits) {
36629      ZeroElts.setBit(i);
36630      continue;
36631    }
36632
36633    ConstantElts.setBit(i);
36634    ConstantBitData[i] = Bits;
36635  }
36636  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
36637
36638  // Attempt to create a zero vector.
36639  if ((UndefElts | ZeroElts).isAllOnesValue())
36640    return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
36641
36642  // Create the constant data.
36643  MVT MaskSVT;
36644  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
36645    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
36646  else
36647    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
36648
36649  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
36650  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36651    return SDValue();
36652
36653  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
36654  return DAG.getBitcast(VT, CstOp);
36655}
36656
36657namespace llvm {
36658  namespace X86 {
36659    enum {
36660      MaxShuffleCombineDepth = 8
36661    };
36662  }
36663} // namespace llvm
36664
36665/// Fully generic combining of x86 shuffle instructions.
36666///
36667/// This should be the last combine run over the x86 shuffle instructions. Once
36668/// they have been fully optimized, this will recursively consider all chains
36669/// of single-use shuffle instructions, build a generic model of the cumulative
36670/// shuffle operation, and check for simpler instructions which implement this
36671/// operation. We use this primarily for two purposes:
36672///
36673/// 1) Collapse generic shuffles to specialized single instructions when
36674///    equivalent. In most cases, this is just an encoding size win, but
36675///    sometimes we will collapse multiple generic shuffles into a single
36676///    special-purpose shuffle.
36677/// 2) Look for sequences of shuffle instructions with 3 or more total
36678///    instructions, and replace them with the slightly more expensive SSSE3
36679///    PSHUFB instruction if available. We do this as the last combining step
36680///    to ensure we avoid using PSHUFB if we can implement the shuffle with
36681///    a suitable short sequence of other instructions. The PSHUFB will either
36682///    use a register or have to read from memory and so is slightly (but only
36683///    slightly) more expensive than the other shuffle instructions.
36684///
36685/// Because this is inherently a quadratic operation (for each shuffle in
36686/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
36687/// This should never be an issue in practice as the shuffle lowering doesn't
36688/// produce sequences of more than 8 instructions.
36689///
36690/// FIXME: We will currently miss some cases where the redundant shuffling
36691/// would simplify under the threshold for PSHUFB formation because of
36692/// combine-ordering. To fix this, we should do the redundant instruction
36693/// combining in this recursive walk.
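///
/// As an illustrative example, the chain
///   pshufd(pshufd(x, <2,3,0,1>), <2,3,0,1>)
/// accumulates to the identity mask <0,1,2,3>, so the whole chain collapses
/// back to x.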
36694static SDValue combineX86ShufflesRecursively(
36695    ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
36696    ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
36697    unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,
36698    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
36699  assert(RootMask.size() > 0 &&
36700         (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
36701         "Illegal shuffle root mask");
36702  assert(Root.getSimpleValueType().isVector() &&
36703         "Shuffles operate on vector types!");
36704  unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
36705
36706  // Bound the depth of our recursive combine because this is ultimately
36707  // quadratic in nature.
36708  if (Depth >= MaxDepth)
36709    return SDValue();
36710
36711  // Directly rip through bitcasts to find the underlying operand.
36712  SDValue Op = SrcOps[SrcOpIndex];
36713  Op = peekThroughOneUseBitcasts(Op);
36714
36715  EVT VT = Op.getValueType();
36716  if (!VT.isVector() || !VT.isSimple())
36717    return SDValue(); // Bail if we hit a non-simple non-vector.
36718
36719  assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
         "Can only combine shuffles up to the size of the root op.");
36721
36722  // Extract target shuffle mask and resolve sentinels and inputs.
36723  // TODO - determine Op's demanded elts from RootMask.
36724  SmallVector<int, 64> OpMask;
36725  SmallVector<SDValue, 2> OpInputs;
36726  APInt OpUndef, OpZero;
36727  APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
36728  bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
36729  if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
36730                              OpZero, DAG, Depth, false))
36731    return SDValue();
36732
36733  // Shuffle inputs must not be larger than the shuffle result.
36734  // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
36735  if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
36736        return OpInput.getValueSizeInBits() > VT.getSizeInBits();
36737      }))
36738    return SDValue();
36739
36740  // If the shuffle result was smaller than the root, we need to adjust the
36741  // mask indices and pad the mask with undefs.
36742  if (RootSizeInBits > VT.getSizeInBits()) {
36743    unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
36744    unsigned OpMaskSize = OpMask.size();
36745    if (OpInputs.size() > 1) {
36746      unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
36747      for (int &M : OpMask) {
36748        if (M < 0)
36749          continue;
36750        int EltIdx = M % OpMaskSize;
36751        int OpIdx = M / OpMaskSize;
36752        M = (PaddedMaskSize * OpIdx) + EltIdx;
36753      }
36754    }
36755    OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
36756    OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
36757    OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
36758  }
36759
36760  SmallVector<int, 64> Mask;
36761  SmallVector<SDValue, 16> Ops;
36762
36763  // We don't need to merge masks if the root is empty.
36764  bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
36765  if (EmptyRoot) {
36766    // Only resolve zeros if it will remove an input, otherwise we might end
36767    // up in an infinite loop.
36768    bool ResolveKnownZeros = true;
36769    if (!OpZero.isNullValue()) {
36770      APInt UsedInputs = APInt::getNullValue(OpInputs.size());
36771      for (int i = 0, e = OpMask.size(); i != e; ++i) {
36772        int M = OpMask[i];
36773        if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
36774          continue;
36775        UsedInputs.setBit(M / OpMask.size());
36776        if (UsedInputs.isAllOnesValue()) {
36777          ResolveKnownZeros = false;
36778          break;
36779        }
36780      }
36781    }
36782    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
36783                                      ResolveKnownZeros);
36784
36785    Mask = OpMask;
36786    Ops.append(OpInputs.begin(), OpInputs.end());
36787  } else {
36788    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
36789
36790    // Add the inputs to the Ops list, avoiding duplicates.
36791    Ops.append(SrcOps.begin(), SrcOps.end());
36792
36793    auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
36794      // Attempt to find an existing match.
36795      SDValue InputBC = peekThroughBitcasts(Input);
36796      for (int i = 0, e = Ops.size(); i < e; ++i)
36797        if (InputBC == peekThroughBitcasts(Ops[i]))
36798          return i;
36799      // Match failed - should we replace an existing Op?
36800      if (InsertionPoint >= 0) {
36801        Ops[InsertionPoint] = Input;
36802        return InsertionPoint;
36803      }
36804      // Add to the end of the Ops list.
36805      Ops.push_back(Input);
36806      return Ops.size() - 1;
36807    };
36808
36809    SmallVector<int, 2> OpInputIdx;
36810    for (SDValue OpInput : OpInputs)
36811      OpInputIdx.push_back(
36812          AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
36813
36814    assert(((RootMask.size() > OpMask.size() &&
36815             RootMask.size() % OpMask.size() == 0) ||
36816            (OpMask.size() > RootMask.size() &&
36817             OpMask.size() % RootMask.size() == 0) ||
36818            OpMask.size() == RootMask.size()) &&
36819           "The smaller number of elements must divide the larger.");
36820
36821    // This function can be performance-critical, so we rely on the power-of-2
36822    // knowledge that we have about the mask sizes to replace div/rem ops with
36823    // bit-masks and shifts.
36824    assert(isPowerOf2_32(RootMask.size()) &&
36825           "Non-power-of-2 shuffle mask sizes");
36826    assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
36827    unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
36828    unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
36829
36830    unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
36831    unsigned RootRatio =
36832        std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
36833    unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
36834    assert((RootRatio == 1 || OpRatio == 1) &&
36835           "Must not have a ratio for both incoming and op masks!");
36836
36837    assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
36838    assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
36839    assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
36840    unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
36841    unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
36842
36843    Mask.resize(MaskWidth, SM_SentinelUndef);
36844
36845    // Merge this shuffle operation's mask into our accumulated mask. Note that
36846    // this shuffle's mask will be the first applied to the input, followed by
36847    // the root mask to get us all the way to the root value arrangement. The
36848    // reason for this order is that we are recursing up the operation chain.
36849    for (unsigned i = 0; i < MaskWidth; ++i) {
36850      unsigned RootIdx = i >> RootRatioLog2;
36851      if (RootMask[RootIdx] < 0) {
36852        // This is a zero or undef lane, we're done.
36853        Mask[i] = RootMask[RootIdx];
36854        continue;
36855      }
36856
36857      unsigned RootMaskedIdx =
36858          RootRatio == 1
36859              ? RootMask[RootIdx]
36860              : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
36861
36862      // Just insert the scaled root mask value if it references an input other
36863      // than the SrcOp we're currently inserting.
36864      if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
36865          (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
36866        Mask[i] = RootMaskedIdx;
36867        continue;
36868      }
36869
36870      RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
36871      unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
36872      if (OpMask[OpIdx] < 0) {
36873        // The incoming lanes are zero or undef, it doesn't matter which ones we
36874        // are using.
36875        Mask[i] = OpMask[OpIdx];
36876        continue;
36877      }
36878
36879      // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
36880      unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
36881                                          : (OpMask[OpIdx] << OpRatioLog2) +
36882                                                (RootMaskedIdx & (OpRatio - 1));
36883
36884      OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
36885      int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
36886      assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
36887      OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
36888
36889      Mask[i] = OpMaskedIdx;
36890    }
36891  }
36892
36893  // Remove unused/repeated shuffle source ops.
36894  resolveTargetShuffleInputsAndMask(Ops, Mask);
36895
36896  // Handle the all undef/zero/ones cases early.
36897  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
36898    return DAG.getUNDEF(Root.getValueType());
36899  if (all_of(Mask, [](int Idx) { return Idx < 0; }))
36900    return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
36901                         SDLoc(Root));
36902  if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
36903      none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
36904    return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
36905
36906  assert(!Ops.empty() && "Shuffle with no inputs detected");
36907  HasVariableMask |= IsOpVariableMask;
36908
36909  // Update the list of shuffle nodes that have been combined so far.
36910  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
36911                                                SrcNodes.end());
36912  CombinedNodes.push_back(Op.getNode());
36913
  // See if we can recurse into each shuffle source op (if it's a target
  // shuffle). The source op should only be generally combined if it either has
  // a single use (i.e. current Op) or all its users have already been
  // combined. If not, we can still combine but should prevent the generation
  // of variable shuffles to avoid constant pool bloat.
36919  // Don't recurse if we already have more source ops than we can combine in
36920  // the remaining recursion depth.
36921  if (Ops.size() < (MaxDepth - Depth)) {
36922    for (int i = 0, e = Ops.size(); i < e; ++i) {
36923      // For empty roots, we need to resolve zeroable elements before combining
36924      // them with other shuffles.
36925      SmallVector<int, 64> ResolvedMask = Mask;
36926      if (EmptyRoot)
36927        resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
36928      bool AllowVar = false;
36929      if (Ops[i].getNode()->hasOneUse() ||
36930          SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
36931        AllowVar = AllowVariableMask;
36932      if (SDValue Res = combineX86ShufflesRecursively(
36933              Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
36934              HasVariableMask, AllowVar, DAG, Subtarget))
36935        return Res;
36936    }
36937  }
36938
36939  // Attempt to constant fold all of the constant source ops.
36940  if (SDValue Cst = combineX86ShufflesConstants(
36941          Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
36942    return Cst;
36943
36944  // Canonicalize the combined shuffle mask chain with horizontal ops.
36945  // NOTE: This will update the Ops and Mask.
36946  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
36947          Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
36948    return DAG.getBitcast(Root.getValueType(), HOp);
36949
36950  // Widen any subvector shuffle inputs we've collected.
36951  if (any_of(Ops, [RootSizeInBits](SDValue Op) {
36952        return Op.getValueSizeInBits() < RootSizeInBits;
36953      })) {
36954    for (SDValue &Op : Ops)
36955      if (Op.getValueSizeInBits() < RootSizeInBits)
36956        Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
36957                            RootSizeInBits);
36958    // Reresolve - we might have repeated subvector sources.
36959    resolveTargetShuffleInputsAndMask(Ops, Mask);
36960  }
36961
36962  // We can only combine unary and binary shuffle mask cases.
36963  if (Ops.size() <= 2) {
36964    // Minor canonicalization of the accumulated shuffle mask to make it easier
36965    // to match below. All this does is detect masks with sequential pairs of
36966    // elements, and shrink them to the half-width mask. It does this in a loop
36967    // so it will reduce the size of the mask to the minimal width mask which
36968    // performs an equivalent shuffle.
36969    while (Mask.size() > 1) {
36970      SmallVector<int, 64> WidenedMask;
36971      if (!canWidenShuffleElements(Mask, WidenedMask))
36972        break;
36973      Mask = std::move(WidenedMask);
36974    }
36975
36976    // Canonicalization of binary shuffle masks to improve pattern matching by
36977    // commuting the inputs.
36978    if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
36979      ShuffleVectorSDNode::commuteMask(Mask);
36980      std::swap(Ops[0], Ops[1]);
36981    }
36982
36983    // Finally, try to combine into a single shuffle instruction.
36984    return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
36985                                  AllowVariableMask, DAG, Subtarget);
36986  }
36987
36988  // If that failed and any input is extracted then try to combine as a
36989  // shuffle with the larger type.
36990  return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
36991                                           HasVariableMask, AllowVariableMask,
36992                                           DAG, Subtarget);
36993}
36994
36995/// Helper entry wrapper to combineX86ShufflesRecursively.
36996static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
36997                                             const X86Subtarget &Subtarget) {
36998  return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
36999                                       X86::MaxShuffleCombineDepth,
37000                                       /*HasVarMask*/ false,
37001                                       /*AllowVarMask*/ true, DAG, Subtarget);
37002}
37003
37004/// Get the PSHUF-style mask from PSHUF node.
37005///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
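///
/// For a PSHUFHW node, for example, only the high-half word indices are kept
/// and rebased into the 0..3 range, so the result is always a v4-style mask.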
37008static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
37009  MVT VT = N.getSimpleValueType();
37010  SmallVector<int, 4> Mask;
37011  SmallVector<SDValue, 2> Ops;
37012  bool HaveMask =
37013      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
37014  (void)HaveMask;
37015  assert(HaveMask);
37016
  // If we have more than 128 bits, only the low 128 bits of the shuffle mask
  // matter. Check that the upper masks are repeats and remove them.
37019  if (VT.getSizeInBits() > 128) {
37020    int LaneElts = 128 / VT.getScalarSizeInBits();
37021#ifndef NDEBUG
37022    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
37023      for (int j = 0; j < LaneElts; ++j)
37024        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
37025               "Mask doesn't repeat in high 128-bit lanes!");
37026#endif
37027    Mask.resize(LaneElts);
37028  }
37029
37030  switch (N.getOpcode()) {
37031  case X86ISD::PSHUFD:
37032    return Mask;
37033  case X86ISD::PSHUFLW:
37034    Mask.resize(4);
37035    return Mask;
37036  case X86ISD::PSHUFHW:
37037    Mask.erase(Mask.begin(), Mask.begin() + 4);
37038    for (int &M : Mask)
37039      M -= 4;
37040    return Mask;
37041  default:
37042    llvm_unreachable("No valid shuffle instruction found!");
37043  }
37044}
37045
37046/// Search for a combinable shuffle across a chain ending in pshufd.
37047///
37048/// We walk up the chain and look for a combinable shuffle, skipping over
37049/// shuffles that we could hoist this shuffle's transformation past without
37050/// altering anything.
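///
/// For example (illustrative), pshufd(pshufd(x, <1,0,3,2>), <1,0,3,2>) has its
/// masks merged into a single dword shuffle of x (here the identity); any
/// skipped word shuffles are rebuilt on top of the new node.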
37051static SDValue
37052combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
37053                             SelectionDAG &DAG) {
37054  assert(N.getOpcode() == X86ISD::PSHUFD &&
37055         "Called with something other than an x86 128-bit half shuffle!");
37056  SDLoc DL(N);
37057
37058  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
37059  // of the shuffles in the chain so that we can form a fresh chain to replace
37060  // this one.
37061  SmallVector<SDValue, 8> Chain;
37062  SDValue V = N.getOperand(0);
37063  for (; V.hasOneUse(); V = V.getOperand(0)) {
37064    switch (V.getOpcode()) {
37065    default:
37066      return SDValue(); // Nothing combined!
37067
37068    case ISD::BITCAST:
37069      // Skip bitcasts as we always know the type for the target specific
37070      // instructions.
37071      continue;
37072
37073    case X86ISD::PSHUFD:
37074      // Found another dword shuffle.
37075      break;
37076
37077    case X86ISD::PSHUFLW:
37078      // Check that the low words (being shuffled) are the identity in the
37079      // dword shuffle, and the high words are self-contained.
37080      if (Mask[0] != 0 || Mask[1] != 1 ||
37081          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
37082        return SDValue();
37083
37084      Chain.push_back(V);
37085      continue;
37086
37087    case X86ISD::PSHUFHW:
37088      // Check that the high words (being shuffled) are the identity in the
37089      // dword shuffle, and the low words are self-contained.
37090      if (Mask[2] != 2 || Mask[3] != 3 ||
37091          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
37092        return SDValue();
37093
37094      Chain.push_back(V);
37095      continue;
37096
37097    case X86ISD::UNPCKL:
37098    case X86ISD::UNPCKH:
37099      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
37100      // shuffle into a preceding word shuffle.
37101      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
37102          V.getSimpleValueType().getVectorElementType() != MVT::i16)
37103        return SDValue();
37104
37105      // Search for a half-shuffle which we can combine with.
37106      unsigned CombineOp =
37107          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
37108      if (V.getOperand(0) != V.getOperand(1) ||
37109          !V->isOnlyUserOf(V.getOperand(0).getNode()))
37110        return SDValue();
37111      Chain.push_back(V);
37112      V = V.getOperand(0);
37113      do {
37114        switch (V.getOpcode()) {
37115        default:
37116          return SDValue(); // Nothing to combine.
37117
37118        case X86ISD::PSHUFLW:
37119        case X86ISD::PSHUFHW:
37120          if (V.getOpcode() == CombineOp)
37121            break;
37122
37123          Chain.push_back(V);
37124
37125          LLVM_FALLTHROUGH;
37126        case ISD::BITCAST:
37127          V = V.getOperand(0);
37128          continue;
37129        }
37130        break;
37131      } while (V.hasOneUse());
37132      break;
37133    }
37134    // Break out of the loop if we break out of the switch.
37135    break;
37136  }
37137
37138  if (!V.hasOneUse())
37139    // We fell out of the loop without finding a viable combining instruction.
37140    return SDValue();
37141
37142  // Merge this node's mask and our incoming mask.
37143  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
37144  for (int &M : Mask)
37145    M = VMask[M];
37146  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
37147                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
37148
37149  // Rebuild the chain around this new shuffle.
37150  while (!Chain.empty()) {
37151    SDValue W = Chain.pop_back_val();
37152
37153    if (V.getValueType() != W.getOperand(0).getValueType())
37154      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
37155
37156    switch (W.getOpcode()) {
37157    default:
37158      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
37159
37160    case X86ISD::UNPCKL:
37161    case X86ISD::UNPCKH:
37162      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
37163      break;
37164
37165    case X86ISD::PSHUFD:
37166    case X86ISD::PSHUFLW:
37167    case X86ISD::PSHUFHW:
37168      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
37169      break;
37170    }
37171  }
37172  if (V.getValueType() != N.getValueType())
37173    V = DAG.getBitcast(N.getValueType(), V);
37174
37175  // Return the new chain to replace N.
37176  return V;
37177}
37178
37179// Attempt to commute shufps LHS loads:
37180// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
37181static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
37182                                      SelectionDAG &DAG) {
37183  // TODO: Add vXf64 support.
37184  if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
37185    return SDValue();
37186
37187  // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
37188  auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
37189    if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
37190      return SDValue();
37191    SDValue N0 = V.getOperand(0);
37192    SDValue N1 = V.getOperand(1);
37193    unsigned Imm = V.getConstantOperandVal(2);
37194    if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
37195        MayFoldLoad(peekThroughOneUseBitcasts(N1)))
37196      return SDValue();
37197    Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
37198    return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
37199                       DAG.getTargetConstant(Imm, DL, MVT::i8));
37200  };
37201
37202  switch (N.getOpcode()) {
37203  case X86ISD::VPERMILPI:
37204    if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
37205      unsigned Imm = N.getConstantOperandVal(1);
37206      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
37207                         DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37208    }
37209    break;
37210  case X86ISD::SHUFP: {
37211    SDValue N0 = N.getOperand(0);
37212    SDValue N1 = N.getOperand(1);
37213    unsigned Imm = N.getConstantOperandVal(2);
37214    if (N0 == N1) {
37215      if (SDValue NewSHUFP = commuteSHUFP(N, N0))
37216        return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
37217                           DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37218    } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
37219      return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
37220                         DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
37221    } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
37222      return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
37223                         DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
37224    }
37225    break;
37226  }
37227  }
37228
37229  return SDValue();
37230}
37231
37232// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
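// For example (illustrative): pshufd(and(x, constant)) can be rewritten as
// and(pshufd(x), pshufd(constant)) so the shuffle folds into the constant and
// keeps combining with any shuffles feeding x.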
37233static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
37234                                             const SDLoc &DL) {
37235  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37236  EVT ShuffleVT = N.getValueType();
37237
37238  auto IsMergeableWithShuffle = [](SDValue Op) {
37239    // AllZeros/AllOnes constants are freely shuffled and will peek through
37240    // bitcasts. Other constant build vectors do not peek through bitcasts. Only
37241    // merge with target shuffles if it has one use so shuffle combining is
37242    // likely to kick in.
37243    return ISD::isBuildVectorAllOnes(Op.getNode()) ||
37244           ISD::isBuildVectorAllZeros(Op.getNode()) ||
37245           ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
37246           ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
37247           (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
37248  };
37249  auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
    // Ensure we only shuffle whole vector src elements, unless it's a logical
    // binop where we can more aggressively move shuffles from dst to src.
37252    return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
37253           (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
37254  };
37255
37256  unsigned Opc = N.getOpcode();
37257  switch (Opc) {
37258  // Unary and Unary+Permute Shuffles.
37259  case X86ISD::PSHUFB: {
    // Don't merge PSHUFB if it contains zeroed elements.
37261    SmallVector<int> Mask;
37262    SmallVector<SDValue> Ops;
37263    if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
37264                              Mask))
37265      break;
37266    LLVM_FALLTHROUGH;
37267  }
37268  case X86ISD::VBROADCAST:
37269  case X86ISD::MOVDDUP:
37270  case X86ISD::PSHUFD:
37271  case X86ISD::VPERMI:
37272  case X86ISD::VPERMILPI: {
37273    if (N.getOperand(0).getValueType() == ShuffleVT &&
37274        N->isOnlyUserOf(N.getOperand(0).getNode())) {
37275      SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37276      unsigned SrcOpcode = N0.getOpcode();
37277      if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
37278        SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37279        SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37280        if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
37281          SDValue LHS, RHS;
37282          Op00 = DAG.getBitcast(ShuffleVT, Op00);
37283          Op01 = DAG.getBitcast(ShuffleVT, Op01);
37284          if (N.getNumOperands() == 2) {
37285            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
37286            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
37287          } else {
37288            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
37289            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
37290          }
37291          EVT OpVT = N0.getValueType();
37292          return DAG.getBitcast(ShuffleVT,
37293                                DAG.getNode(SrcOpcode, DL, OpVT,
37294                                            DAG.getBitcast(OpVT, LHS),
37295                                            DAG.getBitcast(OpVT, RHS)));
37296        }
37297      }
37298    }
37299    break;
37300  }
37301  // Binary and Binary+Permute Shuffles.
37302  case X86ISD::INSERTPS: {
    // Don't merge INSERTPS if it contains zeroed elements.
37304    unsigned InsertPSMask = N.getConstantOperandVal(2);
37305    unsigned ZeroMask = InsertPSMask & 0xF;
37306    if (ZeroMask != 0)
37307      break;
37308    LLVM_FALLTHROUGH;
37309  }
37310  case X86ISD::MOVSD:
37311  case X86ISD::MOVSS:
37312  case X86ISD::BLENDI:
37313  case X86ISD::SHUFP:
37314  case X86ISD::UNPCKH:
37315  case X86ISD::UNPCKL: {
37316    if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
37317        N->isOnlyUserOf(N.getOperand(1).getNode())) {
37318      SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37319      SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
37320      unsigned SrcOpcode = N0.getOpcode();
37321      if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
37322          IsSafeToMoveShuffle(N0, SrcOpcode) &&
37323          IsSafeToMoveShuffle(N1, SrcOpcode)) {
37324        SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37325        SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
37326        SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37327        SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
37328        // Ensure the total number of shuffles doesn't increase by folding this
37329        // shuffle through to the source ops.
37330        if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
37331             (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
37332            ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
37333             (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
37334          SDValue LHS, RHS;
37335          Op00 = DAG.getBitcast(ShuffleVT, Op00);
37336          Op10 = DAG.getBitcast(ShuffleVT, Op10);
37337          Op01 = DAG.getBitcast(ShuffleVT, Op01);
37338          Op11 = DAG.getBitcast(ShuffleVT, Op11);
37339          if (N.getNumOperands() == 3) {
37340            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
37341            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
37342          } else {
37343            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
37344            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
37345          }
37346          EVT OpVT = N0.getValueType();
37347          return DAG.getBitcast(ShuffleVT,
37348                                DAG.getNode(SrcOpcode, DL, OpVT,
37349                                            DAG.getBitcast(OpVT, LHS),
37350                                            DAG.getBitcast(OpVT, RHS)));
37351        }
37352      }
37353    }
37354    break;
37355  }
37356  }
37357  return SDValue();
37358}
37359
37360/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
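/// For example (illustrative):
///   vpermf128(movddup(x), movddup(y)) -> movddup(vpermf128(x, y)).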
37361static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
37362                                                      SelectionDAG &DAG,
37363                                                      const SDLoc &DL) {
37364  assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
37365
37366  MVT VT = V.getSimpleValueType();
37367  SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
37368  SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
37369  unsigned SrcOpc0 = Src0.getOpcode();
37370  unsigned SrcOpc1 = Src1.getOpcode();
37371  EVT SrcVT0 = Src0.getValueType();
37372  EVT SrcVT1 = Src1.getValueType();
37373
37374  if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
37375    return SDValue();
37376
37377  switch (SrcOpc0) {
37378  case X86ISD::MOVDDUP: {
37379    SDValue LHS = Src0.getOperand(0);
37380    SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37381    SDValue Res =
37382        DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
37383    Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
37384    return DAG.getBitcast(VT, Res);
37385  }
37386  case X86ISD::VPERMILPI:
37387    // TODO: Handle v4f64 permutes with different low/high lane masks.
37388    if (SrcVT0 == MVT::v4f64) {
37389      uint64_t Mask = Src0.getConstantOperandVal(1);
37390      if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
37391        break;
37392    }
37393    LLVM_FALLTHROUGH;
37394  case X86ISD::VSHLI:
37395  case X86ISD::VSRLI:
37396  case X86ISD::VSRAI:
37397  case X86ISD::PSHUFD:
37398    if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
37399      SDValue LHS = Src0.getOperand(0);
37400      SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37401      SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
37402                                V.getOperand(2));
37403      Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
37404      return DAG.getBitcast(VT, Res);
37405    }
37406    break;
37407  }
37408
37409  return SDValue();
37410}
37411
37412/// Try to combine x86 target specific shuffles.
37413static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
37414                                    TargetLowering::DAGCombinerInfo &DCI,
37415                                    const X86Subtarget &Subtarget) {
37416  SDLoc DL(N);
37417  MVT VT = N.getSimpleValueType();
37418  SmallVector<int, 4> Mask;
37419  unsigned Opcode = N.getOpcode();
37420
37421  if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
37422    return R;
37423
37424  if (SDValue R = canonicalizeShuffleWithBinOps(N, DAG, DL))
37425    return R;
37426
37427  // Handle specific target shuffles.
37428  switch (Opcode) {
37429  case X86ISD::MOVDDUP: {
37430    SDValue Src = N.getOperand(0);
37431    // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
37432    if (VT == MVT::v2f64 && Src.hasOneUse() &&
37433        ISD::isNormalLoad(Src.getNode())) {
37434      LoadSDNode *LN = cast<LoadSDNode>(Src);
37435      if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
37436        SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
37437        DCI.CombineTo(N.getNode(), Movddup);
37438        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37439        DCI.recursivelyDeleteUnusedNodes(LN);
37440        return N; // Return N so it doesn't get rechecked!
37441      }
37442    }
37443
37444    return SDValue();
37445  }
37446  case X86ISD::VBROADCAST: {
37447    SDValue Src = N.getOperand(0);
37448    SDValue BC = peekThroughBitcasts(Src);
37449    EVT SrcVT = Src.getValueType();
37450    EVT BCVT = BC.getValueType();
37451
37452    // If broadcasting from another shuffle, attempt to simplify it.
37453    // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
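    // Only the elements covering the broadcast scalar are demanded, so deeper
    // shuffle chains feeding the broadcast can often be simplified before the
    // broadcast is rebuilt below.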
37454    if (isTargetShuffle(BC.getOpcode()) &&
37455        VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
37456      unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
37457      SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
37458                                        SM_SentinelUndef);
37459      for (unsigned i = 0; i != Scale; ++i)
37460        DemandedMask[i] = i;
37461      if (SDValue Res = combineX86ShufflesRecursively(
37462              {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
37463              X86::MaxShuffleCombineDepth,
37464              /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
37465        return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37466                           DAG.getBitcast(SrcVT, Res));
37467    }
37468
37469    // broadcast(bitcast(src)) -> bitcast(broadcast(src))
37470    // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
37471    if (Src.getOpcode() == ISD::BITCAST &&
37472        SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
37473        DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
37474      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
37475                                   VT.getVectorNumElements());
37476      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
37477    }
37478
37479    // Reduce broadcast source vector to lowest 128-bits.
37480    if (SrcVT.getSizeInBits() > 128)
37481      return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37482                         extract128BitVector(Src, 0, DAG, DL));
37483
37484    // broadcast(scalar_to_vector(x)) -> broadcast(x).
37485    if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
37486      return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
37487
37488    // Share broadcast with the longest vector and extract low subvector (free).
37489    // Ensure the same SDValue from the SDNode use is being used.
37490    for (SDNode *User : Src->uses())
37491      if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
37492          Src == User->getOperand(0) &&
37493          User->getValueSizeInBits(0).getFixedSize() >
37494              VT.getFixedSizeInBits()) {
37495        return extractSubVector(SDValue(User, 0), 0, DAG, DL,
37496                                VT.getSizeInBits());
37497      }
37498
37499    // vbroadcast(scalarload X) -> vbroadcast_load X
37500    // For float loads, extract other uses of the scalar from the broadcast.
37501    if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
37502        ISD::isNormalLoad(Src.getNode())) {
37503      LoadSDNode *LN = cast<LoadSDNode>(Src);
37504      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37505      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37506      SDValue BcastLd =
37507          DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37508                                  LN->getMemoryVT(), LN->getMemOperand());
37509      // If the load value is used only by N, replace it via CombineTo N.
37510      bool NoReplaceExtract = Src.hasOneUse();
37511      DCI.CombineTo(N.getNode(), BcastLd);
37512      if (NoReplaceExtract) {
37513        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37514        DCI.recursivelyDeleteUnusedNodes(LN);
37515      } else {
37516        SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
37517                                  DAG.getIntPtrConstant(0, DL));
37518        DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
37519      }
37520      return N; // Return N so it doesn't get rechecked!
37521    }
37522
37523    // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
37524    // i16. So shrink it ourselves if we can make a broadcast_load.
37525    if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
37526        Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
37527      assert(Subtarget.hasAVX2() && "Expected AVX2");
37528      SDValue TruncIn = Src.getOperand(0);
37529
      // If this is a truncate of a non-extending load, we can just narrow it
      // to use a broadcast_load.
37532      if (ISD::isNormalLoad(TruncIn.getNode())) {
37533        LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
        // Unless it's volatile or atomic.
37535        if (LN->isSimple()) {
37536          SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37537          SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37538          SDValue BcastLd = DAG.getMemIntrinsicNode(
37539              X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37540              LN->getPointerInfo(), LN->getOriginalAlign(),
37541              LN->getMemOperand()->getFlags());
37542          DCI.CombineTo(N.getNode(), BcastLd);
37543          DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37544          DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37545          return N; // Return N so it doesn't get rechecked!
37546        }
37547      }
37548
37549      // If this is a truncate of an i16 extload, we can directly replace it.
37550      if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
37551          ISD::isEXTLoad(Src.getOperand(0).getNode())) {
37552        LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
37553        if (LN->getMemoryVT().getSizeInBits() == 16) {
37554          SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37555          SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37556          SDValue BcastLd =
37557              DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37558                                      LN->getMemoryVT(), LN->getMemOperand());
37559          DCI.CombineTo(N.getNode(), BcastLd);
37560          DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37561          DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37562          return N; // Return N so it doesn't get rechecked!
37563        }
37564      }
37565
      // If this is a truncate of a load that has been shifted right, we can
      // offset the pointer and use a narrower load.
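      // For example (illustrative), a truncate of (srl (i64 load X), 16) can
      // become a 16-bit broadcast_load of X at byte offset 2.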
37568      if (TruncIn.getOpcode() == ISD::SRL &&
37569          TruncIn.getOperand(0).hasOneUse() &&
37570          isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
37571          ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
37572        LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
37573        unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
37574        // Make sure the shift amount and the load size are divisible by 16.
37575        // Don't do this if the load is volatile or atomic.
37576        if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
37577            LN->isSimple()) {
37578          unsigned Offset = ShiftAmt / 8;
37579          SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37580          SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
37581                                                 TypeSize::Fixed(Offset), DL);
37582          SDValue Ops[] = { LN->getChain(), Ptr };
37583          SDValue BcastLd = DAG.getMemIntrinsicNode(
37584              X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37585              LN->getPointerInfo().getWithOffset(Offset),
37586              LN->getOriginalAlign(),
37587              LN->getMemOperand()->getFlags());
37588          DCI.CombineTo(N.getNode(), BcastLd);
37589          DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37590          DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37591          return N; // Return N so it doesn't get rechecked!
37592        }
37593      }
37594    }
37595
37596    // vbroadcast(vzload X) -> vbroadcast_load X
37597    if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
37598      MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
37599      if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
37600        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37601        SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37602        SDValue BcastLd =
37603            DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37604                                    LN->getMemoryVT(), LN->getMemOperand());
37605        DCI.CombineTo(N.getNode(), BcastLd);
37606        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37607        DCI.recursivelyDeleteUnusedNodes(LN);
37608        return N; // Return N so it doesn't get rechecked!
37609      }
37610    }
37611
37612    // vbroadcast(vector load X) -> vbroadcast_load
37613    if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
37614         SrcVT == MVT::v4i32) &&
37615        Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
37616      LoadSDNode *LN = cast<LoadSDNode>(Src);
37617      // Unless the load is volatile or atomic.
37618      if (LN->isSimple()) {
37619        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37620        SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37621        SDValue BcastLd = DAG.getMemIntrinsicNode(
37622            X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
37623            LN->getPointerInfo(), LN->getOriginalAlign(),
37624            LN->getMemOperand()->getFlags());
37625        DCI.CombineTo(N.getNode(), BcastLd);
37626        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37627        DCI.recursivelyDeleteUnusedNodes(LN);
37628        return N; // Return N so it doesn't get rechecked!
37629      }
37630    }
37631
37632    return SDValue();
37633  }
37634  case X86ISD::VZEXT_MOVL: {
37635    SDValue N0 = N.getOperand(0);
37636
37637    // If this is a vzmovl of a full vector load, replace it with a vzload,
37638    // unless the load is volatile.
37639    if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
37640      auto *LN = cast<LoadSDNode>(N0);
37641      if (SDValue VZLoad =
37642              narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
37643        DCI.CombineTo(N.getNode(), VZLoad);
37644        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37645        DCI.recursivelyDeleteUnusedNodes(LN);
37646        return N;
37647      }
37648    }
37649
37650    // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the
37651    // broadcast and can just use a VZEXT_LOAD.
37652    // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
37653    if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
37654      auto *LN = cast<MemSDNode>(N0);
37655      if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
37656        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37657        SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37658        SDValue VZLoad =
37659            DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
37660                                    LN->getMemoryVT(), LN->getMemOperand());
37661        DCI.CombineTo(N.getNode(), VZLoad);
37662        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37663        DCI.recursivelyDeleteUnusedNodes(LN);
37664        return N;
37665      }
37666    }
37667
37668    // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
37669    // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
37670    // if the upper bits of the i64 are zero.
37671    if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37672        N0.getOperand(0).hasOneUse() &&
37673        N0.getOperand(0).getValueType() == MVT::i64) {
37674      SDValue In = N0.getOperand(0);
37675      APInt Mask = APInt::getHighBitsSet(64, 32);
37676      if (DAG.MaskedValueIsZero(In, Mask)) {
37677        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
37678        MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
37679        SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
37680        SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
37681        return DAG.getBitcast(VT, Movl);
37682      }
37683    }
37684
37685    // Load a scalar integer constant directly to XMM instead of transferring an
37686    // immediate value from GPR.
37687    // vzext_movl (scalar_to_vector C) --> load [C,0...]
37688    if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37689      if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
37690        // Create a vector constant - scalar constant followed by zeros.
37691        EVT ScalarVT = N0.getOperand(0).getValueType();
37692        Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
37693        unsigned NumElts = VT.getVectorNumElements();
37694        Constant *Zero = ConstantInt::getNullValue(ScalarTy);
37695        SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
37696        ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
37697
37698        // Load the vector constant from constant pool.
37699        MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
37700        SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
37701        MachinePointerInfo MPI =
37702            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
37703        Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
37704        return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
37705                           MachineMemOperand::MOLoad);
37706      }
37707    }
37708
37709    // Pull subvector inserts into undef through VZEXT_MOVL by making it an
37710    // insert into a zero vector. This helps get VZEXT_MOVL closer to
37711    // scalar_to_vectors where 256/512 are canonicalized to an insert and a
37712    // 128-bit scalar_to_vector. This reduces the number of isel patterns.
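    // I.e. (ignoring any intervening one-use bitcasts):
    //   vzext_movl (insert_subvector undef, X, 0)
    //     --> insert_subvector (zero vector), (vzext_movl X), 0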
37713    if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
37714      SDValue V = peekThroughOneUseBitcasts(N0);
37715
37716      if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
37717          isNullConstant(V.getOperand(2))) {
37718        SDValue In = V.getOperand(1);
37719        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
37720                                     In.getValueSizeInBits() /
37721                                         VT.getScalarSizeInBits());
37722        In = DAG.getBitcast(SubVT, In);
37723        SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
37724        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
37725                           getZeroVector(VT, Subtarget, DAG, DL), Movl,
37726                           V.getOperand(2));
37727      }
37728    }
37729
37730    return SDValue();
37731  }
37732  case X86ISD::BLENDI: {
37733    SDValue N0 = N.getOperand(0);
37734    SDValue N1 = N.getOperand(1);
37735
37736    // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
37737    // TODO: Handle MVT::v16i16 repeated blend mask.
37738    if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
37739        N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
37740      MVT SrcVT = N0.getOperand(0).getSimpleValueType();
37741      if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
37742          SrcVT.getScalarSizeInBits() >= 32) {
37743        unsigned BlendMask = N.getConstantOperandVal(2);
37744        unsigned Size = VT.getVectorNumElements();
37745        unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
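        // Repeat each blend mask bit Scale times so it selects the same lanes
        // in the narrower type, e.g. a v4i64 blend mask 0b0101 becomes the
        // v8i32 blend mask 0b00110011.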
37746        BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
37747        return DAG.getBitcast(
37748            VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
37749                            N1.getOperand(0),
37750                            DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
37751      }
37752    }
37753    return SDValue();
37754  }
37755  case X86ISD::VPERMI: {
37756    // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
37757    // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
37758    SDValue N0 = N.getOperand(0);
37759    SDValue N1 = N.getOperand(1);
37760    unsigned EltSizeInBits = VT.getScalarSizeInBits();
37761    if (N0.getOpcode() == ISD::BITCAST &&
37762        N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
37763      SDValue Src = N0.getOperand(0);
37764      EVT SrcVT = Src.getValueType();
37765      SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
37766      return DAG.getBitcast(VT, Res);
37767    }
37768    return SDValue();
37769  }
37770  case X86ISD::VPERM2X128: {
37771    // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
37772    SDValue LHS = N->getOperand(0);
37773    SDValue RHS = N->getOperand(1);
37774    if (LHS.getOpcode() == ISD::BITCAST &&
37775        (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
37776      EVT SrcVT = LHS.getOperand(0).getValueType();
37777      if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
37778        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
37779                                              DAG.getBitcast(SrcVT, LHS),
37780                                              DAG.getBitcast(SrcVT, RHS),
37781                                              N->getOperand(2)));
37782      }
37783    }
37784
37785    // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
37786    if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
37787      return Res;
37788
37789    // Fold vperm2x128 subvector shuffle with an inner concat pattern.
37790    // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
37791    auto FindSubVector128 = [&](unsigned Idx) {
37792      if (Idx > 3)
37793        return SDValue();
37794      SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
37795      SmallVector<SDValue> SubOps;
37796      if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
37797        return SubOps[Idx & 1];
37798      unsigned NumElts = Src.getValueType().getVectorNumElements();
37799      if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
37800          Src.getOperand(1).getValueSizeInBits() == 128 &&
37801          Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
37802        return Src.getOperand(1);
37803      }
37804      return SDValue();
37805    };
37806    unsigned Imm = N.getConstantOperandVal(2);
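    // Each immediate nibble selects one 128-bit half of the result: indices
    // 0/1 refer to the halves of the first source and 2/3 to the halves of
    // the second source (lane-zeroing encodings are rejected by the Idx > 3
    // check above).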
37807    if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
37808      if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
37809        MVT SubVT = VT.getHalfNumVectorElementsVT();
37810        SubLo = DAG.getBitcast(SubVT, SubLo);
37811        SubHi = DAG.getBitcast(SubVT, SubHi);
37812        return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
37813      }
37814    }
37815    return SDValue();
37816  }
37817  case X86ISD::PSHUFD:
37818  case X86ISD::PSHUFLW:
37819  case X86ISD::PSHUFHW:
37820    Mask = getPSHUFShuffleMask(N);
37821    assert(Mask.size() == 4);
37822    break;
37823  case X86ISD::MOVSD:
37824  case X86ISD::MOVSS: {
37825    SDValue N0 = N.getOperand(0);
37826    SDValue N1 = N.getOperand(1);
37827
37828    // Canonicalize scalar FPOps:
37829    // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
37830    // If commutable, allow OP(N1[0], N0[0]).
37831    unsigned Opcode1 = N1.getOpcode();
37832    if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
37833        Opcode1 == ISD::FDIV) {
37834      SDValue N10 = N1.getOperand(0);
37835      SDValue N11 = N1.getOperand(1);
37836      if (N10 == N0 ||
37837          (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
37838        if (N10 != N0)
37839          std::swap(N10, N11);
37840        MVT SVT = VT.getVectorElementType();
37841        SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
37842        N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
37843        N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
37844        SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
37845        SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
37846        return DAG.getNode(Opcode, DL, VT, N0, SclVec);
37847      }
37848    }
37849
37850    return SDValue();
37851  }
37852  case X86ISD::INSERTPS: {
37853    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
37854    SDValue Op0 = N.getOperand(0);
37855    SDValue Op1 = N.getOperand(1);
37856    unsigned InsertPSMask = N.getConstantOperandVal(2);
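    // The INSERTPS immediate encodes: bits [7:6] = source element of Op1,
    // bits [5:4] = destination element in Op0, bits [3:0] = zero mask.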
37857    unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
37858    unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
37859    unsigned ZeroMask = InsertPSMask & 0xF;
37860
37861    // If we zero out all elements from Op0 then we don't need to reference it.
37862    if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
37863      return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
37864                         DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37865
37866    // If we zero out the element from Op1 then we don't need to reference it.
37867    if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
37868      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
37869                         DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37870
37871    // Attempt to merge insertps Op1 with an inner target shuffle node.
37872    SmallVector<int, 8> TargetMask1;
37873    SmallVector<SDValue, 2> Ops1;
37874    APInt KnownUndef1, KnownZero1;
37875    if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
37876                                     KnownZero1)) {
37877      if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
37878        // Zero/UNDEF insertion - zero out element and remove dependency.
37879        InsertPSMask |= (1u << DstIdx);
37880        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
37881                           DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37882      }
37883      // Update insertps mask srcidx and reference the source input directly.
37884      int M = TargetMask1[SrcIdx];
37885      assert(0 <= M && M < 8 && "Shuffle index out of range");
37886      InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
37887      Op1 = Ops1[M < 4 ? 0 : 1];
37888      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
37889                         DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37890    }
37891
37892    // Attempt to merge insertps Op0 with an inner target shuffle node.
37893    SmallVector<int, 8> TargetMask0;
37894    SmallVector<SDValue, 2> Ops0;
37895    APInt KnownUndef0, KnownZero0;
37896    if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
37897                                     KnownZero0)) {
37898      bool Updated = false;
37899      bool UseInput00 = false;
37900      bool UseInput01 = false;
37901      for (int i = 0; i != 4; ++i) {
37902        if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
37903          // No change if element is already zero or the inserted element.
37904          continue;
37905        } else if (KnownUndef0[i] || KnownZero0[i]) {
37906          // If the target mask is undef/zero then we must zero the element.
37907          InsertPSMask |= (1u << i);
37908          Updated = true;
37909          continue;
37910        }
37911
37912        // The input element must stay in place (lane i of either input).
37913        int M = TargetMask0[i];
37914        if (M != i && M != (i + 4))
37915          return SDValue();
37916
37917        // Determine which inputs of the target shuffle we're using.
37918        UseInput00 |= (0 <= M && M < 4);
37919        UseInput01 |= (4 <= M);
37920      }
37921
37922      // If we're not using both inputs of the target shuffle then use the
37923      // referenced input directly.
37924      if (UseInput00 && !UseInput01) {
37925        Updated = true;
37926        Op0 = Ops0[0];
37927      } else if (!UseInput00 && UseInput01) {
37928        Updated = true;
37929        Op0 = Ops0[1];
37930      }
37931
37932      if (Updated)
37933        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
37934                           DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37935    }
37936
37937    // If we're inserting an element from a vbroadcast load, fold the
37938    // load into the X86insertps instruction. We need to convert the scalar
37939    // load to a vector and clear the source lane of the INSERTPS control.
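    // Clearing bits [7:6] (the source lane) is safe because SCALAR_TO_VECTOR
    // places the loaded value in lane 0 of the new Op1.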
37940    if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
37941      auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
37942      if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
37943        SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
37944                                   MemIntr->getBasePtr(),
37945                                   MemIntr->getMemOperand());
37946        SDValue Insert = DAG.getNode(
37947            X86ISD::INSERTPS, DL, VT, Op0,
37948            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Load),
37949            DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
37950        DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
37951        return Insert;
37952      }
37953    }
37954
37955    return SDValue();
37956  }
37957  default:
37958    return SDValue();
37959  }
37960
37961  // Nuke no-op shuffles that show up after combining.
37962  if (isNoopShuffleMask(Mask))
37963    return N.getOperand(0);
37964
37965  // Look for simplifications involving one or two shuffle instructions.
37966  SDValue V = N.getOperand(0);
37967  switch (N.getOpcode()) {
37968  default:
37969    break;
37970  case X86ISD::PSHUFLW:
37971  case X86ISD::PSHUFHW:
37972    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
37973
37974    // See if this reduces to a PSHUFD which is no more expensive and can
37975    // combine with more operations. Note that it has to at least flip the
37976    // dwords as otherwise it would have been removed as a no-op.
37977    if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
37978      int DMask[] = {0, 1, 2, 3};
37979      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
37980      DMask[DOffset + 0] = DOffset + 1;
37981      DMask[DOffset + 1] = DOffset + 0;
37982      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
37983      V = DAG.getBitcast(DVT, V);
37984      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
37985                      getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
37986      return DAG.getBitcast(VT, V);
37987    }
37988
37989    // Look for shuffle patterns which can be implemented as a single unpack.
37990    // FIXME: This doesn't handle the location of the PSHUFD generically, and
37991    // only works when we have a PSHUFD followed by two half-shuffles.
37992    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
37993        (V.getOpcode() == X86ISD::PSHUFLW ||
37994         V.getOpcode() == X86ISD::PSHUFHW) &&
37995        V.getOpcode() != N.getOpcode() &&
37996        V.hasOneUse() && V.getOperand(0).hasOneUse()) {
37997      SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
37998      if (D.getOpcode() == X86ISD::PSHUFD) {
37999        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
38000        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
38001        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38002        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38003        int WordMask[8];
38004        for (int i = 0; i < 4; ++i) {
38005          WordMask[i + NOffset] = Mask[i] + NOffset;
38006          WordMask[i + VOffset] = VMask[i] + VOffset;
38007        }
38008        // Map the word mask through the DWord mask.
38009        int MappedMask[8];
38010        for (int i = 0; i < 8; ++i)
38011          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
38012        if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
38013            makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
38014          // We can replace all three shuffles with an unpack.
38015          V = DAG.getBitcast(VT, D.getOperand(0));
38016          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
38017                                                : X86ISD::UNPCKH,
38018                             DL, VT, V, V);
38019        }
38020      }
38021    }
38022
38023    break;
38024
38025  case X86ISD::PSHUFD:
38026    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
38027      return NewN;
38028
38029    break;
38030  }
38031
38032  return SDValue();
38033}
38034
38035/// Checks if the shuffle mask takes its elements alternately from two
38036/// vectors, with each element kept at its original position.
38037/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
38038static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
38039
38040  int ParitySrc[2] = {-1, -1};
38041  unsigned Size = Mask.size();
38042  for (unsigned i = 0; i != Size; ++i) {
38043    int M = Mask[i];
38044    if (M < 0)
38045      continue;
38046
38047    // Make sure we are using the matching element from the input.
38048    if ((M % Size) != i)
38049      return false;
38050
38051    // Make sure we use the same input for all elements of the same parity.
38052    int Src = M / Size;
38053    if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
38054      return false;
38055    ParitySrc[i % 2] = Src;
38056  }
38057
38058  // Make sure each input is used.
38059  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
38060    return false;
38061
38062  Op0Even = ParitySrc[0] == 0;
38063  return true;
38064}
38065
38066/// Returns true iff the shuffle node \p N can be replaced with an
38067/// ADDSUB(SUBADD) operation. If true is returned then the operands of the
38068/// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and
38069/// \p Opnd1.
38070///
38071/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
38072/// shuffle nodes so it is easier to generically match. We also insert dummy
38073/// vector shuffle nodes for the operands which explicitly discard the unused
38074/// lanes, so that the rest of the combiner can see that they are unused.
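///
/// For example, a v4f32 shuffle of (fsub A, B) and (fadd A, B) with mask
/// <0, 5, 2, 7> yields Opnd0 = A, Opnd1 = B and IsSubAdd = false (an ADDSUB
/// candidate).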
38075static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
38076                             SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
38077                             bool &IsSubAdd) {
38078
38079  EVT VT = N->getValueType(0);
38080  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38081  if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
38082      !VT.getSimpleVT().isFloatingPoint())
38083    return false;
38084
38085  // We only handle target-independent shuffles.
38086  // FIXME: It would be easy and harmless to use the target shuffle mask
38087  // extraction tool to support more.
38088  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38089    return false;
38090
38091  SDValue V1 = N->getOperand(0);
38092  SDValue V2 = N->getOperand(1);
38093
38094  // Make sure we have an FADD and an FSUB.
38095  if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
38096      (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
38097      V1.getOpcode() == V2.getOpcode())
38098    return false;
38099
38100  // If there are other uses of these operations we can't fold them.
38101  if (!V1->hasOneUse() || !V2->hasOneUse())
38102    return false;
38103
38104  // Ensure that both operations have the same operands. Note that we can
38105  // commute the FADD operands.
38106  SDValue LHS, RHS;
38107  if (V1.getOpcode() == ISD::FSUB) {
38108    LHS = V1->getOperand(0); RHS = V1->getOperand(1);
38109    if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
38110        (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
38111      return false;
38112  } else {
38113    assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
38114    LHS = V2->getOperand(0); RHS = V2->getOperand(1);
38115    if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
38116        (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
38117      return false;
38118  }
38119
38120  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38121  bool Op0Even;
38122  if (!isAddSubOrSubAddMask(Mask, Op0Even))
38123    return false;
38124
38125  // It's a subadd if the source supplying the even elements is the FADD.
38126  IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
38127                     : V2->getOpcode() == ISD::FADD;
38128
38129  Opnd0 = LHS;
38130  Opnd1 = RHS;
38131  return true;
38132}
38133
38134/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
38135static SDValue combineShuffleToFMAddSub(SDNode *N,
38136                                        const X86Subtarget &Subtarget,
38137                                        SelectionDAG &DAG) {
38138  // We only handle target-independent shuffles.
38139  // FIXME: It would be easy and harmless to use the target shuffle mask
38140  // extraction tool to support more.
38141  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38142    return SDValue();
38143
38144  MVT VT = N->getSimpleValueType(0);
38145  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38146  if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
38147    return SDValue();
38148
38149  // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
38150  SDValue Op0 = N->getOperand(0);
38151  SDValue Op1 = N->getOperand(1);
38152  SDValue FMAdd = Op0, FMSub = Op1;
38153  if (FMSub.getOpcode() != X86ISD::FMSUB)
38154    std::swap(FMAdd, FMSub);
38155
38156  if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
38157      FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
38158      FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
38159      FMAdd.getOperand(2) != FMSub.getOperand(2))
38160    return SDValue();
38161
38162  // Check for correct shuffle mask.
38163  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38164  bool Op0Even;
38165  if (!isAddSubOrSubAddMask(Mask, Op0Even))
38166    return SDValue();
38167
38168  // FMAddSub takes zeroth operand from FMSub node.
38169  SDLoc DL(N);
38170  bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
38171  unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38172  return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
38173                     FMAdd.getOperand(2));
38174}
38175
38176/// Try to combine a shuffle into a target-specific add-sub or
38177/// mul-add-sub node.
38178static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
38179                                                const X86Subtarget &Subtarget,
38180                                                SelectionDAG &DAG) {
38181  if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
38182    return V;
38183
38184  SDValue Opnd0, Opnd1;
38185  bool IsSubAdd;
38186  if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
38187    return SDValue();
38188
38189  MVT VT = N->getSimpleValueType(0);
38190  SDLoc DL(N);
38191
38192  // Try to generate X86ISD::FMADDSUB node here.
38193  SDValue Opnd2;
38194  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
38195    unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38196    return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
38197  }
38198
38199  if (IsSubAdd)
38200    return SDValue();
38201
38202  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
38203  // the ADDSUB idiom has been successfully recognized. There are no known
38204  // X86 targets with 512-bit ADDSUB instructions!
38205  if (VT.is512BitVector())
38206    return SDValue();
38207
38208  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
38209}
38210
38211// We are looking for a shuffle where both sources are concatenated with undef
38212// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
38213// if we can express this as a single-source shuffle, that's preferable.
38214static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
38215                                           const X86Subtarget &Subtarget) {
38216  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
38217    return SDValue();
38218
38219  EVT VT = N->getValueType(0);
38220
38221  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
38222  if (!VT.is128BitVector() && !VT.is256BitVector())
38223    return SDValue();
38224
38225  if (VT.getVectorElementType() != MVT::i32 &&
38226      VT.getVectorElementType() != MVT::i64 &&
38227      VT.getVectorElementType() != MVT::f32 &&
38228      VT.getVectorElementType() != MVT::f64)
38229    return SDValue();
38230
38231  SDValue N0 = N->getOperand(0);
38232  SDValue N1 = N->getOperand(1);
38233
38234  // Check that both sources are concats with undef.
38235  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
38236      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
38237      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
38238      !N1.getOperand(1).isUndef())
38239    return SDValue();
38240
38241  // Construct the new shuffle mask. Elements from the first source retain their
38242  // index, but elements from the second source no longer need to skip an undef.
38243  SmallVector<int, 8> Mask;
38244  int NumElts = VT.getVectorNumElements();
38245
38246  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
38247  for (int Elt : SVOp->getMask())
38248    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
38249
38250  SDLoc DL(N);
38251  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
38252                               N1.getOperand(0));
38253  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
38254}
38255
38256/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
38257/// low half of each source vector and does not set any high half elements in
38258/// the destination vector, narrow the shuffle to half its original size.
38259static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
38260  if (!Shuf->getValueType(0).isSimple())
38261    return SDValue();
38262  MVT VT = Shuf->getSimpleValueType(0);
38263  if (!VT.is256BitVector() && !VT.is512BitVector())
38264    return SDValue();
38265
38266  // See if we can ignore all of the high elements of the shuffle.
38267  ArrayRef<int> Mask = Shuf->getMask();
38268  if (!isUndefUpperHalf(Mask))
38269    return SDValue();
38270
38271  // Check if the shuffle mask accesses only the low half of each input vector
38272  // (half-index output is 0 or 2).
38273  int HalfIdx1, HalfIdx2;
38274  SmallVector<int, 8> HalfMask(Mask.size() / 2);
38275  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
38276      (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
38277    return SDValue();
38278
38279  // Create a half-width shuffle to replace the unnecessarily wide shuffle.
38280  // The trick is knowing that all of the insert/extract are actually free
38281  // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
38282  // of narrow inputs into a narrow output, and that is always cheaper than
38283  // the wide shuffle that we started with.
38284  return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
38285                               Shuf->getOperand(1), HalfMask, HalfIdx1,
38286                               HalfIdx2, false, DAG, /*UseConcat*/true);
38287}
38288
38289static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
38290                              TargetLowering::DAGCombinerInfo &DCI,
38291                              const X86Subtarget &Subtarget) {
38292  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
38293    if (SDValue V = narrowShuffle(Shuf, DAG))
38294      return V;
38295
38296  // If we have legalized the vector types, look for blends of FADD and FSUB
38297  // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
38298  SDLoc dl(N);
38299  EVT VT = N->getValueType(0);
38300  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38301  if (TLI.isTypeLegal(VT))
38302    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
38303      return AddSub;
38304
38305  // Attempt to combine into a vector load/broadcast.
38306  if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
38307                                             Subtarget, true))
38308    return LD;
38309
38310  // For AVX2, we sometimes want to combine
38311  // (vector_shuffle <mask> (concat_vectors t1, undef)
38312  //                        (concat_vectors t2, undef))
38313  // Into:
38314  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
38315  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
38316  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
38317    return ShufConcat;
38318
38319  if (isTargetShuffle(N->getOpcode())) {
38320    SDValue Op(N, 0);
38321    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
38322      return Shuffle;
38323
38324    // Try recursively combining arbitrary sequences of x86 shuffle
38325    // instructions into higher-order shuffles. We do this after combining
38326    // specific PSHUF instruction sequences into their minimal form so that we
38327    // can evaluate how many specialized shuffle instructions are involved in
38328    // a particular chain.
38329    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
38330      return Res;
38331
38332    // Simplify source operands based on shuffle mask.
38333    // TODO - merge this into combineX86ShufflesRecursively.
38334    APInt KnownUndef, KnownZero;
38335    APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
38336    if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
38337                                       DCI))
38338      return SDValue(N, 0);
38339  }
38340
38341  return SDValue();
38342}
38343
38344// Simplify variable target shuffle masks based on the demanded elements.
38345// TODO: Handle DemandedBits in mask indices as well?
38346bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
38347    SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
38348    TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
38349  // If we're demanding all elements don't bother trying to simplify the mask.
38350  unsigned NumElts = DemandedElts.getBitWidth();
38351  if (DemandedElts.isAllOnesValue())
38352    return false;
38353
38354  SDValue Mask = Op.getOperand(MaskIndex);
38355  if (!Mask.hasOneUse())
38356    return false;
38357
38358  // Attempt to generically simplify the variable shuffle mask.
38359  APInt MaskUndef, MaskZero;
38360  if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
38361                                 Depth + 1))
38362    return true;
38363
38364  // Attempt to extract+simplify a (constant pool load) shuffle mask.
38365  // TODO: Support other types from getTargetShuffleMaskIndices?
38366  SDValue BC = peekThroughOneUseBitcasts(Mask);
38367  EVT BCVT = BC.getValueType();
38368  auto *Load = dyn_cast<LoadSDNode>(BC);
38369  if (!Load)
38370    return false;
38371
38372  const Constant *C = getTargetConstantFromNode(Load);
38373  if (!C)
38374    return false;
38375
38376  Type *CTy = C->getType();
38377  if (!CTy->isVectorTy() ||
38378      CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
38379    return false;
38380
38381  // Handle scaling for i64 elements on 32-bit targets.
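  // For example, a vXi64 shuffle mask may have been materialized as a
  // constant pool vector with twice as many i32 elements on 32-bit targets,
  // in which case Scale == 2 below.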
38382  unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
38383  if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
38384    return false;
38385  unsigned Scale = NumCstElts / NumElts;
38386
38387  // Simplify mask if we have an undemanded element that is not undef.
38388  bool Simplified = false;
38389  SmallVector<Constant *, 32> ConstVecOps;
38390  for (unsigned i = 0; i != NumCstElts; ++i) {
38391    Constant *Elt = C->getAggregateElement(i);
38392    if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
38393      ConstVecOps.push_back(UndefValue::get(Elt->getType()));
38394      Simplified = true;
38395      continue;
38396    }
38397    ConstVecOps.push_back(Elt);
38398  }
38399  if (!Simplified)
38400    return false;
38401
38402  // Generate new constant pool entry + legalize immediately for the load.
38403  SDLoc DL(Op);
38404  SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
38405  SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
38406  SDValue NewMask = TLO.DAG.getLoad(
38407      BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
38408      MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
38409      Load->getAlign());
38410  return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
38411}
38412
38413bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
38414    SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
38415    TargetLoweringOpt &TLO, unsigned Depth) const {
38416  int NumElts = DemandedElts.getBitWidth();
38417  unsigned Opc = Op.getOpcode();
38418  EVT VT = Op.getValueType();
38419
38420  // Handle special case opcodes.
38421  switch (Opc) {
38422  case X86ISD::PMULDQ:
38423  case X86ISD::PMULUDQ: {
38424    APInt LHSUndef, LHSZero;
38425    APInt RHSUndef, RHSZero;
38426    SDValue LHS = Op.getOperand(0);
38427    SDValue RHS = Op.getOperand(1);
38428    if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
38429                                   Depth + 1))
38430      return true;
38431    if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
38432                                   Depth + 1))
38433      return true;
38434    // Multiply by zero.
38435    KnownZero = LHSZero | RHSZero;
38436    break;
38437  }
38438  case X86ISD::VSHL:
38439  case X86ISD::VSRL:
38440  case X86ISD::VSRA: {
38441    // We only need the bottom 64-bits of the (128-bit) shift amount.
38442    SDValue Amt = Op.getOperand(1);
38443    MVT AmtVT = Amt.getSimpleValueType();
38444    assert(AmtVT.is128BitVector() && "Unexpected value type");
38445
38446    // If the shift amount is only ever used as an SSE shift amount then we
38447    // know that only the bottom 64-bits are ever used.
38448    bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
38449      unsigned UseOpc = Use->getOpcode();
38450      return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
38451              UseOpc == X86ISD::VSRA) &&
38452             Use->getOperand(0) != Amt;
38453    });
38454
38455    APInt AmtUndef, AmtZero;
38456    unsigned NumAmtElts = AmtVT.getVectorNumElements();
38457    APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
38458    if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
38459                                   Depth + 1, AssumeSingleUse))
38460      return true;
38461    LLVM_FALLTHROUGH;
38462  }
38463  case X86ISD::VSHLI:
38464  case X86ISD::VSRLI:
38465  case X86ISD::VSRAI: {
38466    SDValue Src = Op.getOperand(0);
38467    APInt SrcUndef;
38468    if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
38469                                   Depth + 1))
38470      return true;
38471
38472    // Aggressively peek through ops to get at the demanded elts.
38473    if (!DemandedElts.isAllOnesValue())
38474      if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38475              Src, DemandedElts, TLO.DAG, Depth + 1))
38476        return TLO.CombineTo(
38477            Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
38478    break;
38479  }
38480  case X86ISD::KSHIFTL: {
38481    SDValue Src = Op.getOperand(0);
38482    auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38483    assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38484    unsigned ShiftAmt = Amt->getZExtValue();
38485
38486    if (ShiftAmt == 0)
38487      return TLO.CombineTo(Op, Src);
38488
38489    // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
38490    // single shift.  We can do this if the bottom bits (which are shifted
38491    // out) are never demanded.
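    // E.g. (kshiftl (kshiftr X, 4), 6) --> (kshiftl X, 2) when none of the
    // low 6 elements are demanded.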
38492    if (Src.getOpcode() == X86ISD::KSHIFTR) {
38493      if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
38494        unsigned C1 = Src.getConstantOperandVal(1);
38495        unsigned NewOpc = X86ISD::KSHIFTL;
38496        int Diff = ShiftAmt - C1;
38497        if (Diff < 0) {
38498          Diff = -Diff;
38499          NewOpc = X86ISD::KSHIFTR;
38500        }
38501
38502        SDLoc dl(Op);
38503        SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38504        return TLO.CombineTo(
38505            Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38506      }
38507    }
38508
38509    APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
38510    if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38511                                   Depth + 1))
38512      return true;
38513
38514    KnownUndef <<= ShiftAmt;
38515    KnownZero <<= ShiftAmt;
38516    KnownZero.setLowBits(ShiftAmt);
38517    break;
38518  }
38519  case X86ISD::KSHIFTR: {
38520    SDValue Src = Op.getOperand(0);
38521    auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38522    assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38523    unsigned ShiftAmt = Amt->getZExtValue();
38524
38525    if (ShiftAmt == 0)
38526      return TLO.CombineTo(Op, Src);
38527
38528    // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
38529    // single shift.  We can do this if the top bits (which are shifted
38530    // out) are never demanded.
38531    if (Src.getOpcode() == X86ISD::KSHIFTL) {
38532      if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
38533        unsigned C1 = Src.getConstantOperandVal(1);
38534        unsigned NewOpc = X86ISD::KSHIFTR;
38535        int Diff = ShiftAmt - C1;
38536        if (Diff < 0) {
38537          Diff = -Diff;
38538          NewOpc = X86ISD::KSHIFTL;
38539        }
38540
38541        SDLoc dl(Op);
38542        SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38543        return TLO.CombineTo(
38544            Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38545      }
38546    }
38547
38548    APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
38549    if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38550                                   Depth + 1))
38551      return true;
38552
38553    KnownUndef.lshrInPlace(ShiftAmt);
38554    KnownZero.lshrInPlace(ShiftAmt);
38555    KnownZero.setHighBits(ShiftAmt);
38556    break;
38557  }
38558  case X86ISD::CVTSI2P:
38559  case X86ISD::CVTUI2P: {
38560    SDValue Src = Op.getOperand(0);
38561    MVT SrcVT = Src.getSimpleValueType();
38562    APInt SrcUndef, SrcZero;
38563    APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38564    if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38565                                   Depth + 1))
38566      return true;
38567    break;
38568  }
38569  case X86ISD::PACKSS:
38570  case X86ISD::PACKUS: {
38571    SDValue N0 = Op.getOperand(0);
38572    SDValue N1 = Op.getOperand(1);
38573
38574    APInt DemandedLHS, DemandedRHS;
38575    getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38576
38577    APInt LHSUndef, LHSZero;
38578    if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38579                                   Depth + 1))
38580      return true;
38581    APInt RHSUndef, RHSZero;
38582    if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38583                                   Depth + 1))
38584      return true;
38585
38586    // TODO - pass on known zero/undef.
38587
38588    // Aggressively peek through ops to get at the demanded elts.
38589    // TODO - we should do this for all target/faux shuffles ops.
38590    if (!DemandedElts.isAllOnesValue()) {
38591      SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38592                                                            TLO.DAG, Depth + 1);
38593      SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38594                                                            TLO.DAG, Depth + 1);
38595      if (NewN0 || NewN1) {
38596        NewN0 = NewN0 ? NewN0 : N0;
38597        NewN1 = NewN1 ? NewN1 : N1;
38598        return TLO.CombineTo(Op,
38599                             TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38600      }
38601    }
38602    break;
38603  }
38604  case X86ISD::HADD:
38605  case X86ISD::HSUB:
38606  case X86ISD::FHADD:
38607  case X86ISD::FHSUB: {
38608    SDValue N0 = Op.getOperand(0);
38609    SDValue N1 = Op.getOperand(1);
38610
38611    APInt DemandedLHS, DemandedRHS;
38612    getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38613
38614    APInt LHSUndef, LHSZero;
38615    if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38616                                   Depth + 1))
38617      return true;
38618    APInt RHSUndef, RHSZero;
38619    if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38620                                   Depth + 1))
38621      return true;
38622
38623    // TODO - pass on known zero/undef.
38624
38625    // Aggressively peek through ops to get at the demanded elts.
38626    // TODO: Handle repeated operands.
38627    if (N0 != N1 && !DemandedElts.isAllOnesValue()) {
38628      SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38629                                                            TLO.DAG, Depth + 1);
38630      SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38631                                                            TLO.DAG, Depth + 1);
38632      if (NewN0 || NewN1) {
38633        NewN0 = NewN0 ? NewN0 : N0;
38634        NewN1 = NewN1 ? NewN1 : N1;
38635        return TLO.CombineTo(Op,
38636                             TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38637      }
38638    }
38639    break;
38640  }
38641  case X86ISD::VTRUNC:
38642  case X86ISD::VTRUNCS:
38643  case X86ISD::VTRUNCUS: {
38644    SDValue Src = Op.getOperand(0);
38645    MVT SrcVT = Src.getSimpleValueType();
38646    APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38647    APInt SrcUndef, SrcZero;
38648    if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
38649                                   Depth + 1))
38650      return true;
38651    KnownZero = SrcZero.zextOrTrunc(NumElts);
38652    KnownUndef = SrcUndef.zextOrTrunc(NumElts);
38653    break;
38654  }
38655  case X86ISD::BLENDV: {
38656    APInt SelUndef, SelZero;
38657    if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
38658                                   SelZero, TLO, Depth + 1))
38659      return true;
38660
38661    // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
38662    APInt LHSUndef, LHSZero;
38663    if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
38664                                   LHSZero, TLO, Depth + 1))
38665      return true;
38666
38667    APInt RHSUndef, RHSZero;
38668    if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
38669                                   RHSZero, TLO, Depth + 1))
38670      return true;
38671
38672    KnownZero = LHSZero & RHSZero;
38673    KnownUndef = LHSUndef & RHSUndef;
38674    break;
38675  }
38676  case X86ISD::VZEXT_MOVL: {
38677    // If upper demanded elements are already zero then we have nothing to do.
38678    SDValue Src = Op.getOperand(0);
38679    APInt DemandedUpperElts = DemandedElts;
38680    DemandedUpperElts.clearLowBits(1);
38681    if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
38682      return TLO.CombineTo(Op, Src);
38683    break;
38684  }
38685  case X86ISD::VBROADCAST: {
38686    SDValue Src = Op.getOperand(0);
38687    MVT SrcVT = Src.getSimpleValueType();
38688    if (!SrcVT.isVector())
38689      break;
38690    // Don't bother broadcasting if we just need the 0'th element.
38691    if (DemandedElts == 1) {
38692      if (Src.getValueType() != VT)
38693        Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
38694                             SDLoc(Op));
38695      return TLO.CombineTo(Op, Src);
38696    }
38697    APInt SrcUndef, SrcZero;
38698    APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
38699    if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38700                                   Depth + 1))
38701      return true;
38702    // Aggressively peek through src to get at the demanded elt.
38703    // TODO - we should do this for all target/faux shuffles ops.
38704    if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38705            Src, SrcElts, TLO.DAG, Depth + 1))
38706      return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
38707    break;
38708  }
38709  case X86ISD::VPERMV:
38710    if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
38711                                                   Depth))
38712      return true;
38713    break;
38714  case X86ISD::PSHUFB:
38715  case X86ISD::VPERMV3:
38716  case X86ISD::VPERMILPV:
38717    if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
38718                                                   Depth))
38719      return true;
38720    break;
38721  case X86ISD::VPPERM:
38722  case X86ISD::VPERMIL2:
38723    if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
38724                                                   Depth))
38725      return true;
38726    break;
38727  }
38728
38729  // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
38730  // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
38731  // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
38732  if ((VT.is256BitVector() || VT.is512BitVector()) &&
38733      DemandedElts.lshr(NumElts / 2) == 0) {
38734    unsigned SizeInBits = VT.getSizeInBits();
38735    unsigned ExtSizeInBits = SizeInBits / 2;
38736
38737    // See if 512-bit ops only use the bottom 128-bits.
38738    if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
38739      ExtSizeInBits = SizeInBits / 4;
38740
38741    switch (Opc) {
38742      // Scalar broadcast.
38743    case X86ISD::VBROADCAST: {
38744      SDLoc DL(Op);
38745      SDValue Src = Op.getOperand(0);
38746      if (Src.getValueSizeInBits() > ExtSizeInBits)
38747        Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
38748      EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38749                                    ExtSizeInBits / VT.getScalarSizeInBits());
38750      SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
38751      return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38752                                               TLO.DAG, DL, ExtSizeInBits));
38753    }
38754    case X86ISD::VBROADCAST_LOAD: {
38755      SDLoc DL(Op);
38756      auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
38757      EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38758                                    ExtSizeInBits / VT.getScalarSizeInBits());
38759      SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
38760      SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
38761      SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
38762          X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
38763          MemIntr->getMemOperand());
38764      TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
38765                                           Bcst.getValue(1));
38766      return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38767                                               TLO.DAG, DL, ExtSizeInBits));
38768    }
38769      // Subvector broadcast.
38770    case X86ISD::SUBV_BROADCAST_LOAD: {
38771      auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
38772      EVT MemVT = MemIntr->getMemoryVT();
38773      if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
38774        SDLoc DL(Op);
38775        SDValue Ld =
38776            TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
38777                            MemIntr->getBasePtr(), MemIntr->getMemOperand());
38778        TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
38779                                             Ld.getValue(1));
38780        return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
38781                                                 TLO.DAG, DL, ExtSizeInBits));
38782      } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
38783        SDLoc DL(Op);
38784        EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38785                                      ExtSizeInBits / VT.getScalarSizeInBits());
38786        SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
38787        SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
38788        SDValue Bcst =
38789            TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
38790                                        Ops, MemVT, MemIntr->getMemOperand());
38791        TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
38792                                             Bcst.getValue(1));
38793        return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38794                                                 TLO.DAG, DL, ExtSizeInBits));
38795      }
38796      break;
38797    }
38798      // Byte shifts by immediate.
38799    case X86ISD::VSHLDQ:
38800    case X86ISD::VSRLDQ:
38801      // Shift by uniform.
38802    case X86ISD::VSHL:
38803    case X86ISD::VSRL:
38804    case X86ISD::VSRA:
38805      // Shift by immediate.
38806    case X86ISD::VSHLI:
38807    case X86ISD::VSRLI:
38808    case X86ISD::VSRAI: {
38809      SDLoc DL(Op);
38810      SDValue Ext0 =
38811          extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
38812      SDValue ExtOp =
38813          TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
38814      SDValue UndefVec = TLO.DAG.getUNDEF(VT);
38815      SDValue Insert =
38816          insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
38817      return TLO.CombineTo(Op, Insert);
38818    }
38819    case X86ISD::VPERMI: {
38820      // Simplify PERMPD/PERMQ to extract_subvector.
38821      // TODO: This should be done in shuffle combining.
38822      if (VT == MVT::v4f64 || VT == MVT::v4i64) {
38823        SmallVector<int, 4> Mask;
38824        DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
38825        if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
38826          SDLoc DL(Op);
38827          SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
38828          SDValue UndefVec = TLO.DAG.getUNDEF(VT);
38829          SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
38830          return TLO.CombineTo(Op, Insert);
38831        }
38832      }
38833      break;
38834    }
38835      // Zero upper elements.
38836    case X86ISD::VZEXT_MOVL:
38837      // Target unary shuffles by immediate:
38838    case X86ISD::PSHUFD:
38839    case X86ISD::PSHUFLW:
38840    case X86ISD::PSHUFHW:
38841    case X86ISD::VPERMILPI:
38842      // (Non-Lane Crossing) Target Shuffles.
38843    case X86ISD::VPERMILPV:
38844    case X86ISD::VPERMIL2:
38845    case X86ISD::PSHUFB:
38846    case X86ISD::UNPCKL:
38847    case X86ISD::UNPCKH:
38848    case X86ISD::BLENDI:
38849      // Integer ops.
38850    case X86ISD::AVG:
38851    case X86ISD::PACKSS:
38852    case X86ISD::PACKUS:
38853      // Horizontal Ops.
38854    case X86ISD::HADD:
38855    case X86ISD::HSUB:
38856    case X86ISD::FHADD:
38857    case X86ISD::FHSUB: {
38858      SDLoc DL(Op);
38859      SmallVector<SDValue, 4> Ops;
38860      for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
38861        SDValue SrcOp = Op.getOperand(i);
38862        EVT SrcVT = SrcOp.getValueType();
38863        assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
38864               "Unsupported vector size");
38865        Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
38866                                                          ExtSizeInBits)
38867                                       : SrcOp);
38868      }
38869      MVT ExtVT = VT.getSimpleVT();
38870      ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
38871                               ExtSizeInBits / ExtVT.getScalarSizeInBits());
38872      SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
38873      SDValue UndefVec = TLO.DAG.getUNDEF(VT);
38874      SDValue Insert =
38875          insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
38876      return TLO.CombineTo(Op, Insert);
38877    }
38878    }
38879  }
38880
38881  // Get target/faux shuffle mask.
38882  APInt OpUndef, OpZero;
38883  SmallVector<int, 64> OpMask;
38884  SmallVector<SDValue, 2> OpInputs;
38885  if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
38886                              OpZero, TLO.DAG, Depth, false))
38887    return false;
38888
38889  // Shuffle inputs must be the same size as the result.
38890  if (OpMask.size() != (unsigned)NumElts ||
38891      llvm::any_of(OpInputs, [VT](SDValue V) {
38892        return VT.getSizeInBits() != V.getValueSizeInBits() ||
38893               !V.getValueType().isVector();
38894      }))
38895    return false;
38896
38897  KnownZero = OpZero;
38898  KnownUndef = OpUndef;
38899
38900  // Check if shuffle mask can be simplified to undef/zero/identity.
38901  int NumSrcs = OpInputs.size();
38902  for (int i = 0; i != NumElts; ++i)
38903    if (!DemandedElts[i])
38904      OpMask[i] = SM_SentinelUndef;
38905
38906  if (isUndefInRange(OpMask, 0, NumElts)) {
38907    KnownUndef.setAllBits();
38908    return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
38909  }
38910  if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
38911    KnownZero.setAllBits();
38912    return TLO.CombineTo(
38913        Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
38914  }
38915  for (int Src = 0; Src != NumSrcs; ++Src)
38916    if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
38917      return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
38918
38919  // Attempt to simplify inputs.
38920  for (int Src = 0; Src != NumSrcs; ++Src) {
38921    // TODO: Support inputs of different types.
38922    if (OpInputs[Src].getValueType() != VT)
38923      continue;
38924
38925    int Lo = Src * NumElts;
38926    APInt SrcElts = APInt::getNullValue(NumElts);
38927    for (int i = 0; i != NumElts; ++i)
38928      if (DemandedElts[i]) {
38929        int M = OpMask[i] - Lo;
38930        if (0 <= M && M < NumElts)
38931          SrcElts.setBit(M);
38932      }
38933
38934    // TODO - Propagate input undef/zero elts.
38935    APInt SrcUndef, SrcZero;
38936    if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
38937                                   TLO, Depth + 1))
38938      return true;
38939  }
38940
38941  // If we don't demand all elements, then attempt to combine to a simpler
38942  // shuffle.
38943  // We need to convert the depth to something combineX86ShufflesRecursively
38944  // can handle - so pretend it's Depth == 0 again, and reduce the max depth
38945  // to match. This prevents combineX86ShuffleChain from returning a
38946  // combined shuffle that's the same as the original root, causing an
38947  // infinite loop.
38948  if (!DemandedElts.isAllOnesValue()) {
38949    assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
38950
38951    SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
38952    for (int i = 0; i != NumElts; ++i)
38953      if (DemandedElts[i])
38954        DemandedMask[i] = i;
38955
38956    SDValue NewShuffle = combineX86ShufflesRecursively(
38957        {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
38958        /*HasVarMask*/ false,
38959        /*AllowVarMask*/ true, TLO.DAG, Subtarget);
38960    if (NewShuffle)
38961      return TLO.CombineTo(Op, NewShuffle);
38962  }
38963
38964  return false;
38965}
38966
38967bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
38968    SDValue Op, const APInt &OriginalDemandedBits,
38969    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
38970    unsigned Depth) const {
38971  EVT VT = Op.getValueType();
38972  unsigned BitWidth = OriginalDemandedBits.getBitWidth();
38973  unsigned Opc = Op.getOpcode();
38974  switch (Opc) {
38975  case X86ISD::VTRUNC: {
38976    KnownBits KnownOp;
38977    SDValue Src = Op.getOperand(0);
38978    MVT SrcVT = Src.getSimpleValueType();
38979
38980    // Simplify the input, using demanded bit information.
38981    APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
38982    APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
38983    if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
38984      return true;
38985    break;
38986  }
38987  case X86ISD::PMULDQ:
38988  case X86ISD::PMULUDQ: {
38989    // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
38990    KnownBits KnownOp;
38991    SDValue LHS = Op.getOperand(0);
38992    SDValue RHS = Op.getOperand(1);
38993    // FIXME: Can we bound this better?
38994    APInt DemandedMask = APInt::getLowBitsSet(64, 32);
38995    if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
38996                             TLO, Depth + 1))
38997      return true;
38998    if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
38999                             TLO, Depth + 1))
39000      return true;
39001
39002    // Aggressively peek through ops to get at the demanded low bits.
39003    SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
39004        LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39005    SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
39006        RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39007    if (DemandedLHS || DemandedRHS) {
39008      DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
39009      DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
39010      return TLO.CombineTo(
39011          Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
39012    }
39013    break;
39014  }
39015  case X86ISD::VSHLI: {
39016    SDValue Op0 = Op.getOperand(0);
39017
39018    unsigned ShAmt = Op.getConstantOperandVal(1);
39019    if (ShAmt >= BitWidth)
39020      break;
39021
39022    APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
39023
39024    // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
39025    // single shift.  We can do this if the bottom bits (which are shifted
39026    // out) are never demanded.
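    // For example, with at least 4 low bits of the result undemanded:
    //   ((X >>u 2) << 4) --> (X << 2)
    // since the low result bits that differ are never demanded.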
39027    if (Op0.getOpcode() == X86ISD::VSRLI &&
39028        OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
39029      unsigned Shift2Amt = Op0.getConstantOperandVal(1);
39030      if (Shift2Amt < BitWidth) {
39031        int Diff = ShAmt - Shift2Amt;
39032        if (Diff == 0)
39033          return TLO.CombineTo(Op, Op0.getOperand(0));
39034
39035        unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
39036        SDValue NewShift = TLO.DAG.getNode(
39037            NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
39038            TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
39039        return TLO.CombineTo(Op, NewShift);
39040      }
39041    }
39042
39043    // If we are only demanding sign bits then we can use the shift source directly.
39044    unsigned NumSignBits =
39045        TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
39046    unsigned UpperDemandedBits =
39047        BitWidth - OriginalDemandedBits.countTrailingZeros();
39048    if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39049      return TLO.CombineTo(Op, Op0);
39050
39051    if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39052                             TLO, Depth + 1))
39053      return true;
39054
39055    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39056    Known.Zero <<= ShAmt;
39057    Known.One <<= ShAmt;
39058
39059    // Low bits known zero.
39060    Known.Zero.setLowBits(ShAmt);
39061    return false;
39062  }
39063  case X86ISD::VSRLI: {
39064    unsigned ShAmt = Op.getConstantOperandVal(1);
39065    if (ShAmt >= BitWidth)
39066      break;
39067
39068    APInt DemandedMask = OriginalDemandedBits << ShAmt;
39069
39070    if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
39071                             OriginalDemandedElts, Known, TLO, Depth + 1))
39072      return true;
39073
39074    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39075    Known.Zero.lshrInPlace(ShAmt);
39076    Known.One.lshrInPlace(ShAmt);
39077
39078    // High bits known zero.
39079    Known.Zero.setHighBits(ShAmt);
39080    return false;
39081  }
39082  case X86ISD::VSRAI: {
39083    SDValue Op0 = Op.getOperand(0);
39084    SDValue Op1 = Op.getOperand(1);
39085
39086    unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
39087    if (ShAmt >= BitWidth)
39088      break;
39089
39090    APInt DemandedMask = OriginalDemandedBits << ShAmt;
39091
39092    // If we just want the sign bit then we don't need to shift it.
39093    if (OriginalDemandedBits.isSignMask())
39094      return TLO.CombineTo(Op, Op0);
39095
39096    // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
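    // e.g. if each element of X is sign-extended from i16 (at least 17 sign
    // bits), then (VSRAI (VSHLI X, 8), 8) --> X.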
39097    if (Op0.getOpcode() == X86ISD::VSHLI &&
39098        Op.getOperand(1) == Op0.getOperand(1)) {
39099      SDValue Op00 = Op0.getOperand(0);
39100      unsigned NumSignBits =
39101          TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
39102      if (ShAmt < NumSignBits)
39103        return TLO.CombineTo(Op, Op00);
39104    }
39105
39106    // If any of the demanded bits are produced by the sign extension, we also
39107    // demand the input sign bit.
39108    if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
39109      DemandedMask.setSignBit();
39110
39111    if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39112                             TLO, Depth + 1))
39113      return true;
39114
39115    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39116    Known.Zero.lshrInPlace(ShAmt);
39117    Known.One.lshrInPlace(ShAmt);
39118
39119    // If the input sign bit is known to be zero, or if none of the top bits
39120    // are demanded, turn this into an unsigned shift right.
39121    if (Known.Zero[BitWidth - ShAmt - 1] ||
39122        OriginalDemandedBits.countLeadingZeros() >= ShAmt)
39123      return TLO.CombineTo(
39124          Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
39125
39126    // High bits are known one.
39127    if (Known.One[BitWidth - ShAmt - 1])
39128      Known.One.setHighBits(ShAmt);
39129    return false;
39130  }
39131  case X86ISD::PEXTRB:
39132  case X86ISD::PEXTRW: {
39133    SDValue Vec = Op.getOperand(0);
39134    auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
39135    MVT VecVT = Vec.getSimpleValueType();
39136    unsigned NumVecElts = VecVT.getVectorNumElements();
39137
39138    if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
39139      unsigned Idx = CIdx->getZExtValue();
39140      unsigned VecBitWidth = VecVT.getScalarSizeInBits();
39141
39142      // If we demand no bits from the vector then we must have demanded
39143      // bits from the implicit zext - simplify to zero.
39144      APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
39145      if (DemandedVecBits == 0)
39146        return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39147
39148      APInt KnownUndef, KnownZero;
39149      APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
39150      if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
39151                                     KnownZero, TLO, Depth + 1))
39152        return true;
39153
39154      KnownBits KnownVec;
39155      if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
39156                               KnownVec, TLO, Depth + 1))
39157        return true;
39158
39159      if (SDValue V = SimplifyMultipleUseDemandedBits(
39160              Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
39161        return TLO.CombineTo(
39162            Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
39163
39164      Known = KnownVec.zext(BitWidth);
39165      return false;
39166    }
39167    break;
39168  }
39169  case X86ISD::PINSRB:
39170  case X86ISD::PINSRW: {
39171    SDValue Vec = Op.getOperand(0);
39172    SDValue Scl = Op.getOperand(1);
39173    auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39174    MVT VecVT = Vec.getSimpleValueType();
39175
39176    if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
39177      unsigned Idx = CIdx->getZExtValue();
39178      if (!OriginalDemandedElts[Idx])
39179        return TLO.CombineTo(Op, Vec);
39180
39181      KnownBits KnownVec;
39182      APInt DemandedVecElts(OriginalDemandedElts);
39183      DemandedVecElts.clearBit(Idx);
39184      if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
39185                               KnownVec, TLO, Depth + 1))
39186        return true;
39187
39188      KnownBits KnownScl;
39189      unsigned NumSclBits = Scl.getScalarValueSizeInBits();
39190      APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
39191      if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
39192        return true;
39193
39194      KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
39195      Known = KnownBits::commonBits(KnownVec, KnownScl);
39196      return false;
39197    }
39198    break;
39199  }
39200  case X86ISD::PACKSS:
39201    // PACKSS saturates to MIN/MAX integer values. So if we just want the
39202    // sign bit then we can just ask for the source operands' sign bits.
39203    // TODO - add known bits handling.
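    // e.g. (MOVMSK (PACKSS X, Y)) only demands the sign bit of each packed
    // element, which saturation preserves from the wider source element, so
    // it is enough to demand the sign bits of X and Y.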
39204    if (OriginalDemandedBits.isSignMask()) {
39205      APInt DemandedLHS, DemandedRHS;
39206      getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
39207
39208      KnownBits KnownLHS, KnownRHS;
39209      APInt SignMask = APInt::getSignMask(BitWidth * 2);
39210      if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
39211                               KnownLHS, TLO, Depth + 1))
39212        return true;
39213      if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
39214                               KnownRHS, TLO, Depth + 1))
39215        return true;
39216
39217      // Attempt to avoid multi-use ops if we don't need anything from them.
39218      SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
39219          Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
39220      SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
39221          Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
39222      if (DemandedOp0 || DemandedOp1) {
39223        SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
39224        SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
39225        return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
39226      }
39227    }
39228    // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
39229    break;
39230  case X86ISD::VBROADCAST: {
39231    SDValue Src = Op.getOperand(0);
39232    MVT SrcVT = Src.getSimpleValueType();
39233    APInt DemandedElts = APInt::getOneBitSet(
39234        SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
39235    if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
39236                             TLO, Depth + 1))
39237      return true;
39238    // If we don't need the upper bits, attempt to narrow the broadcast source.
39239    // Don't attempt this on AVX512 as it might affect broadcast folding.
39240    // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
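    // e.g. if only the low 32 bits of each i64 element are demanded:
    //   (v2i64 VBROADCAST (i64 X))
    //     --> (v2i64 bitcast (v4i32 VBROADCAST (i32 trunc X)))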
39241    if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
39242        OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
39243      MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
39244      SDValue NewSrc =
39245          TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
39246      MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
39247      SDValue NewBcst =
39248          TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
39249      return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
39250    }
39251    break;
39252  }
39253  case X86ISD::PCMPGT:
39254    // icmp sgt(0, R) == ashr(R, BitWidth-1).
39255    // iff we only need the sign bit then we can use R directly.
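    // e.g. (MOVMSK (PCMPGT 0, X)) --> (MOVMSK X), as MOVMSK only demands the
    // sign bit of each element.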
39256    if (OriginalDemandedBits.isSignMask() &&
39257        ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39258      return TLO.CombineTo(Op, Op.getOperand(1));
39259    break;
39260  case X86ISD::MOVMSK: {
39261    SDValue Src = Op.getOperand(0);
39262    MVT SrcVT = Src.getSimpleValueType();
39263    unsigned SrcBits = SrcVT.getScalarSizeInBits();
39264    unsigned NumElts = SrcVT.getVectorNumElements();
39265
39266    // If we don't need the sign bits at all just return zero.
39267    if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
39268      return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39269
39270    // Only demand the vector elements of the sign bits we need.
39271    APInt KnownUndef, KnownZero;
39272    APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
39273    if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
39274                                   TLO, Depth + 1))
39275      return true;
39276
39277    Known.Zero = KnownZero.zextOrSelf(BitWidth);
39278    Known.Zero.setHighBits(BitWidth - NumElts);
39279
39280    // MOVMSK only uses the MSB from each vector element.
39281    KnownBits KnownSrc;
39282    APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
39283    if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
39284                             Depth + 1))
39285      return true;
39286
39287    if (KnownSrc.One[SrcBits - 1])
39288      Known.One.setLowBits(NumElts);
39289    else if (KnownSrc.Zero[SrcBits - 1])
39290      Known.Zero.setLowBits(NumElts);
39291
39292    // Attempt to avoid multi-use ops if we don't need anything from them.
39293    if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
39294            Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
39295      return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
39296    return false;
39297  }
39298  case X86ISD::BEXTR:
39299  case X86ISD::BEXTRI: {
39300    SDValue Op0 = Op.getOperand(0);
39301    SDValue Op1 = Op.getOperand(1);
39302
39303    // Only the bottom 16 bits of the control operand are required.
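    // Bits[7:0] of the control are the start index, bits[15:8] the length.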
39304    if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
39305      // NOTE: SimplifyDemandedBits won't do this for constants.
39306      uint64_t Val1 = Cst1->getZExtValue();
39307      uint64_t MaskedVal1 = Val1 & 0xFFFF;
39308      if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
39309        SDLoc DL(Op);
39310        return TLO.CombineTo(
39311            Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
39312                                TLO.DAG.getConstant(MaskedVal1, DL, VT)));
39313      }
39314
39315      unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
39316      unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
39317
39318      // If the length is 0, the result is 0.
39319      if (Length == 0) {
39320        Known.setAllZero();
39321        return false;
39322      }
39323
39324      if ((Shift + Length) <= BitWidth) {
39325        APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
39326        if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
39327          return true;
39328
39329        Known = Known.extractBits(Length, Shift);
39330        Known = Known.zextOrTrunc(BitWidth);
39331        return false;
39332      }
39333    } else {
39334      assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
39335      KnownBits Known1;
39336      APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
39337      if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
39338        return true;
39339
39340      // If the length is 0, replace with 0.
39341      KnownBits LengthBits = Known1.extractBits(8, 8);
39342      if (LengthBits.isZero())
39343        return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39344    }
39345
39346    break;
39347  }
39348  case X86ISD::PDEP: {
39349    SDValue Op0 = Op.getOperand(0);
39350    SDValue Op1 = Op.getOperand(1);
39351
39352    unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
39353    APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
39354
39355    // If the demanded bits have leading zeroes, we don't demand those from the
39356    // mask.
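    // e.g. PDEP deposits the low bits of Op0 at the set-bit positions of the
    // mask: PDEP(0b0111, 0b10101010) = 0b00101010. Mask bits above the
    // highest demanded result bit cannot affect a demanded bit.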
39357    if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
39358      return true;
39359
39360    // The number of possible 1s in the mask determines the number of LSBs of
39361    // operand 0 used. Undemanded bits from the mask don't matter so filter
39362    // them before counting.
39363    KnownBits Known2;
39364    uint64_t Count = (~Known.Zero & LoMask).countPopulation();
39365    APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
39366    if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
39367      return true;
39368
39369    // Zeroes are retained from the mask, but not ones.
39370    Known.One.clearAllBits();
39371    // The result will have at least as many trailing zeros as the non-mask
39372    // operand since bits can only map to the same or higher bit position.
39373    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
39374    return false;
39375  }
39376  }
39377
39378  return TargetLowering::SimplifyDemandedBitsForTargetNode(
39379      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
39380}
39381
39382SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39383    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
39384    SelectionDAG &DAG, unsigned Depth) const {
39385  int NumElts = DemandedElts.getBitWidth();
39386  unsigned Opc = Op.getOpcode();
39387  EVT VT = Op.getValueType();
39388
39389  switch (Opc) {
39390  case X86ISD::PINSRB:
39391  case X86ISD::PINSRW: {
39392    // If we don't demand the inserted element, return the base vector.
39393    SDValue Vec = Op.getOperand(0);
39394    auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39395    MVT VecVT = Vec.getSimpleValueType();
39396    if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
39397        !DemandedElts[CIdx->getZExtValue()])
39398      return Vec;
39399    break;
39400  }
39401  case X86ISD::VSHLI: {
39402    // If we are only demanding sign bits then we can use the shift source
39403    // directly.
39404    SDValue Op0 = Op.getOperand(0);
39405    unsigned ShAmt = Op.getConstantOperandVal(1);
39406    unsigned BitWidth = DemandedBits.getBitWidth();
39407    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
39408    unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
39409    if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39410      return Op0;
39411    break;
39412  }
39413  case X86ISD::VSRAI:
39414    // iff we only need the sign bit then we can use the source directly.
39415    // TODO: generalize where we only demand extended signbits.
39416    if (DemandedBits.isSignMask())
39417      return Op.getOperand(0);
39418    break;
39419  case X86ISD::PCMPGT:
39420    // icmp sgt(0, R) == ashr(R, BitWidth-1).
39421    // iff we only need the sign bit then we can use R directly.
39422    if (DemandedBits.isSignMask() &&
39423        ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39424      return Op.getOperand(1);
39425    break;
39426  }
39427
39428  APInt ShuffleUndef, ShuffleZero;
39429  SmallVector<int, 16> ShuffleMask;
39430  SmallVector<SDValue, 2> ShuffleOps;
39431  if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
39432                             ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
39433    // If all the demanded elts are from one operand and are inline,
39434    // then we can use the operand directly.
39435    int NumOps = ShuffleOps.size();
39436    if (ShuffleMask.size() == (unsigned)NumElts &&
39437        llvm::all_of(ShuffleOps, [VT](SDValue V) {
39438          return VT.getSizeInBits() == V.getValueSizeInBits();
39439        })) {
39440
39441      if (DemandedElts.isSubsetOf(ShuffleUndef))
39442        return DAG.getUNDEF(VT);
39443      if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
39444        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
39445
39446      // Bitmask that indicates which ops have only been accessed 'inline'.
39447      APInt IdentityOp = APInt::getAllOnesValue(NumOps);
39448      for (int i = 0; i != NumElts; ++i) {
39449        int M = ShuffleMask[i];
39450        if (!DemandedElts[i] || ShuffleUndef[i])
39451          continue;
39452        int OpIdx = M / NumElts;
39453        int EltIdx = M % NumElts;
39454        if (M < 0 || EltIdx != i) {
39455          IdentityOp.clearAllBits();
39456          break;
39457        }
39458        IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
39459        if (IdentityOp == 0)
39460          break;
39461      }
39462      assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
39463             "Multiple identity shuffles detected");
39464
39465      if (IdentityOp != 0)
39466        return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
39467    }
39468  }
39469
39470  return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39471      Op, DemandedBits, DemandedElts, DAG, Depth);
39472}
39473
39474// Helper to peek through bitops/trunc/setcc to determine size of source vector.
39475// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
39476static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
39477                                      bool AllowTruncate) {
39478  switch (Src.getOpcode()) {
39479  case ISD::TRUNCATE:
39480    if (!AllowTruncate)
39481      return false;
39482    LLVM_FALLTHROUGH;
39483  case ISD::SETCC:
39484    return Src.getOperand(0).getValueSizeInBits() == Size;
39485  case ISD::AND:
39486  case ISD::XOR:
39487  case ISD::OR:
39488    return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
39489           checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
39490  }
39491  return false;
39492}
39493
39494// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
39495static unsigned getAltBitOpcode(unsigned Opcode) {
39496  switch (Opcode) {
39497  case ISD::AND: return X86ISD::FAND;
39498  case ISD::OR: return X86ISD::FOR;
39499  case ISD::XOR: return X86ISD::FXOR;
39500  case X86ISD::ANDNP: return X86ISD::FANDN;
39501  }
39502  llvm_unreachable("Unknown bitwise opcode");
39503}
39504
39505// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
39506static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
39507                                          const SDLoc &DL) {
39508  EVT SrcVT = Src.getValueType();
39509  if (SrcVT != MVT::v4i1)
39510    return SDValue();
39511
39512  switch (Src.getOpcode()) {
39513  case ISD::SETCC:
39514    if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
39515        ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
39516        cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
39517      SDValue Op0 = Src.getOperand(0);
39518      if (ISD::isNormalLoad(Op0.getNode()))
39519        return DAG.getBitcast(MVT::v4f32, Op0);
39520      if (Op0.getOpcode() == ISD::BITCAST &&
39521          Op0.getOperand(0).getValueType() == MVT::v4f32)
39522        return Op0.getOperand(0);
39523    }
39524    break;
39525  case ISD::AND:
39526  case ISD::XOR:
39527  case ISD::OR: {
39528    SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
39529    SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
39530    if (Op0 && Op1)
39531      return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
39532                         Op1);
39533    break;
39534  }
39535  }
39536  return SDValue();
39537}
39538
39539// Helper to push sign extension of vXi1 SETCC result through bitops.
39540static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
39541                                          SDValue Src, const SDLoc &DL) {
39542  switch (Src.getOpcode()) {
39543  case ISD::SETCC:
39544  case ISD::TRUNCATE:
39545    return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39546  case ISD::AND:
39547  case ISD::XOR:
39548  case ISD::OR:
39549    return DAG.getNode(
39550        Src.getOpcode(), DL, SExtVT,
39551        signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
39552        signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
39553  }
39554  llvm_unreachable("Unexpected node type for vXi1 sign extension");
39555}
39556
39557// Try to match patterns such as
39558// (i16 bitcast (v16i1 x))
39559// ->
39560// (i16 movmsk (v16i8 sext (v16i1 x)))
39561// before the illegal vector is scalarized on subtargets that don't have legal
39562// vxi1 types.
39563static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
39564                                  const SDLoc &DL,
39565                                  const X86Subtarget &Subtarget) {
39566  EVT SrcVT = Src.getValueType();
39567  if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
39568    return SDValue();
39569
39570  // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
39571  // legalization destroys the v4i32 type.
39572  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
39573    if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
39574      V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
39575                      DAG.getBitcast(MVT::v4f32, V));
39576      return DAG.getZExtOrTrunc(V, DL, VT);
39577    }
39578  }
39579
39580  // If the input is a truncate from v16i8, v32i8 or v64i8, use a movmskb even
39581  // with avx512. This will be better than truncating to vXi1 and
39582  // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
39583  // vpcmpeqb/vpcmpgtb.
39584  bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
39585                      (Src.getOperand(0).getValueType() == MVT::v16i8 ||
39586                       Src.getOperand(0).getValueType() == MVT::v32i8 ||
39587                       Src.getOperand(0).getValueType() == MVT::v64i8);
39588
39589  // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
39590  // directly with vpmovmskb/vmovmskps/vmovmskpd.
39591  if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
39592      cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
39593      ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
39594    EVT CmpVT = Src.getOperand(0).getValueType();
39595    EVT EltVT = CmpVT.getVectorElementType();
39596    if (CmpVT.getSizeInBits() <= 256 &&
39597        (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
39598      PreferMovMsk = true;
39599  }
39600
39601  // With AVX512 vxi1 types are legal and we prefer using k-regs.
39602  // MOVMSK is supported in SSE2 or later.
39603  if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
39604    return SDValue();
39605
39606  // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
39607  // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
39608  // v8i16 and v16i16.
39609  // For these two cases, we can shuffle the upper element bytes to a
39610  // consecutive sequence at the start of the vector and treat the results as
39611  // v16i8 or v32i8, and for the v8i16 case this is the preferable solution.
39612  // However, for v16i16 this is not the case: the shuffle is expensive, so we
39613  // avoid sign-extending to this type entirely.
39614  // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
39615  // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
39616  MVT SExtVT;
39617  bool PropagateSExt = false;
39618  switch (SrcVT.getSimpleVT().SimpleTy) {
39619  default:
39620    return SDValue();
39621  case MVT::v2i1:
39622    SExtVT = MVT::v2i64;
39623    break;
39624  case MVT::v4i1:
39625    SExtVT = MVT::v4i32;
39626    // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
39627    // sign-extend to a 256-bit operation to avoid truncation.
39628    if (Subtarget.hasAVX() &&
39629        checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
39630      SExtVT = MVT::v4i64;
39631      PropagateSExt = true;
39632    }
39633    break;
39634  case MVT::v8i1:
39635    SExtVT = MVT::v8i16;
39636    // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
39637    // sign-extend to a 256-bit operation to match the compare.
39638    // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
39639    // 256-bit because the shuffle is cheaper than sign extending the result of
39640    // the compare.
39641    if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
39642                               checkBitcastSrcVectorSize(Src, 512, true))) {
39643      SExtVT = MVT::v8i32;
39644      PropagateSExt = true;
39645    }
39646    break;
39647  case MVT::v16i1:
39648    SExtVT = MVT::v16i8;
39649    // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
39650    // it is not profitable to sign-extend to 256-bit because this will
39651    // require an extra cross-lane shuffle which is more expensive than
39652    // truncating the result of the compare to 128-bits.
39653    break;
39654  case MVT::v32i1:
39655    SExtVT = MVT::v32i8;
39656    break;
39657  case MVT::v64i1:
39658    // If we have AVX512F but not AVX512BW, the input must be a truncate from
39659    // v64i8 (checked earlier), so split the input and make two pmovmskbs.
39660    if (Subtarget.hasAVX512()) {
39661      if (Subtarget.hasBWI())
39662        return SDValue();
39663      SExtVT = MVT::v64i8;
39664      break;
39665    }
39666    // Split if this is a <64 x i8> comparison result.
39667    if (checkBitcastSrcVectorSize(Src, 512, false)) {
39668      SExtVT = MVT::v64i8;
39669      break;
39670    }
39671    return SDValue();
39672  }
39673
39674  SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
39675                            : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39676
39677  if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
39678    V = getPMOVMSKB(DL, V, DAG, Subtarget);
39679  } else {
39680    if (SExtVT == MVT::v8i16)
39681      V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
39682                      DAG.getUNDEF(MVT::v8i16));
39683    V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
39684  }
39685
39686  EVT IntVT =
39687      EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
39688  V = DAG.getZExtOrTrunc(V, DL, IntVT);
39689  return DAG.getBitcast(VT, V);
39690}
39691
39692// Convert a vXi1 constant build vector to the same width scalar integer.
39693static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
39694  EVT SrcVT = Op.getValueType();
39695  assert(SrcVT.getVectorElementType() == MVT::i1 &&
39696         "Expected a vXi1 vector");
39697  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
39698         "Expected a constant build vector");
39699
39700  APInt Imm(SrcVT.getVectorNumElements(), 0);
39701  for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
39702    SDValue In = Op.getOperand(Idx);
39703    if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
39704      Imm.setBit(Idx);
39705  }
39706  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
39707  return DAG.getConstant(Imm, SDLoc(Op), IntVT);
39708}
39709
39710static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
39711                                           TargetLowering::DAGCombinerInfo &DCI,
39712                                           const X86Subtarget &Subtarget) {
39713  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
39714
39715  if (!DCI.isBeforeLegalizeOps())
39716    return SDValue();
39717
39718  // Only do this if we have k-registers.
39719  if (!Subtarget.hasAVX512())
39720    return SDValue();
39721
39722  EVT DstVT = N->getValueType(0);
39723  SDValue Op = N->getOperand(0);
39724  EVT SrcVT = Op.getValueType();
39725
39726  if (!Op.hasOneUse())
39727    return SDValue();
39728
39729  // Look for logic ops.
39730  if (Op.getOpcode() != ISD::AND &&
39731      Op.getOpcode() != ISD::OR &&
39732      Op.getOpcode() != ISD::XOR)
39733    return SDValue();
39734
39735  // Make sure we have a bitcast between mask registers and a scalar type.
39736  if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
39737        DstVT.isScalarInteger()) &&
39738      !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
39739        SrcVT.isScalarInteger()))
39740    return SDValue();
39741
39742  SDValue LHS = Op.getOperand(0);
39743  SDValue RHS = Op.getOperand(1);
39744
39745  if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
39746      LHS.getOperand(0).getValueType() == DstVT)
39747    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
39748                       DAG.getBitcast(DstVT, RHS));
39749
39750  if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
39751      RHS.getOperand(0).getValueType() == DstVT)
39752    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
39753                       DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
39754
39755  // If the RHS is a vXi1 build vector, this is a good reason to flip too.
39756  // Most of these have to move a constant from the scalar domain anyway.
39757  if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
39758    RHS = combinevXi1ConstantToInteger(RHS, DAG);
39759    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
39760                       DAG.getBitcast(DstVT, LHS), RHS);
39761  }
39762
39763  return SDValue();
39764}
39765
39766static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
39767                                    const X86Subtarget &Subtarget) {
39768  SDLoc DL(BV);
39769  unsigned NumElts = BV->getNumOperands();
39770  SDValue Splat = BV->getSplatValue();
39771
39772  // Build MMX element from integer GPR or SSE float values.
39773  auto CreateMMXElement = [&](SDValue V) {
39774    if (V.isUndef())
39775      return DAG.getUNDEF(MVT::x86mmx);
39776    if (V.getValueType().isFloatingPoint()) {
39777      if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
39778        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
39779        V = DAG.getBitcast(MVT::v2i64, V);
39780        return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
39781      }
39782      V = DAG.getBitcast(MVT::i32, V);
39783    } else {
39784      V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
39785    }
39786    return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
39787  };
39788
39789  // Convert build vector ops to MMX data in the bottom elements.
39790  SmallVector<SDValue, 8> Ops;
39791
39792  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39793
39794  // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
39795  if (Splat) {
39796    if (Splat.isUndef())
39797      return DAG.getUNDEF(MVT::x86mmx);
39798
39799    Splat = CreateMMXElement(Splat);
39800
39801    if (Subtarget.hasSSE1()) {
39802      // Unpack v8i8 to splat i8 elements to lowest 16-bits.
39803      if (NumElts == 8)
39804        Splat = DAG.getNode(
39805            ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
39806            DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
39807                                  TLI.getPointerTy(DAG.getDataLayout())),
39808            Splat, Splat);
39809
39810      // Use PSHUFW to repeat 16-bit elements.
39811      unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
39812      return DAG.getNode(
39813          ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
39814          DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
39815                                TLI.getPointerTy(DAG.getDataLayout())),
39816          Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
39817    }
39818    Ops.append(NumElts, Splat);
39819  } else {
39820    for (unsigned i = 0; i != NumElts; ++i)
39821      Ops.push_back(CreateMMXElement(BV->getOperand(i)));
39822  }
39823
39824  // Use tree of PUNPCKLs to build up general MMX vector.
39825  while (Ops.size() > 1) {
39826    unsigned NumOps = Ops.size();
39827    unsigned IntrinOp =
39828        (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
39829                     : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
39830                                    : Intrinsic::x86_mmx_punpcklbw));
39831    SDValue Intrin = DAG.getTargetConstant(
39832        IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
39833    for (unsigned i = 0; i != NumOps; i += 2)
39834      Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
39835                               Ops[i], Ops[i + 1]);
39836    Ops.resize(NumOps / 2);
39837  }
39838
39839  return Ops[0];
39840}
39841
39842// Recursive function that attempts to find if a bool vector node was originally
39843// a vector/float/double that got truncated/extended/bitcast to/from a scalar
39844// integer. If so, replace the scalar ops with bool vector equivalents back down
39845// the chain.
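// e.g. with legal vXi1 types (AVX512):
//   (v8i1 bitcast (i8 trunc (i16 bitcast (v16i1 X))))
//     --> (v8i1 extract_subvector (v16i1 X), 0)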
39846static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
39847                                          SelectionDAG &DAG,
39848                                          const X86Subtarget &Subtarget) {
39849  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39850  unsigned Opc = V.getOpcode();
39851  switch (Opc) {
39852  case ISD::BITCAST: {
39853    // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
39854    SDValue Src = V.getOperand(0);
39855    EVT SrcVT = Src.getValueType();
39856    if (SrcVT.isVector() || SrcVT.isFloatingPoint())
39857      return DAG.getBitcast(VT, Src);
39858    break;
39859  }
39860  case ISD::TRUNCATE: {
39861    // If we find a suitable source, a truncated scalar becomes a subvector.
39862    SDValue Src = V.getOperand(0);
39863    EVT NewSrcVT =
39864        EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
39865    if (TLI.isTypeLegal(NewSrcVT))
39866      if (SDValue N0 =
39867              combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
39868        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
39869                           DAG.getIntPtrConstant(0, DL));
39870    break;
39871  }
39872  case ISD::ANY_EXTEND:
39873  case ISD::ZERO_EXTEND: {
39874    // If we find a suitable source, an extended scalar becomes a subvector.
39875    SDValue Src = V.getOperand(0);
39876    EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
39877                                    Src.getScalarValueSizeInBits());
39878    if (TLI.isTypeLegal(NewSrcVT))
39879      if (SDValue N0 =
39880              combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
39881        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
39882                           Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
39883                                                  : DAG.getConstant(0, DL, VT),
39884                           N0, DAG.getIntPtrConstant(0, DL));
39885    break;
39886  }
39887  case ISD::OR: {
39888    // If we find suitable sources, we can just move an OR to the vector domain.
39889    SDValue Src0 = V.getOperand(0);
39890    SDValue Src1 = V.getOperand(1);
39891    if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
39892      if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
39893        return DAG.getNode(Opc, DL, VT, N0, N1);
39894    break;
39895  }
39896  case ISD::SHL: {
39897    // If we find a suitable source, a SHL becomes a KSHIFTL.
39898    SDValue Src0 = V.getOperand(0);
39899    if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
39900        ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
39901      break;
39902
39903    if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
39904      if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
39905        return DAG.getNode(
39906            X86ISD::KSHIFTL, DL, VT, N0,
39907            DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
39908    break;
39909  }
39910  }
39911  return SDValue();
39912}
39913
39914static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
39915                              TargetLowering::DAGCombinerInfo &DCI,
39916                              const X86Subtarget &Subtarget) {
39917  SDValue N0 = N->getOperand(0);
39918  EVT VT = N->getValueType(0);
39919  EVT SrcVT = N0.getValueType();
39920  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39921
39922  // Try to match patterns such as
39923  // (i16 bitcast (v16i1 x))
39924  // ->
39925  // (i16 movmsk (v16i8 sext (v16i1 x)))
39926  // before the setcc result is scalarized on subtargets that don't have legal
39927  // vxi1 types.
39928  if (DCI.isBeforeLegalize()) {
39929    SDLoc dl(N);
39930    if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
39931      return V;
39932
39933    // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
39934    // type, widen both sides to avoid a trip through memory.
39935    if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
39936        Subtarget.hasAVX512()) {
39937      N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
39938      N0 = DAG.getBitcast(MVT::v8i1, N0);
39939      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
39940                         DAG.getIntPtrConstant(0, dl));
39941    }
39942
39943    // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
39944    // type, widen both sides to avoid a trip through memory.
39945    if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
39946        Subtarget.hasAVX512()) {
39947      // Use zeros for the widening if we already have some zeroes. This can
39948      // allow SimplifyDemandedBits to remove scalar ANDs that may be
39949      // downstream of this.
39950      // FIXME: It might make sense to detect a concat_vectors with a mix of
39951      // zeroes and undef and turn it into insert_subvector for i1 vectors as
39952      // a separate combine. What we can't do is canonicalize the operands of
39953      // such a concat or we'll get into a loop with SimplifyDemandedBits.
39954      if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
39955        SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
39956        if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
39957          SrcVT = LastOp.getValueType();
39958          unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
39959          SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
39960          Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
39961          N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
39962          N0 = DAG.getBitcast(MVT::i8, N0);
39963          return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
39964        }
39965      }
39966
39967      unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
39968      SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
39969      Ops[0] = N0;
39970      N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
39971      N0 = DAG.getBitcast(MVT::i8, N0);
39972      return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
39973    }
39974  } else {
39975    // If we're bitcasting from iX to vXi1, see if the integer originally
39976    // began as a vXi1 and whether we can remove the bitcast entirely.
39977    if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
39978        SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
39979      if (SDValue V =
39980              combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
39981        return V;
39982    }
39983  }
39984
39985  // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
39986  // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
39987  // due to insert_subvector legalization on KNL. By promoting the copy to i16
39988  // we can help with known bits propagation from the vXi1 domain to the
39989  // scalar domain.
39990  if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
39991      !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39992      N0.getOperand(0).getValueType() == MVT::v16i1 &&
39993      isNullConstant(N0.getOperand(1)))
39994    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
39995                       DAG.getBitcast(MVT::i16, N0.getOperand(0)));
39996
39997  // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
39998  // and the vbroadcast_load are both integer or both fp. In some cases this
39999  // will remove the bitcast entirely.
40000  if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
40001       VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
40002    auto *BCast = cast<MemIntrinsicSDNode>(N0);
40003    unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
40004    unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
40005    // Don't swap i8/i16 since we don't have fp types of that size.
40006    if (MemSize >= 32) {
40007      MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
40008                                       : MVT::getIntegerVT(MemSize);
40009      MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
40010                                        : MVT::getIntegerVT(SrcVTSize);
40011      LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
40012
40013      SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
40014      SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
40015      SDValue ResNode =
40016          DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
40017                                  MemVT, BCast->getMemOperand());
40018      DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
40019      return DAG.getBitcast(VT, ResNode);
40020    }
40021  }
40022
40023  // Since MMX types are special and don't usually play with other vector types,
40024  // it's better to handle them early to be sure we emit efficient code by
40025  // avoiding store-load conversions.
40026  if (VT == MVT::x86mmx) {
40027    // Detect MMX constant vectors.
40028    APInt UndefElts;
40029    SmallVector<APInt, 1> EltBits;
40030    if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
40031      SDLoc DL(N0);
40032      // Handle zero-extension of i32 with MOVD.
40033      if (EltBits[0].countLeadingZeros() >= 32)
40034        return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
40035                           DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
40036      // Else, bitcast to a double.
40037      // TODO - investigate supporting sext 32-bit immediates on x86_64.
40038      APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
40039      return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
40040    }
40041
40042    // Detect bitcasts to x86mmx low word.
40043    if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40044        (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
40045        N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
40046      bool LowUndef = true, AllUndefOrZero = true;
40047      for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
40048        SDValue Op = N0.getOperand(i);
40049        LowUndef &= Op.isUndef() || (i >= e/2);
40050        AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
40051      }
40052      if (AllUndefOrZero) {
40053        SDValue N00 = N0.getOperand(0);
40054        SDLoc dl(N00);
40055        N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
40056                       : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
40057        return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
40058      }
40059    }
40060
40061    // Detect bitcasts of 64-bit build vectors and convert to a
40062    // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
40063    // lowest element.
40064    if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40065        (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
40066         SrcVT == MVT::v8i8))
40067      return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
40068
40069    // Detect bitcasts between element or subvector extraction to x86mmx.
40070    if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
40071         N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
40072        isNullConstant(N0.getOperand(1))) {
40073      SDValue N00 = N0.getOperand(0);
40074      if (N00.getValueType().is128BitVector())
40075        return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
40076                           DAG.getBitcast(MVT::v2i64, N00));
40077    }
40078
40079    // Detect bitcasts from FP_TO_SINT to x86mmx.
40080    if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
40081      SDLoc DL(N0);
40082      SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
40083                                DAG.getUNDEF(MVT::v2i32));
40084      return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
40085                         DAG.getBitcast(MVT::v2i64, Res));
40086    }
40087  }
40088
40089  // Try to remove a bitcast of a constant vXi1 vector. We have to legalize
40090  // most of these to scalar anyway.
40091  if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
40092      SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
40093      ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
40094    return combinevXi1ConstantToInteger(N0, DAG);
40095  }
40096
40097  if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40098      VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40099      isa<ConstantSDNode>(N0)) {
40100    auto *C = cast<ConstantSDNode>(N0);
40101    if (C->isAllOnesValue())
40102      return DAG.getConstant(1, SDLoc(N0), VT);
40103    if (C->isNullValue())
40104      return DAG.getConstant(0, SDLoc(N0), VT);
40105  }
40106
40107  // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
40108  // Turn it into a sign bit compare that produces a k-register. This avoids
40109  // a trip through a GPR.
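  // e.g. (v8i1 bitcast (i8 trunc (MOVMSK (v8f32 X))))
  //        --> (v8i1 setcc (v8i32 bitcast X), 0, setlt)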
40110  if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40111      VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40112      isPowerOf2_32(VT.getVectorNumElements())) {
40113    unsigned NumElts = VT.getVectorNumElements();
40114    SDValue Src = N0;
40115
40116    // Peek through truncate.
40117    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
40118      Src = N0.getOperand(0);
40119
40120    if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
40121      SDValue MovmskIn = Src.getOperand(0);
40122      MVT MovmskVT = MovmskIn.getSimpleValueType();
40123      unsigned MovMskElts = MovmskVT.getVectorNumElements();
40124
40125      // We allow extra bits of the movmsk to be used since they are known zero.
40126      // We can't convert a VPMOVMSKB without avx512bw.
40127      if (MovMskElts <= NumElts &&
40128          (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
40129        EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
40130        MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
40131        SDLoc dl(N);
40132        MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
40133        SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
40134                                   DAG.getConstant(0, dl, IntVT), ISD::SETLT);
40135        if (EVT(CmpVT) == VT)
40136          return Cmp;
40137
40138        // Pad with zeroes up to original VT to replace the zeroes that were
40139        // being used from the MOVMSK.
40140        unsigned NumConcats = NumElts / MovMskElts;
40141        SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
40142        Ops[0] = Cmp;
40143        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
40144      }
40145    }
40146  }
40147
40148  // Try to remove bitcasts from input and output of mask arithmetic to
40149  // remove GPR<->K-register crossings.
40150  if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
40151    return V;
40152
40153  // Convert a bitcasted integer logic operation that has one bitcasted
40154  // floating-point operand into a floating-point logic operation. This may
40155  // create a load of a constant, but that is cheaper than materializing the
40156  // constant in an integer register and transferring it to an SSE register or
40157  // transferring the SSE operand to integer register and back.
40158  unsigned FPOpcode;
40159  switch (N0.getOpcode()) {
40160    case ISD::AND: FPOpcode = X86ISD::FAND; break;
40161    case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
40162    case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
40163    default: return SDValue();
40164  }
40165
40166  // Check if we have a bitcast from another integer type as well.
40167  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
40168        (Subtarget.hasSSE2() && VT == MVT::f64) ||
40169        (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
40170         TLI.isTypeLegal(VT))))
40171    return SDValue();
40172
40173  SDValue LogicOp0 = N0.getOperand(0);
40174  SDValue LogicOp1 = N0.getOperand(1);
40175  SDLoc DL0(N0);
40176
40177  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
40178  if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
40179      LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
40180      LogicOp0.getOperand(0).getValueType() == VT &&
40181      !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
40182    SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
40183    unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40184    return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
40185  }
40186  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
40187  if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
40188      LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
40189      LogicOp1.getOperand(0).getValueType() == VT &&
40190      !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
40191    SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
40192    unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40193    return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
40194  }
40195
40196  return SDValue();
40197}
40198
// Given an ABS node, detect the following pattern:
40200// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
40201// This is useful as it is the input into a SAD pattern.
40202static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
40203  SDValue AbsOp1 = Abs->getOperand(0);
40204  if (AbsOp1.getOpcode() != ISD::SUB)
40205    return false;
40206
40207  Op0 = AbsOp1.getOperand(0);
40208  Op1 = AbsOp1.getOperand(1);
40209
40210  // Check if the operands of the sub are zero-extended from vectors of i8.
40211  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
40212      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
40213      Op1.getOpcode() != ISD::ZERO_EXTEND ||
40214      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
40215    return false;
40216
40217  return true;
40218}
40219
40220// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
40221// to these zexts.
40222static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
40223                            const SDValue &Zext1, const SDLoc &DL,
40224                            const X86Subtarget &Subtarget) {
40225  // Find the appropriate width for the PSADBW.
40226  EVT InVT = Zext0.getOperand(0).getValueType();
40227  unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
40228
40229  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
40230  // fill in the missing vector elements with 0.
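  // For example (illustrative), a v4i8 input is concatenated with three v4i8
  // zero vectors to form the v16i8 PSADBW operand.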
40231  unsigned NumConcat = RegSize / InVT.getSizeInBits();
40232  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
40233  Ops[0] = Zext0.getOperand(0);
40234  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
40235  SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40236  Ops[0] = Zext1.getOperand(0);
40237  SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40238
40239  // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
40240  auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
40241                          ArrayRef<SDValue> Ops) {
40242    MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
40243    return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
40244  };
40245  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
40246  return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
40247                          PSADBWBuilder);
40248}
40249
// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
40251// PHMINPOSUW.
40252static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
40253                                      const X86Subtarget &Subtarget) {
40254  // Bail without SSE41.
40255  if (!Subtarget.hasSSE41())
40256    return SDValue();
40257
40258  EVT ExtractVT = Extract->getValueType(0);
40259  if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
40260    return SDValue();
40261
40262  // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
40263  ISD::NodeType BinOp;
40264  SDValue Src = DAG.matchBinOpReduction(
40265      Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
40266  if (!Src)
40267    return SDValue();
40268
40269  EVT SrcVT = Src.getValueType();
40270  EVT SrcSVT = SrcVT.getScalarType();
40271  if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
40272    return SDValue();
40273
40274  SDLoc DL(Extract);
40275  SDValue MinPos = Src;
40276
40277  // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
40278  while (SrcVT.getSizeInBits() > 128) {
40279    SDValue Lo, Hi;
40280    std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
40281    SrcVT = Lo.getValueType();
40282    MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
40283  }
40284  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
40285          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
40286         "Unexpected value type");
40287
  // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
  // to flip the value accordingly.
40290  SDValue Mask;
40291  unsigned MaskEltsBits = ExtractVT.getSizeInBits();
40292  if (BinOp == ISD::SMAX)
40293    Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
40294  else if (BinOp == ISD::SMIN)
40295    Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
40296  else if (BinOp == ISD::UMAX)
40297    Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
40298
40299  if (Mask)
40300    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40301
  // For v16i8 cases we need to perform UMIN on pairs of byte elements,
  // shuffling each upper element down and inserting zeros. This means that
  // the v16i8 UMIN will leave the upper byte of each pair as zero, performing
  // the zero-extension needed for the PHMINPOS.
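  // Viewed as v8i16, each lane then holds zext(umin(lo byte, hi byte)), which
  // is what PHMINPOSUW expects.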
40306  if (ExtractVT == MVT::i8) {
40307    SDValue Upper = DAG.getVectorShuffle(
40308        SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
40309        {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
40310    MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
40311  }
40312
  // Perform the PHMINPOS on a v8i16 vector.
40314  MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
40315  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
40316  MinPos = DAG.getBitcast(SrcVT, MinPos);
40317
40318  if (Mask)
40319    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40320
40321  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
40322                     DAG.getIntPtrConstant(0, DL));
40323}
40324
// Attempt to replace an all_of/any_of/parity style horizontal reduction with
// a MOVMSK.
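// In all of the cases below, the per-lane predicate is first condensed into a
// scalar bitmask (via MOVMSK or a vXi1 bitcast); any_of then compares that
// mask against zero, all_of compares it against the all-lanes mask, and
// parity takes the parity of the mask bits.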
40326static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
40327                                         const X86Subtarget &Subtarget) {
40328  // Bail without SSE2.
40329  if (!Subtarget.hasSSE2())
40330    return SDValue();
40331
40332  EVT ExtractVT = Extract->getValueType(0);
40333  unsigned BitWidth = ExtractVT.getSizeInBits();
40334  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
40335      ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
40336    return SDValue();
40337
40338  // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
40339  ISD::NodeType BinOp;
40340  SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
40341  if (!Match && ExtractVT == MVT::i1)
40342    Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
40343  if (!Match)
40344    return SDValue();
40345
40346  // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
40347  // which we can't support here for now.
40348  if (Match.getScalarValueSizeInBits() != BitWidth)
40349    return SDValue();
40350
40351  SDValue Movmsk;
40352  SDLoc DL(Extract);
40353  EVT MatchVT = Match.getValueType();
40354  unsigned NumElts = MatchVT.getVectorNumElements();
40355  unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
40356  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40357
40358  if (ExtractVT == MVT::i1) {
40359    // Special case for (pre-legalization) vXi1 reductions.
40360    if (NumElts > 64 || !isPowerOf2_32(NumElts))
40361      return SDValue();
40362    if (TLI.isTypeLegal(MatchVT)) {
40363      // If this is a legal AVX512 predicate type then we can just bitcast.
40364      EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40365      Movmsk = DAG.getBitcast(MovmskVT, Match);
40366    } else {
40367      // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
40368      // PCMPEQQ (SSE41+), use PCMPEQD instead.
40369      if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
40370          Match.getOpcode() == ISD::SETCC &&
40371          ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
40372          cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
40373              ISD::CondCode::SETEQ) {
40374        SDValue Vec = Match.getOperand(0);
40375        if (Vec.getValueType().getScalarType() == MVT::i64 &&
40376            (2 * NumElts) <= MaxElts) {
40377          NumElts *= 2;
40378          EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
40379          MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
40380          Match = DAG.getSetCC(
40381              DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
40382              DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
40383        }
40384      }
40385
40386      // Use combineBitcastvxi1 to create the MOVMSK.
40387      while (NumElts > MaxElts) {
40388        SDValue Lo, Hi;
40389        std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40390        Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40391        NumElts /= 2;
40392      }
40393      EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40394      Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
40395    }
40396    if (!Movmsk)
40397      return SDValue();
40398    Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
40399  } else {
40400    // FIXME: Better handling of k-registers or 512-bit vectors?
40401    unsigned MatchSizeInBits = Match.getValueSizeInBits();
40402    if (!(MatchSizeInBits == 128 ||
40403          (MatchSizeInBits == 256 && Subtarget.hasAVX())))
40404      return SDValue();
40405
40406    // Make sure this isn't a vector of 1 element. The perf win from using
    // MOVMSK diminishes with fewer elements in the reduction, but it is
40408    // generally better to get the comparison over to the GPRs as soon as
40409    // possible to reduce the number of vector ops.
40410    if (Match.getValueType().getVectorNumElements() < 2)
40411      return SDValue();
40412
40413    // Check that we are extracting a reduction of all sign bits.
40414    if (DAG.ComputeNumSignBits(Match) != BitWidth)
40415      return SDValue();
40416
40417    if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
40418      SDValue Lo, Hi;
40419      std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40420      Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40421      MatchSizeInBits = Match.getValueSizeInBits();
40422    }
40423
40424    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
40425    MVT MaskSrcVT;
40426    if (64 == BitWidth || 32 == BitWidth)
40427      MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
40428                                   MatchSizeInBits / BitWidth);
40429    else
40430      MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
40431
40432    SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
40433    Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
40434    NumElts = MaskSrcVT.getVectorNumElements();
40435  }
40436  assert((NumElts <= 32 || NumElts == 64) &&
40437         "Not expecting more than 64 elements");
40438
40439  MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
40440  if (BinOp == ISD::XOR) {
40441    // parity -> (PARITY(MOVMSK X))
40442    SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
40443    return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
40444  }
40445
40446  SDValue CmpC;
40447  ISD::CondCode CondCode;
40448  if (BinOp == ISD::OR) {
40449    // any_of -> MOVMSK != 0
40450    CmpC = DAG.getConstant(0, DL, CmpVT);
40451    CondCode = ISD::CondCode::SETNE;
40452  } else {
40453    // all_of -> MOVMSK == ((1 << NumElts) - 1)
40454    CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
40455                           DL, CmpVT);
40456    CondCode = ISD::CondCode::SETEQ;
40457  }
40458
40459  // The setcc produces an i8 of 0/1, so extend that to the result width and
40460  // negate to get the final 0/-1 mask value.
40461  EVT SetccVT =
40462      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
40463  SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
40464  SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
40465  SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
40466  return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
40467}
40468
40469static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
40470                                      const X86Subtarget &Subtarget) {
40471  // PSADBW is only supported on SSE2 and up.
40472  if (!Subtarget.hasSSE2())
40473    return SDValue();
40474
40475  EVT ExtractVT = Extract->getValueType(0);
40476  // Verify the type we're extracting is either i32 or i64.
40477  // FIXME: Could support other types, but this is what we have coverage for.
40478  if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
40479    return SDValue();
40480
40481  EVT VT = Extract->getOperand(0).getValueType();
40482  if (!isPowerOf2_32(VT.getVectorNumElements()))
40483    return SDValue();
40484
40485  // Match shuffle + add pyramid.
40486  ISD::NodeType BinOp;
40487  SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
40488
  // The operand is expected to be zero-extended from i8
  // (verified in detectZextAbsDiff).
  // In order to convert to i64 and above, an additional any/zero/sign
  // extend is expected.
  // A zero extend from 32 bits has no mathematical effect on the result,
  // and a sign extend is effectively a zero extend
  // (it extends the sign bit, which is zero),
  // so it is correct to skip the sign/zero extend instruction.
40497  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
40498               Root.getOpcode() == ISD::ZERO_EXTEND ||
40499               Root.getOpcode() == ISD::ANY_EXTEND))
40500    Root = Root.getOperand(0);
40501
  // If there was a match, we want Root to be an ABS node that is the root of
  // an abs-diff pattern.
40504  if (!Root || Root.getOpcode() != ISD::ABS)
40505    return SDValue();
40506
  // Check whether we have an abs-diff pattern feeding into the ABS node.
40508  SDValue Zext0, Zext1;
40509  if (!detectZextAbsDiff(Root, Zext0, Zext1))
40510    return SDValue();
40511
40512  // Create the SAD instruction.
40513  SDLoc DL(Extract);
40514  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
40515
40516  // If the original vector was wider than 8 elements, sum over the results
40517  // in the SAD vector.
40518  unsigned Stages = Log2_32(VT.getVectorNumElements());
40519  EVT SadVT = SAD.getValueType();
40520  if (Stages > 3) {
40521    unsigned SadElems = SadVT.getVectorNumElements();
40522
    for (unsigned i = Stages - 3; i > 0; --i) {
      SmallVector<int, 16> Mask(SadElems, -1);
      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
40526        Mask[j] = MaskEnd + j;
40527
40528      SDValue Shuffle =
40529          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
40530      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
40531    }
40532  }
40533
40534  unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
40535  // Return the lowest ExtractSizeInBits bits.
40536  EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
40537                               SadVT.getSizeInBits() / ExtractSizeInBits);
40538  SAD = DAG.getBitcast(ResVT, SAD);
40539  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
40540                     Extract->getOperand(1));
40541}
40542
40543// Attempt to peek through a target shuffle and extract the scalar from the
40544// source.
40545static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
40546                                         TargetLowering::DAGCombinerInfo &DCI,
40547                                         const X86Subtarget &Subtarget) {
40548  if (DCI.isBeforeLegalizeOps())
40549    return SDValue();
40550
40551  SDLoc dl(N);
40552  SDValue Src = N->getOperand(0);
40553  SDValue Idx = N->getOperand(1);
40554
40555  EVT VT = N->getValueType(0);
40556  EVT SrcVT = Src.getValueType();
40557  EVT SrcSVT = SrcVT.getVectorElementType();
40558  unsigned SrcEltBits = SrcSVT.getSizeInBits();
40559  unsigned NumSrcElts = SrcVT.getVectorNumElements();
40560
40561  // Don't attempt this for boolean mask vectors or unknown extraction indices.
40562  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
40563    return SDValue();
40564
40565  const APInt &IdxC = N->getConstantOperandAPInt(1);
40566  if (IdxC.uge(NumSrcElts))
40567    return SDValue();
40568
40569  SDValue SrcBC = peekThroughBitcasts(Src);
40570
40571  // Handle extract(bitcast(broadcast(scalar_value))).
40572  if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
40573    SDValue SrcOp = SrcBC.getOperand(0);
40574    EVT SrcOpVT = SrcOp.getValueType();
40575    if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
40576        (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
40577      unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
40578      unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
      // TODO: Support non-zero offsets.
40580      if (Offset == 0) {
40581        SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
40582        SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
40583        return SrcOp;
40584      }
40585    }
40586  }
40587
40588  // If we're extracting a single element from a broadcast load and there are
40589  // no other users, just create a single load.
40590  if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
40591    auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
40592    unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
40593    if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
40594        VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
40595      SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
40596                                 MemIntr->getBasePtr(),
40597                                 MemIntr->getPointerInfo(),
40598                                 MemIntr->getOriginalAlign(),
40599                                 MemIntr->getMemOperand()->getFlags());
40600      DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
40601      return Load;
40602    }
40603  }
40604
40605  // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
40606  // TODO: Move to DAGCombine?
40607  if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
40608      SrcBC.getValueType().isInteger() &&
40609      (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
40610      SrcBC.getScalarValueSizeInBits() ==
40611          SrcBC.getOperand(0).getValueSizeInBits()) {
40612    unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
40613    if (IdxC.ult(Scale)) {
40614      unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
40615      SDValue Scl = SrcBC.getOperand(0);
40616      EVT SclVT = Scl.getValueType();
40617      if (Offset) {
40618        Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
40619                          DAG.getShiftAmountConstant(Offset, SclVT, dl));
40620      }
40621      Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
40622      Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
40623      return Scl;
40624    }
40625  }
40626
  // Handle extract(truncate(x)) for the 0th index.
40628  // TODO: Treat this as a faux shuffle?
40629  // TODO: When can we use this for general indices?
40630  if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
40631      (SrcVT.getSizeInBits() % 128) == 0) {
40632    Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
40633    MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
40634    return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
40635                       Idx);
40636  }
40637
  // We can only legally extract other elements from 128-bit vectors, and only
  // in certain circumstances depending on the SSE level.
40640  // TODO: Investigate float/double extraction if it will be just stored.
40641  auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
40642                                                 unsigned Idx) {
40643    EVT VecSVT = VecVT.getScalarType();
40644    if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
40645        (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
40646         VecSVT == MVT::i64)) {
40647      unsigned EltSizeInBits = VecSVT.getSizeInBits();
40648      unsigned NumEltsPerLane = 128 / EltSizeInBits;
40649      unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
40650      unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
40651      VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
40652      Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
40653      Idx &= (NumEltsPerLane - 1);
40654    }
40655    if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
40656        ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
40657      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
40658                         DAG.getBitcast(VecVT, Vec),
40659                         DAG.getIntPtrConstant(Idx, dl));
40660    }
40661    if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
40662        (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
40663      unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
40664      return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
40665                         DAG.getTargetConstant(Idx, dl, MVT::i8));
40666    }
40667    return SDValue();
40668  };
40669
40670  // Resolve the target shuffle inputs and mask.
40671  SmallVector<int, 16> Mask;
40672  SmallVector<SDValue, 2> Ops;
40673  if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
40674    return SDValue();
40675
40676  // Shuffle inputs must be the same size as the result.
40677  if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
40678        return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
40679      }))
40680    return SDValue();
40681
40682  // Attempt to narrow/widen the shuffle mask to the correct size.
40683  if (Mask.size() != NumSrcElts) {
40684    if ((NumSrcElts % Mask.size()) == 0) {
40685      SmallVector<int, 16> ScaledMask;
40686      int Scale = NumSrcElts / Mask.size();
40687      narrowShuffleMaskElts(Scale, Mask, ScaledMask);
40688      Mask = std::move(ScaledMask);
40689    } else if ((Mask.size() % NumSrcElts) == 0) {
      // Simplify Mask based on the demanded element.
40691      int ExtractIdx = (int)IdxC.getZExtValue();
40692      int Scale = Mask.size() / NumSrcElts;
40693      int Lo = Scale * ExtractIdx;
40694      int Hi = Scale * (ExtractIdx + 1);
40695      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
40696        if (i < Lo || Hi <= i)
40697          Mask[i] = SM_SentinelUndef;
40698
40699      SmallVector<int, 16> WidenedMask;
40700      while (Mask.size() > NumSrcElts &&
40701             canWidenShuffleElements(Mask, WidenedMask))
40702        Mask = std::move(WidenedMask);
40703    }
40704  }
40705
40706  // If narrowing/widening failed, see if we can extract+zero-extend.
40707  int ExtractIdx;
40708  EVT ExtractVT;
40709  if (Mask.size() == NumSrcElts) {
40710    ExtractIdx = Mask[IdxC.getZExtValue()];
40711    ExtractVT = SrcVT;
40712  } else {
40713    unsigned Scale = Mask.size() / NumSrcElts;
40714    if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
40715      return SDValue();
40716    unsigned ScaledIdx = Scale * IdxC.getZExtValue();
40717    if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
40718      return SDValue();
40719    ExtractIdx = Mask[ScaledIdx];
40720    EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
40721    ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
40722    assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
40723           "Failed to widen vector type");
40724  }
40725
40726  // If the shuffle source element is undef/zero then we can just accept it.
40727  if (ExtractIdx == SM_SentinelUndef)
40728    return DAG.getUNDEF(VT);
40729
40730  if (ExtractIdx == SM_SentinelZero)
40731    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
40732                                : DAG.getConstant(0, dl, VT);
40733
40734  SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
40735  ExtractIdx = ExtractIdx % Mask.size();
40736  if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
40737    return DAG.getZExtOrTrunc(V, dl, VT);
40738
40739  return SDValue();
40740}
40741
40742/// Extracting a scalar FP value from vector element 0 is free, so extract each
40743/// operand first, then perform the math as a scalar op.
40744static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
40745  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
40746  SDValue Vec = ExtElt->getOperand(0);
40747  SDValue Index = ExtElt->getOperand(1);
40748  EVT VT = ExtElt->getValueType(0);
40749  EVT VecVT = Vec.getValueType();
40750
40751  // TODO: If this is a unary/expensive/expand op, allow extraction from a
40752  // non-zero element because the shuffle+scalar op will be cheaper?
40753  if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
40754    return SDValue();
40755
40756  // Vector FP compares don't fit the pattern of FP math ops (propagate, not
40757  // extract, the condition code), so deal with those as a special-case.
40758  if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
40759    EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
40760    if (OpVT != MVT::f32 && OpVT != MVT::f64)
40761      return SDValue();
40762
40763    // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
40764    SDLoc DL(ExtElt);
40765    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
40766                               Vec.getOperand(0), Index);
40767    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
40768                               Vec.getOperand(1), Index);
40769    return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
40770  }
40771
40772  if (VT != MVT::f32 && VT != MVT::f64)
40773    return SDValue();
40774
40775  // Vector FP selects don't fit the pattern of FP math ops (because the
40776  // condition has a different type and we have to change the opcode), so deal
40777  // with those here.
40778  // FIXME: This is restricted to pre type legalization by ensuring the setcc
40779  // has i1 elements. If we loosen this we need to convert vector bool to a
40780  // scalar bool.
40781  if (Vec.getOpcode() == ISD::VSELECT &&
40782      Vec.getOperand(0).getOpcode() == ISD::SETCC &&
40783      Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
40784      Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
40785    // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
40786    SDLoc DL(ExtElt);
40787    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
40788                               Vec.getOperand(0).getValueType().getScalarType(),
40789                               Vec.getOperand(0), Index);
40790    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
40791                               Vec.getOperand(1), Index);
40792    SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
40793                               Vec.getOperand(2), Index);
40794    return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
40795  }
40796
40797  // TODO: This switch could include FNEG and the x86-specific FP logic ops
40798  // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
40799  // missed load folding and fma+fneg combining.
40800  switch (Vec.getOpcode()) {
40801  case ISD::FMA: // Begin 3 operands
40802  case ISD::FMAD:
40803  case ISD::FADD: // Begin 2 operands
40804  case ISD::FSUB:
40805  case ISD::FMUL:
40806  case ISD::FDIV:
40807  case ISD::FREM:
40808  case ISD::FCOPYSIGN:
40809  case ISD::FMINNUM:
40810  case ISD::FMAXNUM:
40811  case ISD::FMINNUM_IEEE:
40812  case ISD::FMAXNUM_IEEE:
40813  case ISD::FMAXIMUM:
40814  case ISD::FMINIMUM:
40815  case X86ISD::FMAX:
40816  case X86ISD::FMIN:
40817  case ISD::FABS: // Begin 1 operand
40818  case ISD::FSQRT:
40819  case ISD::FRINT:
40820  case ISD::FCEIL:
40821  case ISD::FTRUNC:
40822  case ISD::FNEARBYINT:
40823  case ISD::FROUND:
40824  case ISD::FFLOOR:
40825  case X86ISD::FRCP:
40826  case X86ISD::FRSQRT: {
40827    // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
40828    SDLoc DL(ExtElt);
40829    SmallVector<SDValue, 4> ExtOps;
40830    for (SDValue Op : Vec->ops())
40831      ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
40832    return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
40833  }
40834  default:
40835    return SDValue();
40836  }
40837  llvm_unreachable("All opcodes should return within switch");
40838}
40839
40840/// Try to convert a vector reduction sequence composed of binops and shuffles
40841/// into horizontal ops.
40842static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
40843                                     const X86Subtarget &Subtarget) {
40844  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
40845
  // We need at least SSE2 to do anything here.
40847  if (!Subtarget.hasSSE2())
40848    return SDValue();
40849
40850  ISD::NodeType Opc;
40851  SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
40852                                        {ISD::ADD, ISD::MUL, ISD::FADD}, true);
40853  if (!Rdx)
40854    return SDValue();
40855
40856  SDValue Index = ExtElt->getOperand(1);
40857  assert(isNullConstant(Index) &&
40858         "Reduction doesn't end in an extract from index 0");
40859
40860  EVT VT = ExtElt->getValueType(0);
40861  EVT VecVT = Rdx.getValueType();
40862  if (VecVT.getScalarType() != VT)
40863    return SDValue();
40864
40865  SDLoc DL(ExtElt);
40866
40867  // vXi8 mul reduction - promote to vXi16 mul reduction.
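  // Note: the i8 elements are widened into the low byte of i16 lanes by
  // unpacking with undef (the high bytes are garbage). That is fine here
  // because only the low byte of each product is ever used, and it depends
  // only on the low bytes of the operands.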
40868  if (Opc == ISD::MUL) {
40869    unsigned NumElts = VecVT.getVectorNumElements();
40870    if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
40871      return SDValue();
40872    if (VecVT.getSizeInBits() >= 128) {
40873      EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
40874      SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
40875      SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
40876      Lo = DAG.getBitcast(WideVT, Lo);
40877      Hi = DAG.getBitcast(WideVT, Hi);
40878      Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
40879      while (Rdx.getValueSizeInBits() > 128) {
40880        std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
40881        Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
40882      }
40883    } else {
40884      if (VecVT == MVT::v4i8)
40885        Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
40886                          DAG.getUNDEF(MVT::v4i8));
40887      Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
40888                        DAG.getUNDEF(MVT::v8i8));
40889      Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
40890      Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
40891    }
40892    if (NumElts >= 8)
40893      Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
40894                        DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
40895                                             {4, 5, 6, 7, -1, -1, -1, -1}));
40896    Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
40897                      DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
40898                                           {2, 3, -1, -1, -1, -1, -1, -1}));
40899    Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
40900                      DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
40901                                           {1, -1, -1, -1, -1, -1, -1, -1}));
40902    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
40903    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
40904  }
40905
  // vXi8 add reduction - sub-128-bit vector.
40907  if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
40908    if (VecVT == MVT::v4i8) {
40909      // Pad with zero.
40910      if (Subtarget.hasSSE41()) {
40911        Rdx = DAG.getBitcast(MVT::i32, Rdx);
40912        Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
40913                          DAG.getConstant(0, DL, MVT::v4i32), Rdx,
40914                          DAG.getIntPtrConstant(0, DL));
40915        Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
40916      } else {
40917        Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
40918                          DAG.getConstant(0, DL, VecVT));
40919      }
40920    }
40921    if (Rdx.getValueType() == MVT::v8i8) {
40922      // Pad with undef.
40923      Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
40924                        DAG.getUNDEF(MVT::v8i8));
40925    }
40926    Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
40927                      DAG.getConstant(0, DL, MVT::v16i8));
40928    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
40929    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
40930  }
40931
40932  // Must be a >=128-bit vector with pow2 elements.
40933  if ((VecVT.getSizeInBits() % 128) != 0 ||
40934      !isPowerOf2_32(VecVT.getVectorNumElements()))
40935    return SDValue();
40936
40937  // vXi8 add reduction - sum lo/hi halves then use PSADBW.
40938  if (VT == MVT::i8) {
40939    while (Rdx.getValueSizeInBits() > 128) {
40940      SDValue Lo, Hi;
40941      std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
40942      VecVT = Lo.getValueType();
40943      Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
40944    }
40945    assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
40946
40947    SDValue Hi = DAG.getVectorShuffle(
40948        MVT::v16i8, DL, Rdx, Rdx,
40949        {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
40950    Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
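    // PSADBW against a zero vector sums each group of 8 bytes into a 64-bit
    // lane, so after the high half has been folded into the low half above,
    // the i8 total ends up in the low byte of lane 0.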
40951    Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
40952                      getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
40953    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
40954    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
40955  }
40956
  // Only use (F)HADD opcodes if they aren't microcoded or if we're optimizing
  // for code size.
40958  if (!shouldUseHorizontalOp(true, DAG, Subtarget))
40959    return SDValue();
40960
40961  unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
40962
40963  // 256-bit horizontal instructions operate on 128-bit chunks rather than
40964  // across the whole vector, so we need an extract + hop preliminary stage.
40965  // This is the only step where the operands of the hop are not the same value.
40966  // TODO: We could extend this to handle 512-bit or even longer vectors.
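  // For example, for v8i32 the hadd of the two 128-bit halves yields a v4i32
  // whose lanes are pairwise sums drawn from both halves; the loop below then
  // finishes the reduction with two more hadds before lane 0 is extracted.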
40967  if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
40968      ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
40969    unsigned NumElts = VecVT.getVectorNumElements();
40970    SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
40971    SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
40972    Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
40973    VecVT = Rdx.getValueType();
40974  }
40975  if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
40976      !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
40977    return SDValue();
40978
40979  // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
40980  unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
40981  for (unsigned i = 0; i != ReductionSteps; ++i)
40982    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
40983
40984  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
40985}
40986
40987/// Detect vector gather/scatter index generation and convert it from being a
40988/// bunch of shuffles and extracts into a somewhat faster sequence.
40989/// For i686, the best sequence is apparently storing the value and loading
40990/// scalars back, while for x64 we should use 64-bit extracts and shifts.
40991static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
40992                                       TargetLowering::DAGCombinerInfo &DCI,
40993                                       const X86Subtarget &Subtarget) {
40994  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
40995    return NewOp;
40996
40997  SDValue InputVector = N->getOperand(0);
40998  SDValue EltIdx = N->getOperand(1);
40999  auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
41000
41001  EVT SrcVT = InputVector.getValueType();
41002  EVT VT = N->getValueType(0);
41003  SDLoc dl(InputVector);
41004  bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
41005  unsigned NumSrcElts = SrcVT.getVectorNumElements();
41006
41007  if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
41008    return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41009
41010  // Integer Constant Folding.
41011  if (CIdx && VT.isInteger()) {
41012    APInt UndefVecElts;
41013    SmallVector<APInt, 16> EltBits;
41014    unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
41015    if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
41016                                      EltBits, true, false)) {
41017      uint64_t Idx = CIdx->getZExtValue();
41018      if (UndefVecElts[Idx])
41019        return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41020      return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
41021                             dl, VT);
41022    }
41023  }
41024
41025  if (IsPextr) {
41026    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41027    if (TLI.SimplifyDemandedBits(
41028            SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
41029      return SDValue(N, 0);
41030
41031    // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
41032    if ((InputVector.getOpcode() == X86ISD::PINSRB ||
41033         InputVector.getOpcode() == X86ISD::PINSRW) &&
41034        InputVector.getOperand(2) == EltIdx) {
41035      assert(SrcVT == InputVector.getOperand(0).getValueType() &&
41036             "Vector type mismatch");
41037      SDValue Scl = InputVector.getOperand(1);
41038      Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
41039      return DAG.getZExtOrTrunc(Scl, dl, VT);
41040    }
41041
41042    // TODO - Remove this once we can handle the implicit zero-extension of
41043    // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
41044    // combineBasicSADPattern.
41045    return SDValue();
41046  }
41047
  // Detect mmx extraction of all bits as an i64. It works better as a
  // bitcast.
41049  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41050      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
41051    SDValue MMXSrc = InputVector.getOperand(0);
41052
41053    // The bitcast source is a direct mmx result.
41054    if (MMXSrc.getValueType() == MVT::x86mmx)
41055      return DAG.getBitcast(VT, InputVector);
41056  }
41057
41058  // Detect mmx to i32 conversion through a v2i32 elt extract.
41059  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41060      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
41061    SDValue MMXSrc = InputVector.getOperand(0);
41062
41063    // The bitcast source is a direct mmx result.
41064    if (MMXSrc.getValueType() == MVT::x86mmx)
41065      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
41066  }
41067
41068  // Check whether this extract is the root of a sum of absolute differences
41069  // pattern. This has to be done here because we really want it to happen
  // pre-legalization.
41071  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
41072    return SAD;
41073
41074  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
41075  if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
41076    return Cmp;
41077
41078  // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
41079  if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
41080    return MinMax;
41081
41082  // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
41083  if (SDValue V = combineArithReduction(N, DAG, Subtarget))
41084    return V;
41085
41086  if (SDValue V = scalarizeExtEltFP(N, DAG))
41087    return V;
41088
41089  // Attempt to extract a i1 element by using MOVMSK to extract the signbits
41090  // and then testing the relevant element.
41091  //
41092  // Note that we only combine extracts on the *same* result number, i.e.
41093  //   t0 = merge_values a0, a1, a2, a3
41094  //   i1 = extract_vector_elt t0, Constant:i64<2>
41095  //   i1 = extract_vector_elt t0, Constant:i64<3>
41096  // but not
41097  //   i1 = extract_vector_elt t0:1, Constant:i64<2>
41098  // since the latter would need its own MOVMSK.
41099  if (CIdx && SrcVT.getScalarType() == MVT::i1) {
41100    SmallVector<SDNode *, 16> BoolExtracts;
41101    unsigned ResNo = InputVector.getResNo();
41102    auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
41103      if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41104          isa<ConstantSDNode>(Use->getOperand(1)) &&
41105          Use->getOperand(0).getResNo() == ResNo &&
41106          Use->getValueType(0) == MVT::i1) {
41107        BoolExtracts.push_back(Use);
41108        return true;
41109      }
41110      return false;
41111    };
41112    if (all_of(InputVector->uses(), IsBoolExtract) &&
41113        BoolExtracts.size() > 1) {
41114      EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
41115      if (SDValue BC =
41116              combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
41117        for (SDNode *Use : BoolExtracts) {
41118          // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
41119          unsigned MaskIdx = Use->getConstantOperandVal(1);
41120          APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
41121          SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
41122          SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
41123          Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
41124          DCI.CombineTo(Use, Res);
41125        }
41126        return SDValue(N, 0);
41127      }
41128    }
41129  }
41130
41131  return SDValue();
41132}
41133
41134/// If a vector select has an operand that is -1 or 0, try to simplify the
41135/// select to a bitwise logic operation.
41136/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
41137static SDValue
41138combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
41139                                 TargetLowering::DAGCombinerInfo &DCI,
41140                                 const X86Subtarget &Subtarget) {
41141  SDValue Cond = N->getOperand(0);
41142  SDValue LHS = N->getOperand(1);
41143  SDValue RHS = N->getOperand(2);
41144  EVT VT = LHS.getValueType();
41145  EVT CondVT = Cond.getValueType();
41146  SDLoc DL(N);
41147  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41148
41149  if (N->getOpcode() != ISD::VSELECT)
41150    return SDValue();
41151
41152  assert(CondVT.isVector() && "Vector select expects a vector selector!");
41153
41154  // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
41155  // TODO: Can we assert that both operands are not zeros (because that should
41156  //       get simplified at node creation time)?
41157  bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
41158  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
41159
41160  // If both inputs are 0/undef, create a complete zero vector.
41161  // FIXME: As noted above this should be handled by DAGCombiner/getNode.
41162  if (TValIsAllZeros && FValIsAllZeros) {
41163    if (VT.isFloatingPoint())
41164      return DAG.getConstantFP(0.0, DL, VT);
41165    return DAG.getConstant(0, DL, VT);
41166  }
41167
41168  // To use the condition operand as a bitwise mask, it must have elements that
41169  // are the same size as the select elements. Ie, the condition operand must
41170  // have already been promoted from the IR select condition type <N x i1>.
41171  // Don't check if the types themselves are equal because that excludes
41172  // vector floating-point selects.
41173  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
41174    return SDValue();
41175
41176  // Try to invert the condition if true value is not all 1s and false value is
41177  // not all 0s. Only do this if the condition has one use.
41178  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
41179  if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
41180      // Check if the selector will be produced by CMPP*/PCMP*.
41181      Cond.getOpcode() == ISD::SETCC &&
41182      // Check if SETCC has already been promoted.
41183      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
41184          CondVT) {
41185    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
41186
41187    if (TValIsAllZeros || FValIsAllOnes) {
41188      SDValue CC = Cond.getOperand(2);
41189      ISD::CondCode NewCC = ISD::getSetCCInverse(
41190          cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
41191      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
41192                          NewCC);
41193      std::swap(LHS, RHS);
41194      TValIsAllOnes = FValIsAllOnes;
41195      FValIsAllZeros = TValIsAllZeros;
41196    }
41197  }
41198
41199  // Cond value must be 'sign splat' to be converted to a logical op.
41200  if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
41201    return SDValue();
41202
41203  // vselect Cond, 111..., 000... -> Cond
41204  if (TValIsAllOnes && FValIsAllZeros)
41205    return DAG.getBitcast(VT, Cond);
41206
41207  if (!TLI.isTypeLegal(CondVT))
41208    return SDValue();
41209
41210  // vselect Cond, 111..., X -> or Cond, X
41211  if (TValIsAllOnes) {
41212    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41213    SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
41214    return DAG.getBitcast(VT, Or);
41215  }
41216
41217  // vselect Cond, X, 000... -> and Cond, X
41218  if (FValIsAllZeros) {
41219    SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
41220    SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
41221    return DAG.getBitcast(VT, And);
41222  }
41223
41224  // vselect Cond, 000..., X -> andn Cond, X
41225  if (TValIsAllZeros) {
41226    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41227    SDValue AndN;
    // The canonical form differs for i1 vectors: X86ISD::ANDNP is not used.
41229    if (CondVT.getScalarType() == MVT::i1)
41230      AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
41231                         CastRHS);
41232    else
41233      AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
41234    return DAG.getBitcast(VT, AndN);
41235  }
41236
41237  return SDValue();
41238}
41239
41240/// If both arms of a vector select are concatenated vectors, split the select,
41241/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
41242///   vselect Cond, (concat T0, T1), (concat F0, F1) -->
41243///   concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
41244static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
41245                                  const X86Subtarget &Subtarget) {
41246  unsigned Opcode = N->getOpcode();
41247  if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
41248    return SDValue();
41249
41250  // TODO: Split 512-bit vectors too?
41251  EVT VT = N->getValueType(0);
41252  if (!VT.is256BitVector())
41253    return SDValue();
41254
41255  // TODO: Split as long as any 2 of the 3 operands are concatenated?
41256  SDValue Cond = N->getOperand(0);
41257  SDValue TVal = N->getOperand(1);
41258  SDValue FVal = N->getOperand(2);
41259  SmallVector<SDValue, 4> CatOpsT, CatOpsF;
41260  if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
41261      !collectConcatOps(TVal.getNode(), CatOpsT) ||
41262      !collectConcatOps(FVal.getNode(), CatOpsF))
41263    return SDValue();
41264
41265  auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
41266                            ArrayRef<SDValue> Ops) {
41267    return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
41268  };
41269  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
41270                          makeBlend, /*CheckBWI*/ false);
41271}
41272
41273static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
41274  SDValue Cond = N->getOperand(0);
41275  SDValue LHS = N->getOperand(1);
41276  SDValue RHS = N->getOperand(2);
41277  SDLoc DL(N);
41278
41279  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
41280  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
41281  if (!TrueC || !FalseC)
41282    return SDValue();
41283
41284  // Don't do this for crazy integer types.
41285  EVT VT = N->getValueType(0);
41286  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
41287    return SDValue();
41288
41289  // We're going to use the condition bit in math or logic ops. We could allow
41290  // this with a wider condition value (post-legalization it becomes an i8),
41291  // but if nothing is creating selects that late, it doesn't matter.
41292  if (Cond.getValueType() != MVT::i1)
41293    return SDValue();
41294
41295  // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
41296  // 3, 5, or 9 with i32/i64, so those get transformed too.
41297  // TODO: For constants that overflow or do not differ by power-of-2 or small
41298  // multiplier, convert to 'and' + 'add'.
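  // For example (illustrative):
  //   select Cond, 42, 10 --> (zext(Cond) * 32) + 10
  // where the power-of-2 multiply later becomes a shift.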
41299  const APInt &TrueVal = TrueC->getAPIntValue();
41300  const APInt &FalseVal = FalseC->getAPIntValue();
41301  bool OV;
41302  APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
41303  if (OV)
41304    return SDValue();
41305
41306  APInt AbsDiff = Diff.abs();
41307  if (AbsDiff.isPowerOf2() ||
41308      ((VT == MVT::i32 || VT == MVT::i64) &&
41309       (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
41310
41311    // We need a positive multiplier constant for shift/LEA codegen. The 'not'
41312    // of the condition can usually be folded into a compare predicate, but even
41313    // without that, the sequence should be cheaper than a CMOV alternative.
41314    if (TrueVal.slt(FalseVal)) {
41315      Cond = DAG.getNOT(DL, Cond, MVT::i1);
41316      std::swap(TrueC, FalseC);
41317    }
41318
41319    // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
41320    SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
41321
41322    // Multiply condition by the difference if non-one.
41323    if (!AbsDiff.isOneValue())
41324      R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
41325
41326    // Add the base if non-zero.
41327    if (!FalseC->isNullValue())
41328      R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
41329
41330    return R;
41331  }
41332
41333  return SDValue();
41334}
41335
41336/// If this is a *dynamic* select (non-constant condition) and we can match
41337/// this node with one of the variable blend instructions, restructure the
41338/// condition so that blends can use the high (sign) bit of each element.
/// This function will also call SimplifyDemandedBits on already-created
/// BLENDV nodes to perform additional simplifications.
41341static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
41344  SDValue Cond = N->getOperand(0);
41345  if ((N->getOpcode() != ISD::VSELECT &&
41346       N->getOpcode() != X86ISD::BLENDV) ||
41347      ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
41348    return SDValue();
41349
41350  // Don't optimize before the condition has been transformed to a legal type
41351  // and don't ever optimize vector selects that map to AVX512 mask-registers.
41352  unsigned BitWidth = Cond.getScalarValueSizeInBits();
41353  if (BitWidth < 8 || BitWidth > 64)
41354    return SDValue();
41355
41356  // We can only handle the cases where VSELECT is directly legal on the
41357  // subtarget. We custom lower VSELECT nodes with constant conditions and
41358  // this makes it hard to see whether a dynamic VSELECT will correctly
41359  // lower, so we both check the operation's status and explicitly handle the
41360  // cases where a *dynamic* blend will fail even though a constant-condition
41361  // blend could be custom lowered.
41362  // FIXME: We should find a better way to handle this class of problems.
41363  // Potentially, we should combine constant-condition vselect nodes
41364  // pre-legalization into shuffles and not mark as many types as custom
41365  // lowered.
41366  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41367  EVT VT = N->getValueType(0);
41368  if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
41369    return SDValue();
41370  // FIXME: We don't support i16-element blends currently. We could and
41371  // should support them by making *all* the bits in the condition be set
41372  // rather than just the high bit and using an i8-element blend.
41373  if (VT.getVectorElementType() == MVT::i16)
41374    return SDValue();
41375  // Dynamic blending was only available from SSE4.1 onward.
41376  if (VT.is128BitVector() && !Subtarget.hasSSE41())
41377    return SDValue();
  // Byte blends are only available in AVX2.
41379  if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
41380    return SDValue();
41381  // There are no 512-bit blend instructions that use sign bits.
41382  if (VT.is512BitVector())
41383    return SDValue();
41384
41385  auto OnlyUsedAsSelectCond = [](SDValue Cond) {
41386    for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
41387         UI != UE; ++UI)
41388      if ((UI->getOpcode() != ISD::VSELECT &&
41389           UI->getOpcode() != X86ISD::BLENDV) ||
41390          UI.getOperandNo() != 0)
41391        return false;
41392
41393    return true;
41394  };
41395
41396  APInt DemandedBits(APInt::getSignMask(BitWidth));
41397
41398  if (OnlyUsedAsSelectCond(Cond)) {
41399    KnownBits Known;
41400    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
41401                                          !DCI.isBeforeLegalizeOps());
41402    if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
41403      return SDValue();
41404
41405    // If we changed the computation somewhere in the DAG, this change will
41406    // affect all users of Cond. Update all the nodes so that we do not use
41407    // the generic VSELECT anymore. Otherwise, we may perform wrong
41408    // optimizations as we messed with the actual expectation for the vector
41409    // boolean values.
41410    for (SDNode *U : Cond->uses()) {
41411      if (U->getOpcode() == X86ISD::BLENDV)
41412        continue;
41413
41414      SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
41415                               Cond, U->getOperand(1), U->getOperand(2));
41416      DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
41417      DCI.AddToWorklist(U);
41418    }
41419    DCI.CommitTargetLoweringOpt(TLO);
41420    return SDValue(N, 0);
41421  }
41422
41423  // Otherwise we can still at least try to simplify multiple use bits.
41424  if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
41425    return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
41426                       N->getOperand(1), N->getOperand(2));
41427
41428  return SDValue();
41429}
41430
41431// Try to match:
41432//   (or (and M, (sub 0, X)), (pandn M, X))
41433// which is a special case of:
41434//   (select M, (sub 0, X), X)
41435// Per:
41436// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
41437// We know that, if fNegate is 0 or 1:
41438//   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
41439//
41440// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
41441//   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
41442//   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
41443// This lets us transform our vselect to:
41444//   (add (xor X, M), (and M, 1))
41445// And further to:
41446//   (sub (xor X, M), M)
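// A quick sanity check of the final form (illustrative): with M == all-ones,
// (xor X, M) == ~X and (sub ~X, -1) == ~X + 1 == -X; with M == 0,
// (xor X, 0) == X and (sub X, 0) == X, matching the select semantics.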
41447static SDValue combineLogicBlendIntoConditionalNegate(
41448    EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
41449    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
41450  EVT MaskVT = Mask.getValueType();
41451  assert(MaskVT.isInteger() &&
41452         DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
41453         "Mask must be zero/all-bits");
41454
41455  if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
41456    return SDValue();
41457  if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
41458    return SDValue();
41459
41460  auto IsNegV = [](SDNode *N, SDValue V) {
41461    return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
41462           ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
41463  };
41464
41465  SDValue V;
41466  if (IsNegV(Y.getNode(), X))
41467    V = X;
41468  else if (IsNegV(X.getNode(), Y))
41469    V = Y;
41470  else
41471    return SDValue();
41472
41473  SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
41474  SDValue SubOp2 = Mask;
41475
41476  // If the negate was on the false side of the select, then
41477  // the operands of the SUB need to be swapped. PR 27251.
41478  // This is because the pattern being matched above is
41479  // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
41480  // but if the pattern matched was
41481  // (vselect M, X, (sub 0, X)), that is really negation of the pattern
41482  // above, -(vselect M, (sub 0, X), X), and therefore the replacement
41483  // pattern also needs to be a negation of the replacement pattern above.
41484  // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
41485  // sub accomplishes the negation of the replacement pattern.
41486  if (V == Y)
41487    std::swap(SubOp1, SubOp2);
41488
41489  SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
41490  return DAG.getBitcast(VT, Res);
41491}
41492
41493/// Do target-specific dag combines on SELECT and VSELECT nodes.
41494static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
41495                             TargetLowering::DAGCombinerInfo &DCI,
41496                             const X86Subtarget &Subtarget) {
41497  SDLoc DL(N);
41498  SDValue Cond = N->getOperand(0);
41499  SDValue LHS = N->getOperand(1);
41500  SDValue RHS = N->getOperand(2);
41501
41502  // Try simplification again because we use this function to optimize
41503  // BLENDV nodes that are not handled by the generic combiner.
41504  if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
41505    return V;
41506
41507  EVT VT = LHS.getValueType();
41508  EVT CondVT = Cond.getValueType();
41509  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41510  bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
41511
41512  // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
41513  // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
41514  // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
41515  if (CondVT.isVector() && CondVT.isInteger() &&
41516      CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
41517      (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
41518      DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
41519    if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
41520                                                           DL, DAG, Subtarget))
41521      return V;
41522
41523  // Convert vselects with constant condition into shuffles.
41524  if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
41525    SmallVector<int, 64> Mask;
41526    if (createShuffleMaskFromVSELECT(Mask, Cond))
41527      return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
41528  }
41529
41530  // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
41531  // by forcing the unselected elements to zero.
41532  // TODO: Can we handle more shuffles with this?
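  // (PSHUFB zeroes any lane whose control byte has its top bit set, so the
  //  0x80 indices below force the unselected lanes to zero before the OR.)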
41533  if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
41534      LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
41535      LHS.hasOneUse() && RHS.hasOneUse()) {
41536    MVT SimpleVT = VT.getSimpleVT();
41537    SmallVector<SDValue, 1> LHSOps, RHSOps;
41538    SmallVector<int, 64> LHSMask, RHSMask, CondMask;
41539    if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
41540        getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
41541        getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
41542      int NumElts = VT.getVectorNumElements();
41543      for (int i = 0; i != NumElts; ++i) {
41544        if (CondMask[i] < NumElts)
41545          RHSMask[i] = 0x80;
41546        else
41547          LHSMask[i] = 0x80;
41548      }
41549      LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
41550                        getConstVector(LHSMask, SimpleVT, DAG, DL, true));
41551      RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
41552                        getConstVector(RHSMask, SimpleVT, DAG, DL, true));
41553      return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
41554    }
41555  }
41556
41557  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
41558  // instructions match the semantics of the common C idiom x<y?x:y but not
41559  // x<=y?x:y, because of how they handle negative zero (which can be
41560  // ignored in unsafe-math mode).
41561  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
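  // (Recall that MINPS/MAXPS return the second source operand when either
  //  input is NaN or when comparing +0.0 with -0.0, which is why the
  //  condition codes below need such careful handling.)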
41562  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
41563      VT != MVT::f80 && VT != MVT::f128 &&
41564      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
41565      (Subtarget.hasSSE2() ||
41566       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
41567    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41568
41569    unsigned Opcode = 0;
41570    // Check for x CC y ? x : y.
41571    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
41572        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
41573      switch (CC) {
41574      default: break;
41575      case ISD::SETULT:
41576        // Converting this to a min would handle NaNs incorrectly, and swapping
41577        // the operands would cause it to handle comparisons between positive
41578        // and negative zero incorrectly.
41579        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41580          if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41581              !(DAG.isKnownNeverZeroFloat(LHS) ||
41582                DAG.isKnownNeverZeroFloat(RHS)))
41583            break;
41584          std::swap(LHS, RHS);
41585        }
41586        Opcode = X86ISD::FMIN;
41587        break;
41588      case ISD::SETOLE:
41589        // Converting this to a min would handle comparisons between positive
41590        // and negative zero incorrectly.
41591        if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41592            !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41593          break;
41594        Opcode = X86ISD::FMIN;
41595        break;
41596      case ISD::SETULE:
41597        // Converting this to a min would handle both negative zeros and NaNs
41598        // incorrectly, but we can swap the operands to fix both.
41599        std::swap(LHS, RHS);
41600        LLVM_FALLTHROUGH;
41601      case ISD::SETOLT:
41602      case ISD::SETLT:
41603      case ISD::SETLE:
41604        Opcode = X86ISD::FMIN;
41605        break;
41606
41607      case ISD::SETOGE:
41608        // Converting this to a max would handle comparisons between positive
41609        // and negative zero incorrectly.
41610        if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41611            !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41612          break;
41613        Opcode = X86ISD::FMAX;
41614        break;
41615      case ISD::SETUGT:
41616        // Converting this to a max would handle NaNs incorrectly, and swapping
41617        // the operands would cause it to handle comparisons between positive
41618        // and negative zero incorrectly.
41619        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41620          if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41621              !(DAG.isKnownNeverZeroFloat(LHS) ||
41622                DAG.isKnownNeverZeroFloat(RHS)))
41623            break;
41624          std::swap(LHS, RHS);
41625        }
41626        Opcode = X86ISD::FMAX;
41627        break;
41628      case ISD::SETUGE:
41629        // Converting this to a max would handle both negative zeros and NaNs
41630        // incorrectly, but we can swap the operands to fix both.
41631        std::swap(LHS, RHS);
41632        LLVM_FALLTHROUGH;
41633      case ISD::SETOGT:
41634      case ISD::SETGT:
41635      case ISD::SETGE:
41636        Opcode = X86ISD::FMAX;
41637        break;
41638      }
41639    // Check for x CC y ? y : x -- a min/max with reversed arms.
41640    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
41641               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
41642      switch (CC) {
41643      default: break;
41644      case ISD::SETOGE:
41645        // Converting this to a min would handle comparisons between positive
41646        // and negative zero incorrectly, and swapping the operands would
41647        // cause it to handle NaNs incorrectly.
41648        if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41649            !(DAG.isKnownNeverZeroFloat(LHS) ||
41650              DAG.isKnownNeverZeroFloat(RHS))) {
41651          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41652            break;
41653          std::swap(LHS, RHS);
41654        }
41655        Opcode = X86ISD::FMIN;
41656        break;
41657      case ISD::SETUGT:
41658        // Converting this to a min would handle NaNs incorrectly.
41659        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41660          break;
41661        Opcode = X86ISD::FMIN;
41662        break;
41663      case ISD::SETUGE:
41664        // Converting this to a min would handle both negative zeros and NaNs
41665        // incorrectly, but we can swap the operands to fix both.
41666        std::swap(LHS, RHS);
41667        LLVM_FALLTHROUGH;
41668      case ISD::SETOGT:
41669      case ISD::SETGT:
41670      case ISD::SETGE:
41671        Opcode = X86ISD::FMIN;
41672        break;
41673
41674      case ISD::SETULT:
41675        // Converting this to a max would handle NaNs incorrectly.
41676        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41677          break;
41678        Opcode = X86ISD::FMAX;
41679        break;
41680      case ISD::SETOLE:
41681        // Converting this to a max would handle comparisons between positive
41682        // and negative zero incorrectly, and swapping the operands would
41683        // cause it to handle NaNs incorrectly.
41684        if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41685            !DAG.isKnownNeverZeroFloat(LHS) &&
41686            !DAG.isKnownNeverZeroFloat(RHS)) {
41687          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41688            break;
41689          std::swap(LHS, RHS);
41690        }
41691        Opcode = X86ISD::FMAX;
41692        break;
41693      case ISD::SETULE:
41694        // Converting this to a max would handle both negative zeros and NaNs
41695        // incorrectly, but we can swap the operands to fix both.
41696        std::swap(LHS, RHS);
41697        LLVM_FALLTHROUGH;
41698      case ISD::SETOLT:
41699      case ISD::SETLT:
41700      case ISD::SETLE:
41701        Opcode = X86ISD::FMAX;
41702        break;
41703      }
41704    }
41705
41706    if (Opcode)
41707      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
41708  }
41709
41710  // Some mask scalar intrinsics rely on checking if only one bit is set
41711  // and implement it in C code like this:
41712  // A[0] = (U & 1) ? A[0] : W[0];
41713  // This creates some redundant instructions that break pattern matching.
41714  // fold (select (setcc (and X,1), 0, seteq), Y, Z) -> (select (and X,1), Z, Y)
41715  if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
41716      Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
41717    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41718    SDValue AndNode = Cond.getOperand(0);
41719    if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
41720        isNullConstant(Cond.getOperand(1)) &&
41721        isOneConstant(AndNode.getOperand(1))) {
41722      // LHS and RHS are swapped because the setcc outputs 1 when the AND
41723      // result is 0, and vice versa.
41724      AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
41725      return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
41726    }
41727  }
41728
41729  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
41730  // lowering on KNL. In this case we convert it to
41731  // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
41732  // The same applies to all vectors of i8 and i16 elements without BWI.
41733  // Make sure we extend these even before type legalization gets a chance to
41734  // split wide vectors.
41735  // Since SKX, these selects have a proper lowering.
41736  if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
41737      CondVT.getVectorElementType() == MVT::i1 &&
41738      (VT.getVectorElementType() == MVT::i8 ||
41739       VT.getVectorElementType() == MVT::i16)) {
41740    Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
41741    return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
41742  }
41743
41744  // AVX512 - Extend select with zero to merge with target shuffle.
41745  // select(mask, extract_subvector(shuffle(x)), zero) -->
41746  // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
41747  // TODO - support non target shuffles as well.
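  // (Widening like this lets later combines fold the zeroing select directly
  //  into the target shuffle, e.g. as an AVX512 zero-masked operation.)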
41748  if (Subtarget.hasAVX512() && CondVT.isVector() &&
41749      CondVT.getVectorElementType() == MVT::i1) {
41750    auto SelectableOp = [&TLI](SDValue Op) {
41751      return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41752             isTargetShuffle(Op.getOperand(0).getOpcode()) &&
41753             isNullConstant(Op.getOperand(1)) &&
41754             TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
41755             Op.hasOneUse() && Op.getOperand(0).hasOneUse();
41756    };
41757
41758    bool SelectableLHS = SelectableOp(LHS);
41759    bool SelectableRHS = SelectableOp(RHS);
41760    bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
41761    bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
41762
41763    if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
41764      EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
41765                                : RHS.getOperand(0).getValueType();
41766      EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
41767      LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
41768                            VT.getSizeInBits());
41769      RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
41770                            VT.getSizeInBits());
41771      Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
41772                         DAG.getUNDEF(SrcCondVT), Cond,
41773                         DAG.getIntPtrConstant(0, DL));
41774      SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
41775      return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
41776    }
41777  }
41778
41779  if (SDValue V = combineSelectOfTwoConstants(N, DAG))
41780    return V;
41781
41782  // Canonicalize min/max:
41783  // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
41784  // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
41785  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
41786  // the need for an extra compare against zero. e.g.
41787  // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
41788  // subl   %esi, %edi
41789  // testl  %edi, %edi
41790  // movl   $0, %eax
41791  // cmovgl %edi, %eax
41792  // =>
41793  // xorl   %eax, %eax
41794  // subl   %esi, %edi
41795  // cmovsl %eax, %edi
41796  //
41797  // We can also canonicalize
41798  //  (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
41799  //  (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
41800  // This allows the use of a test instruction for the compare.
41801  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
41802      Cond.hasOneUse() &&
41803      LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
41804    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41805    if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
41806        (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
41807      ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
41808      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
41809                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
41810      return DAG.getSelect(DL, VT, Cond, LHS, RHS);
41811    }
41812    if (CC == ISD::SETUGT && isOneConstant(RHS)) {
41813      ISD::CondCode NewCC = ISD::SETUGE;
41814      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
41815                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
41816      return DAG.getSelect(DL, VT, Cond, LHS, RHS);
41817    }
41818  }
41819
41820  // Check if the first operand is all zeros and Cond type is vXi1.
41821  // If this is an AVX512 target we can improve the use of zero masking by
41822  // swapping the operands and inverting the condition.
41823  if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
41824      Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
41825      ISD::isBuildVectorAllZeros(LHS.getNode()) &&
41826      !ISD::isBuildVectorAllZeros(RHS.getNode())) {
41827    // Invert the cond to not(cond) : xor(op,allones)=not(op)
41828    SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
41829    // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
41830    return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
41831  }
41832
41833  // Early exit: the remaining combines only handle legal types.
41834  if (!TLI.isTypeLegal(VT))
41835    return SDValue();
41836
41837  if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
41838    return V;
41839
41840  if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
41841    return V;
41842
41843  if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
41844    return V;
41845
41846  // select(~Cond, X, Y) -> select(Cond, Y, X)
41847  if (CondVT.getScalarType() != MVT::i1) {
41848    if (SDValue CondNot = IsNOT(Cond, DAG))
41849      return DAG.getNode(N->getOpcode(), DL, VT,
41850                         DAG.getBitcast(CondVT, CondNot), RHS, LHS);
41851    // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
41852    if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
41853        ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
41854      Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
41855                         DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
41856      return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
41857    }
41858  }
41859
41860  // Try to optimize vXi1 selects if both operands are either all constants or
41861  // bitcasts from scalar integer type. In that case we can convert the operands
41862  // to integer and use an integer select which will be converted to a CMOV.
41863  // We need to take a little bit of care to avoid creating an i64 type after
41864  // type legalization.
41865  if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
41866      VT.getVectorElementType() == MVT::i1 &&
41867      (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
41868    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
41869    bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
41870    bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
41871
41872    if ((LHSIsConst ||
41873         (LHS.getOpcode() == ISD::BITCAST &&
41874          LHS.getOperand(0).getValueType() == IntVT)) &&
41875        (RHSIsConst ||
41876         (RHS.getOpcode() == ISD::BITCAST &&
41877          RHS.getOperand(0).getValueType() == IntVT))) {
41878      if (LHSIsConst)
41879        LHS = combinevXi1ConstantToInteger(LHS, DAG);
41880      else
41881        LHS = LHS.getOperand(0);
41882
41883      if (RHSIsConst)
41884        RHS = combinevXi1ConstantToInteger(RHS, DAG);
41885      else
41886        RHS = RHS.getOperand(0);
41887
41888      SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
41889      return DAG.getBitcast(VT, Select);
41890    }
41891  }
41892
41893  // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
41894  // single bits, then invert the predicate and swap the select operands.
41895  // This can lower using a vector shift bit-hack rather than mask and compare.
41896  if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
41897      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
41898      Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
41899      Cond.getOperand(0).getOpcode() == ISD::AND &&
41900      isNullOrNullSplat(Cond.getOperand(1)) &&
41901      cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
41902      Cond.getOperand(0).getValueType() == VT) {
41903    // The 'and' mask must be composed of power-of-2 constants.
41904    SDValue And = Cond.getOperand(0);
41905    auto *C = isConstOrConstSplat(And.getOperand(1));
41906    if (C && C->getAPIntValue().isPowerOf2()) {
41907      // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
41908      SDValue NotCond =
41909          DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
41910      return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
41911    }
41912
41913    // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
41914    // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
41915    // 16-bit lacks a proper blendv.
41916    unsigned EltBitWidth = VT.getScalarSizeInBits();
41917    bool CanShiftBlend =
41918        TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
41919                                (Subtarget.hasAVX2() && EltBitWidth == 64) ||
41920                                (Subtarget.hasXOP()));
41921    if (CanShiftBlend &&
41922        ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
41923          return C->getAPIntValue().isPowerOf2();
41924        })) {
41925      // Create a left-shift constant to get the mask bits over to the sign-bit.
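      // For example (illustrative): with v4i32 and mask <1,2,4,8>, ShlVals is
      // <31,30,29,28>, moving each mask bit into its element's sign bit.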
41926      SDValue Mask = And.getOperand(1);
41927      SmallVector<int, 32> ShlVals;
41928      for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
41929        auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
41930        ShlVals.push_back(EltBitWidth - 1 -
41931                          MaskVal->getAPIntValue().exactLogBase2());
41932      }
41933      // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
41934      SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
41935      SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
41936      SDValue NewCond =
41937          DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
41938      return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
41939    }
41940  }
41941
41942  return SDValue();
41943}
41944
41945/// Combine:
41946///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
41947/// to:
41948///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
41949/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
41950/// Note that this is only legal for some op/cc combinations.
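/// For example (illustrative), 'if (atomic_fetch_add(&v, 1) < 0)' can branch
/// on the flags of the 'lock add' itself instead of re-testing the loaded
/// value.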
41951static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
41952                                       SelectionDAG &DAG,
41953                                       const X86Subtarget &Subtarget) {
41954  // This combine only operates on CMP-like nodes.
41955  if (!(Cmp.getOpcode() == X86ISD::CMP ||
41956        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
41957    return SDValue();
41958
41959  // Can't replace the cmp if it has more uses than the one we're looking at.
41960  // FIXME: We would like to be able to handle this, but would need to make sure
41961  // all uses were updated.
41962  if (!Cmp.hasOneUse())
41963    return SDValue();
41964
41965  // This only applies to variations of the common case:
41966  //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
41967  //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
41968  //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
41969  //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
41970  // Using the proper condcodes (see below), overflow is checked for.
41971
41972  // FIXME: We can generalize both constraints:
41973  // - XOR/OR/AND (if they were made to survive AtomicExpand)
41974  // - LHS != 1
41975  // if the result is compared.
41976
41977  SDValue CmpLHS = Cmp.getOperand(0);
41978  SDValue CmpRHS = Cmp.getOperand(1);
41979  EVT CmpVT = CmpLHS.getValueType();
41980
41981  if (!CmpLHS.hasOneUse())
41982    return SDValue();
41983
41984  unsigned Opc = CmpLHS.getOpcode();
41985  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
41986    return SDValue();
41987
41988  SDValue OpRHS = CmpLHS.getOperand(2);
41989  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
41990  if (!OpRHSC)
41991    return SDValue();
41992
41993  APInt Addend = OpRHSC->getAPIntValue();
41994  if (Opc == ISD::ATOMIC_LOAD_SUB)
41995    Addend = -Addend;
41996
41997  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
41998  if (!CmpRHSC)
41999    return SDValue();
42000
42001  APInt Comparison = CmpRHSC->getAPIntValue();
42002  APInt NegAddend = -Addend;
42003
42004  // If the addend is the negation of the comparison value, then we can do
42005  // a full comparison by emitting the atomic arithmetic as a locked sub.
42006  if (Comparison == NegAddend) {
42007    // The CC is fine, but we need to rewrite the LHS of the comparison as an
42008    // atomic sub.
42009    auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
42010    auto AtomicSub = DAG.getAtomic(
42011        ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
42012        /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
42013        /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
42014        AN->getMemOperand());
42015    auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
42016    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42017    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42018    return LockOp;
42019  }
42020
42021  // We can handle comparisons with zero in a number of cases by manipulating
42022  // the CC used.
42023  if (!Comparison.isNullValue())
42024    return SDValue();
42025
42026  if (CC == X86::COND_S && Addend == 1)
42027    CC = X86::COND_LE;
42028  else if (CC == X86::COND_NS && Addend == 1)
42029    CC = X86::COND_G;
42030  else if (CC == X86::COND_G && Addend == -1)
42031    CC = X86::COND_GE;
42032  else if (CC == X86::COND_LE && Addend == -1)
42033    CC = X86::COND_L;
42034  else
42035    return SDValue();
42036
42037  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
42038  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42039  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42040  return LockOp;
42041}
42042
42043// Check whether a boolean test is testing a boolean value generated by
42044// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
42045// condition code.
42046//
42047// Simplify the following patterns:
42048// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
42049// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
42050// to (Op EFLAGS Cond)
42051//
42052// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
42053// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
42054// to (Op EFLAGS !Cond)
42055//
42056// where Op could be BRCOND or CMOV.
42057//
42058static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
42059  // This combine only operates on CMP-like nodes.
42060  if (!(Cmp.getOpcode() == X86ISD::CMP ||
42061        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42062    return SDValue();
42063
42064  // Quit if not used as a boolean value.
42065  if (CC != X86::COND_E && CC != X86::COND_NE)
42066    return SDValue();
42067
42068  // Check CMP operands. One of them should be 0 or 1 and the other should be
42069  // a SetCC or extended from it.
42070  SDValue Op1 = Cmp.getOperand(0);
42071  SDValue Op2 = Cmp.getOperand(1);
42072
42073  SDValue SetCC;
42074  const ConstantSDNode* C = nullptr;
42075  bool needOppositeCond = (CC == X86::COND_E);
42076  bool checkAgainstTrue = false; // Is it a comparison against 1?
42077
42078  if ((C = dyn_cast<ConstantSDNode>(Op1)))
42079    SetCC = Op2;
42080  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
42081    SetCC = Op1;
42082  else // Quit if neither operand is a constant.
42083    return SDValue();
42084
42085  if (C->getZExtValue() == 1) {
42086    needOppositeCond = !needOppositeCond;
42087    checkAgainstTrue = true;
42088  } else if (C->getZExtValue() != 0)
42089    // Quit if the constant is neither 0 nor 1.
42090    return SDValue();
42091
42092  bool truncatedToBoolWithAnd = false;
42093  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
42094  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
42095         SetCC.getOpcode() == ISD::TRUNCATE ||
42096         SetCC.getOpcode() == ISD::AND) {
42097    if (SetCC.getOpcode() == ISD::AND) {
42098      int OpIdx = -1;
42099      if (isOneConstant(SetCC.getOperand(0)))
42100        OpIdx = 1;
42101      if (isOneConstant(SetCC.getOperand(1)))
42102        OpIdx = 0;
42103      if (OpIdx < 0)
42104        break;
42105      SetCC = SetCC.getOperand(OpIdx);
42106      truncatedToBoolWithAnd = true;
42107    } else
42108      SetCC = SetCC.getOperand(0);
42109  }
42110
42111  switch (SetCC.getOpcode()) {
42112  case X86ISD::SETCC_CARRY:
42113    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
42114    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
42115    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
42116    // truncated to i1 using 'and'.
42117    if (checkAgainstTrue && !truncatedToBoolWithAnd)
42118      break;
42119    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
42120           "Invalid use of SETCC_CARRY!");
42121    LLVM_FALLTHROUGH;
42122  case X86ISD::SETCC:
42123    // Set the condition code or opposite one if necessary.
42124    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
42125    if (needOppositeCond)
42126      CC = X86::GetOppositeBranchCondition(CC);
42127    return SetCC.getOperand(1);
42128  case X86ISD::CMOV: {
42129    // Check whether the false/true values are canonical, i.e. 0 or 1.
42130    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
42131    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
42132    // Quit if true value is not a constant.
42133    if (!TVal)
42134      return SDValue();
42135    // Quit if false value is not a constant.
42136    if (!FVal) {
42137      SDValue Op = SetCC.getOperand(0);
42138      // Skip 'zext' or 'trunc' node.
42139      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
42140          Op.getOpcode() == ISD::TRUNCATE)
42141        Op = Op.getOperand(0);
42142      // A special case for rdrand/rdseed, where 0 is set if false cond is
42143      // found.
42144      if ((Op.getOpcode() != X86ISD::RDRAND &&
42145           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
42146        return SDValue();
42147    }
42148    // Quit if false value is not the constant 0 or 1.
42149    bool FValIsFalse = true;
42150    if (FVal && FVal->getZExtValue() != 0) {
42151      if (FVal->getZExtValue() != 1)
42152        return SDValue();
42153      // If FVal is 1, opposite cond is needed.
42154      needOppositeCond = !needOppositeCond;
42155      FValIsFalse = false;
42156    }
42157    // Quit if TVal is not the constant opposite of FVal.
42158    if (FValIsFalse && TVal->getZExtValue() != 1)
42159      return SDValue();
42160    if (!FValIsFalse && TVal->getZExtValue() != 0)
42161      return SDValue();
42162    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
42163    if (needOppositeCond)
42164      CC = X86::GetOppositeBranchCondition(CC);
42165    return SetCC.getOperand(3);
42166  }
42167  }
42168
42169  return SDValue();
42170}
42171
42172/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
42173/// Match:
42174///   (X86or (X86setcc) (X86setcc))
42175///   (X86cmp (and (X86setcc) (X86setcc)), 0)
42176static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
42177                                           X86::CondCode &CC1, SDValue &Flags,
42178                                           bool &isAnd) {
42179  if (Cond->getOpcode() == X86ISD::CMP) {
42180    if (!isNullConstant(Cond->getOperand(1)))
42181      return false;
42182
42183    Cond = Cond->getOperand(0);
42184  }
42185
42186  isAnd = false;
42187
42188  SDValue SetCC0, SetCC1;
42189  switch (Cond->getOpcode()) {
42190  default: return false;
42191  case ISD::AND:
42192  case X86ISD::AND:
42193    isAnd = true;
42194    LLVM_FALLTHROUGH;
42195  case ISD::OR:
42196  case X86ISD::OR:
42197    SetCC0 = Cond->getOperand(0);
42198    SetCC1 = Cond->getOperand(1);
42199    break;
42200  }
42201
42202  // Make sure we have SETCC nodes, using the same flags value.
42203  if (SetCC0.getOpcode() != X86ISD::SETCC ||
42204      SetCC1.getOpcode() != X86ISD::SETCC ||
42205      SetCC0->getOperand(1) != SetCC1->getOperand(1))
42206    return false;
42207
42208  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
42209  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
42210  Flags = SetCC0->getOperand(1);
42211  return true;
42212}
42213
42214// When legalizing carry, we create carries via "add X, -1".
42215// If that comes from an actual carry (via setcc), we use the
42216// carry directly.
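// e.g. (illustrative) for a 0/1 boolean B, "add B, -1" sets CF exactly when
// B is 1, so the carry produced by the original setcc can be reused as-is.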
42217static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
42218  if (EFLAGS.getOpcode() == X86ISD::ADD) {
42219    if (isAllOnesConstant(EFLAGS.getOperand(1))) {
42220      SDValue Carry = EFLAGS.getOperand(0);
42221      while (Carry.getOpcode() == ISD::TRUNCATE ||
42222             Carry.getOpcode() == ISD::ZERO_EXTEND ||
42223             Carry.getOpcode() == ISD::SIGN_EXTEND ||
42224             Carry.getOpcode() == ISD::ANY_EXTEND ||
42225             (Carry.getOpcode() == ISD::AND &&
42226              isOneConstant(Carry.getOperand(1))))
42227        Carry = Carry.getOperand(0);
42228      if (Carry.getOpcode() == X86ISD::SETCC ||
42229          Carry.getOpcode() == X86ISD::SETCC_CARRY) {
42230        // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
42231        uint64_t CarryCC = Carry.getConstantOperandVal(0);
42232        SDValue CarryOp1 = Carry.getOperand(1);
42233        if (CarryCC == X86::COND_B)
42234          return CarryOp1;
42235        if (CarryCC == X86::COND_A) {
42236          // Try to convert COND_A into COND_B in an attempt to facilitate
42237          // materializing "setb reg".
42238          //
42239          // Do not flip "e > c", where "c" is a constant, because the CMP
42240          // instruction cannot take an immediate as its first operand.
42241          //
42242          if (CarryOp1.getOpcode() == X86ISD::SUB &&
42243              CarryOp1.getNode()->hasOneUse() &&
42244              CarryOp1.getValueType().isInteger() &&
42245              !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
42246            SDValue SubCommute =
42247                DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
42248                            CarryOp1.getOperand(1), CarryOp1.getOperand(0));
42249            return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
42250          }
42251        }
42252        // If this is a check of the z flag of an add with 1, switch to the
42253        // C flag.
42254        if (CarryCC == X86::COND_E &&
42255            CarryOp1.getOpcode() == X86ISD::ADD &&
42256            isOneConstant(CarryOp1.getOperand(1)))
42257          return CarryOp1;
42258      }
42259    }
42260  }
42261
42262  return SDValue();
42263}
42264
42265/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
42266/// to avoid the inversion.
42267static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
42268                              SelectionDAG &DAG,
42269                              const X86Subtarget &Subtarget) {
42270  // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
42271  if (EFLAGS.getOpcode() != X86ISD::PTEST &&
42272      EFLAGS.getOpcode() != X86ISD::TESTP)
42273    return SDValue();
42274
42275  // PTEST/TESTP sets EFLAGS as:
42276  // TESTZ: ZF = (Op0 & Op1) == 0
42277  // TESTC: CF = (~Op0 & Op1) == 0
42278  // TESTNZC: ZF == 0 && CF == 0
42279  EVT VT = EFLAGS.getValueType();
42280  SDValue Op0 = EFLAGS.getOperand(0);
42281  SDValue Op1 = EFLAGS.getOperand(1);
42282  EVT OpVT = Op0.getValueType();
42283
42284  // TEST*(~X,Y) == TEST*(X,Y)
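  // (testz on (~X,Y) checks ZF = ((~X) & Y) == 0, which is exactly what testc
  //  computes as CF for (X,Y); hence the condition-code swap below.)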
42285  if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
42286    X86::CondCode InvCC;
42287    switch (CC) {
42288    case X86::COND_B:
42289      // testc -> testz.
42290      InvCC = X86::COND_E;
42291      break;
42292    case X86::COND_AE:
42293      // !testc -> !testz.
42294      InvCC = X86::COND_NE;
42295      break;
42296    case X86::COND_E:
42297      // testz -> testc.
42298      InvCC = X86::COND_B;
42299      break;
42300    case X86::COND_NE:
42301      // !testz -> !testc.
42302      InvCC = X86::COND_AE;
42303      break;
42304    case X86::COND_A:
42305    case X86::COND_BE:
42306      // testnzc -> testnzc (no change).
42307      InvCC = CC;
42308      break;
42309    default:
42310      InvCC = X86::COND_INVALID;
42311      break;
42312    }
42313
42314    if (InvCC != X86::COND_INVALID) {
42315      CC = InvCC;
42316      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42317                         DAG.getBitcast(OpVT, NotOp0), Op1);
42318    }
42319  }
42320
42321  if (CC == X86::COND_E || CC == X86::COND_NE) {
42322    // TESTZ(X,~Y) == TESTC(Y,X)
42323    if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
42324      CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42325      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42326                         DAG.getBitcast(OpVT, NotOp1), Op0);
42327    }
42328
42329    if (Op0 == Op1) {
42330      SDValue BC = peekThroughBitcasts(Op0);
42331      EVT BCVT = BC.getValueType();
42332      assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
42333             "Unexpected vector type");
42334
42335      // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
42336      if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
42337        return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42338                           DAG.getBitcast(OpVT, BC.getOperand(0)),
42339                           DAG.getBitcast(OpVT, BC.getOperand(1)));
42340      }
42341
42342      // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
42343      if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
42344        CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42345        return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42346                           DAG.getBitcast(OpVT, BC.getOperand(0)),
42347                           DAG.getBitcast(OpVT, BC.getOperand(1)));
42348      }
42349
42350      // If every element is an all-sign value, see if we can use MOVMSK to
42351      // more efficiently extract the sign bits and compare that.
42352      // TODO: Handle TESTC with comparison inversion.
42353      // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
42354      // MOVMSK combines to make sure it's never worse than PTEST?
42355      unsigned EltBits = BCVT.getScalarSizeInBits();
42356      if (DAG.ComputeNumSignBits(BC) == EltBits) {
42357        assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
42358        APInt SignMask = APInt::getSignMask(EltBits);
42359        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42360        if (SDValue Res =
42361                TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
42362          // For vXi16 cases we need to use pmovmskb and extract every other
42363          // sign bit.
42364          SDLoc DL(EFLAGS);
42365          if (EltBits == 16) {
42366            MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
42367            Res = DAG.getBitcast(MovmskVT, Res);
42368            Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42369            Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
42370                              DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42371          } else {
42372            Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42373          }
42374          return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
42375                             DAG.getConstant(0, DL, MVT::i32));
42376        }
42377      }
42378    }
42379
42380    // TESTZ(-1,X) == TESTZ(X,X)
42381    if (ISD::isBuildVectorAllOnes(Op0.getNode()))
42382      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
42383
42384    // TESTZ(X,-1) == TESTZ(X,X)
42385    if (ISD::isBuildVectorAllOnes(Op1.getNode()))
42386      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
42387  }
42388
42389  return SDValue();
42390}
42391
42392// Attempt to simplify the MOVMSK input based on the comparison type.
42393static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
42394                                  SelectionDAG &DAG,
42395                                  const X86Subtarget &Subtarget) {
42396  // Handle eq/ne against zero (any_of).
42397  // Handle eq/ne against -1 (all_of).
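  // (movmsk(V) == 0 means no element has its sign bit set, while comparing
  //  against the full element mask checks whether every element does.)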
42398  if (!(CC == X86::COND_E || CC == X86::COND_NE))
42399    return SDValue();
42400  if (EFLAGS.getValueType() != MVT::i32)
42401    return SDValue();
42402  unsigned CmpOpcode = EFLAGS.getOpcode();
42403  if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
42404    return SDValue();
42405  auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
42406  if (!CmpConstant)
42407    return SDValue();
42408  const APInt &CmpVal = CmpConstant->getAPIntValue();
42409
42410  SDValue CmpOp = EFLAGS.getOperand(0);
42411  unsigned CmpBits = CmpOp.getValueSizeInBits();
42412  assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
42413
42414  // Peek through any truncate.
42415  if (CmpOp.getOpcode() == ISD::TRUNCATE)
42416    CmpOp = CmpOp.getOperand(0);
42417
42418  // Bail if we don't find a MOVMSK.
42419  if (CmpOp.getOpcode() != X86ISD::MOVMSK)
42420    return SDValue();
42421
42422  SDValue Vec = CmpOp.getOperand(0);
42423  MVT VecVT = Vec.getSimpleValueType();
42424  assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
42425         "Unexpected MOVMSK operand");
42426  unsigned NumElts = VecVT.getVectorNumElements();
42427  unsigned NumEltBits = VecVT.getScalarSizeInBits();
42428
42429  bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
42430  bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
42431                 CmpVal.isMask(NumElts);
42432  if (!IsAnyOf && !IsAllOf)
42433    return SDValue();
42434
42435  // See if we can peek through to a vector with a wider element type, if the
42436  // signbits extend down to all the sub-elements as well.
42437  // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
42438  // potential SimplifyDemandedBits/Elts cases.
42439  if (Vec.getOpcode() == ISD::BITCAST) {
42440    SDValue BC = peekThroughBitcasts(Vec);
42441    MVT BCVT = BC.getSimpleValueType();
42442    unsigned BCNumElts = BCVT.getVectorNumElements();
42443    unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
42444    if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
42445        BCNumEltBits > NumEltBits &&
42446        DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
42447      SDLoc DL(EFLAGS);
42448      unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
42449      return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
42450                         DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
42451                         DAG.getConstant(CmpMask, DL, MVT::i32));
42452    }
42453  }
42454
42455  // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
42456  // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
42457  if (IsAllOf && Subtarget.hasSSE41()) {
42458    SDValue BC = peekThroughBitcasts(Vec);
42459    if (BC.getOpcode() == X86ISD::PCMPEQ &&
42460        ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
42461      MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
42462      SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
42463      return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
42464    }
42465  }
42466
42467  // See if we can avoid a PACKSS by calling MOVMSK on the sources.
42468  // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
42469  // sign bits prior to the comparison with zero unless we know that
42470  // the vXi16 splats the sign bit down to the lower i8 half.
42471  // TODO: Handle all_of patterns.
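  // (A PMOVMSKB on the unpacked vXi16 data yields two bits per i16 element;
  //  the low-byte bit is only meaningful when the i16 sign bit is splatted
  //  into the low half, otherwise the 0xAAAA-style masks below keep only the
  //  bits coming from the i16 high bytes.)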
42472  if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
42473    SDValue VecOp0 = Vec.getOperand(0);
42474    SDValue VecOp1 = Vec.getOperand(1);
42475    bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
42476    bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
42477    // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
42478    if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
42479      SDLoc DL(EFLAGS);
42480      SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
42481      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42482      Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
42483      if (!SignExt0) {
42484        Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
42485                             DAG.getConstant(0xAAAA, DL, MVT::i16));
42486      }
42487      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42488                         DAG.getConstant(0, DL, MVT::i16));
42489    }
42490    // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
42491    // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
42492    if (CmpBits >= 16 && Subtarget.hasInt256() &&
42493        VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42494        VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42495        VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
42496        VecOp0.getConstantOperandAPInt(1) == 0 &&
42497        VecOp1.getConstantOperandAPInt(1) == 8 &&
42498        (IsAnyOf || (SignExt0 && SignExt1))) {
42499      SDLoc DL(EFLAGS);
42500      SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
42501      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42502      unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
42503      if (!SignExt0 || !SignExt1) {
42504        assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
42505        Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
42506                             DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42507      }
42508      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42509                         DAG.getConstant(CmpMask, DL, MVT::i32));
42510    }
42511  }
42512
42513  // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
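  // (Intuition: once every input element is referenced, the shuffle merely
  //  rearranges which result bit each sign bit lands in, and the any_of /
  //  all_of comparisons do not depend on bit order.)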
42514  SmallVector<int, 32> ShuffleMask;
42515  SmallVector<SDValue, 2> ShuffleInputs;
42516  if (NumElts <= CmpBits &&
42517      getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
42518                             ShuffleMask, DAG) &&
42519      ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
42520      ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
42521    unsigned NumShuffleElts = ShuffleMask.size();
42522    APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
42523    for (int M : ShuffleMask) {
42524      assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
42525      DemandedElts.setBit(M);
42526    }
42527    if (DemandedElts.isAllOnesValue()) {
42528      SDLoc DL(EFLAGS);
42529      SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
42530      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42531      Result =
42532          DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
42533      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42534                         EFLAGS.getOperand(1));
42535    }
42536  }
42537
42538  return SDValue();
42539}
42540
42541/// Optimize an EFLAGS definition used according to the condition code \p CC
42542/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
42543/// uses of chain values.
42544static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
42545                                  SelectionDAG &DAG,
42546                                  const X86Subtarget &Subtarget) {
42547  if (CC == X86::COND_B)
42548    if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
42549      return Flags;
42550
42551  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
42552    return R;
42553
42554  if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
42555    return R;
42556
42557  if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
42558    return R;
42559
42560  return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
42561}
42562
42563/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
42564static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
42565                           TargetLowering::DAGCombinerInfo &DCI,
42566                           const X86Subtarget &Subtarget) {
42567  SDLoc DL(N);
42568
42569  SDValue FalseOp = N->getOperand(0);
42570  SDValue TrueOp = N->getOperand(1);
42571  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
42572  SDValue Cond = N->getOperand(3);
42573
42574  // cmov X, X, ?, ? --> X
42575  if (TrueOp == FalseOp)
42576    return TrueOp;
42577
42578  // Try to simplify the EFLAGS and condition code operands.
42579  // We can't always do this as FCMOV only supports a subset of X86 cond.
42580  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
42581    if (!(FalseOp.getValueType() == MVT::f80 ||
42582          (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
42583          (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
42584        !Subtarget.hasCMov() || hasFPCMov(CC)) {
42585      SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
42586                       Flags};
42587      return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
42588    }
42589  }
42590
42591  // If this is a select between two integer constants, try to do some
42592  // optimizations.  Note that the operands are ordered the opposite of SELECT
42593  // operands.
42594  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
42595    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
42596      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
42597      // larger than FalseC (the false value).
42598      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
42599        CC = X86::GetOppositeBranchCondition(CC);
42600        std::swap(TrueC, FalseC);
42601        std::swap(TrueOp, FalseOp);
42602      }
42603
42604      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
42605      // This is efficient for any integer data type (including i8/i16) and
42606      // shift amount.
42607      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
42608        Cond = getSETCC(CC, Cond, DL, DAG);
42609
42610        // Zero extend the condition if needed.
42611        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
42612
42613        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
42614        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
42615                           DAG.getConstant(ShAmt, DL, MVT::i8));
42616        return Cond;
42617      }
42618
42619      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.  This is efficient
42620      // for any integer data type, including i8/i16.
42621      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
42622        Cond = getSETCC(CC, Cond, DL, DAG);
42623
42624        // Zero extend the condition if needed.
42625        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
42626                           FalseC->getValueType(0), Cond);
42627        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42628                           SDValue(FalseC, 0));
42629        return Cond;
42630      }
42631
42632      // Optimize cases that will turn into an LEA instruction.  This requires
42633      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
42634      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
42635        APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
42636        assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
42637               "Implicit constant truncation");
42638
42639        bool isFastMultiplier = false;
42640        if (Diff.ult(10)) {
42641          switch (Diff.getZExtValue()) {
42642          default: break;
42643          case 1:  // result = add base, cond
42644          case 2:  // result = lea base(    , cond*2)
42645          case 3:  // result = lea base(cond, cond*2)
42646          case 4:  // result = lea base(    , cond*4)
42647          case 5:  // result = lea base(cond, cond*4)
42648          case 8:  // result = lea base(    , cond*8)
42649          case 9:  // result = lea base(cond, cond*8)
42650            isFastMultiplier = true;
42651            break;
42652          }
42653        }
42654
42655        if (isFastMultiplier) {
42656          Cond = getSETCC(CC, Cond, DL, DAG);
42657          // Zero extend the condition if needed.
42658          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
42659                             Cond);
42660          // Scale the condition by the difference.
42661          if (Diff != 1)
42662            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
42663                               DAG.getConstant(Diff, DL, Cond.getValueType()));
42664
42665          // Add the base if non-zero.
42666          if (FalseC->getAPIntValue() != 0)
42667            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42668                               SDValue(FalseC, 0));
42669          return Cond;
42670        }
42671      }
42672    }
42673  }
42674
42675  // Handle these cases:
42676  //   (select (x != c), e, c) -> (select (x != c), e, x)
42677  //   (select (x == c), c, e) -> (select (x == c), x, e)
42678  // where c is an integer constant, and the "select" is the combination
42679  // of CMOV and CMP.
42680  //
42681  // The rationale for this change is that the conditional-move from a constant
42682  // needs two instructions, however, conditional-move from a register needs
42683  // only one instruction.
42684  //
42685  // CAVEAT: Replacing a constant with a symbolic value may obscure some
42686  //  instruction-combining opportunities, so this optimization needs to be
42687  //  postponed as late as possible.
42688  //
42689  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
42690    // The DCI.xxxx conditions are provided to postpone the optimization as
42691    // late as possible.
42692
42693    ConstantSDNode *CmpAgainst = nullptr;
42694    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
42695        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
42696        !isa<ConstantSDNode>(Cond.getOperand(0))) {
42697
42698      if (CC == X86::COND_NE &&
42699          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
42700        CC = X86::GetOppositeBranchCondition(CC);
42701        std::swap(TrueOp, FalseOp);
42702      }
42703
42704      if (CC == X86::COND_E &&
42705          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
42706        SDValue Ops[] = {FalseOp, Cond.getOperand(0),
42707                         DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
42708        return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
42709      }
42710    }
42711  }
42712
42713  // Fold and/or of setcc's to double CMOV:
42714  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
42715  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
42716  //
42717  // This combine lets us generate:
42718  //   cmovcc1 (jcc1 if we don't have CMOV)
42719  //   cmovcc2 (same)
42720  // instead of:
42721  //   setcc1
42722  //   setcc2
42723  //   and/or
42724  //   cmovne (jne if we don't have CMOV)
42725  // When we can't use the CMOV instruction, it might increase branch
42726  // mispredicts.
42727  // When we can use CMOV, or when there is no mispredict, this improves
42728  // throughput and reduces register pressure.
42729  //
42730  if (CC == X86::COND_NE) {
42731    SDValue Flags;
42732    X86::CondCode CC0, CC1;
42733    bool isAndSetCC;
42734    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
42735      if (isAndSetCC) {
42736        std::swap(FalseOp, TrueOp);
42737        CC0 = X86::GetOppositeBranchCondition(CC0);
42738        CC1 = X86::GetOppositeBranchCondition(CC1);
42739      }
42740
42741      SDValue LOps[] = {FalseOp, TrueOp,
42742                        DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
42743      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
42744      SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
42745                       Flags};
42746      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
42747      return CMOV;
42748    }
42749  }
42750
42751  // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
42752  //      (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
42753  // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
42754  //    (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
42755  if ((CC == X86::COND_NE || CC == X86::COND_E) &&
42756      Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
42757    SDValue Add = TrueOp;
42758    SDValue Const = FalseOp;
42759    // Canonicalize the condition code for easier matching and output.
42760    if (CC == X86::COND_E)
42761      std::swap(Add, Const);
42762
42763    // We might have replaced the constant in the cmov with the LHS of the
42764    // compare. If so change it to the RHS of the compare.
42765    if (Const == Cond.getOperand(0))
42766      Const = Cond.getOperand(1);
42767
42768    // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
42769    if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
42770        Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
42771        (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
42772         Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
42773        Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
42774      EVT VT = N->getValueType(0);
42775      // This should constant fold.
42776      SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
42777      SDValue CMov =
42778          DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
42779                      DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
42780      return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
42781    }
42782  }
42783
42784  return SDValue();
42785}
42786
42787/// Different mul shrinking modes.
42788enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
42789
42790static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
42791  EVT VT = N->getOperand(0).getValueType();
42792  if (VT.getScalarSizeInBits() != 32)
42793    return false;
42794
42795  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
42796  unsigned SignBits[2] = {1, 1};
42797  bool IsPositive[2] = {false, false};
42798  for (unsigned i = 0; i < 2; i++) {
42799    SDValue Opd = N->getOperand(i);
42800
42801    SignBits[i] = DAG.ComputeNumSignBits(Opd);
42802    IsPositive[i] = DAG.SignBitIsZero(Opd);
42803  }
42804
42805  bool AllPositive = IsPositive[0] && IsPositive[1];
42806  unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
42807  // When ranges are from -128 ~ 127, use MULS8 mode.
42808  if (MinSignBits >= 25)
42809    Mode = ShrinkMode::MULS8;
42810  // When ranges are from 0 ~ 255, use MULU8 mode.
42811  else if (AllPositive && MinSignBits >= 24)
42812    Mode = ShrinkMode::MULU8;
42813  // When ranges are from -32768 ~ 32767, use MULS16 mode.
42814  else if (MinSignBits >= 17)
42815    Mode = ShrinkMode::MULS16;
42816  // When ranges are from 0 ~ 65535, use MULU16 mode.
42817  else if (AllPositive && MinSignBits >= 16)
42818    Mode = ShrinkMode::MULU16;
42819  else
42820    return false;
42821  return true;
42822}
42823
42824/// When the operands of a vector mul are extended from smaller-sized values,
42825/// like i8 and i16, the type of the mul may be shrunk to generate more
42826/// efficient code. Two typical patterns are handled:
42827/// Pattern1:
42828///     %2 = sext/zext <N x i8> %1 to <N x i32>
42829///     %4 = sext/zext <N x i8> %3 to <N x i32>
42830///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
42831///     %5 = mul <N x i32> %2, %4
42832///
42833/// Pattern2:
42834///     %2 = zext/sext <N x i16> %1 to <N x i32>
42835///     %4 = zext/sext <N x i16> %3 to <N x i32>
42836///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
42837///     %5 = mul <N x i32> %2, %4
42838///
42839/// There are four mul shrinking modes:
42840/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
42841/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
42842/// generate pmullw+sext32 for it (MULS8 mode).
42843/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
42844/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
42845/// generate pmullw+zext32 for it (MULU8 mode).
42846/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
42847/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
42848/// generate pmullw+pmulhw for it (MULS16 mode).
42849/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
42850/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
42851/// generate pmullw+pmulhuw for it (MULU16 mode).
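///
/// As an illustrative sketch of the MULU16 mode (operand names are informal):
///     %lo = pmullw  trunc16(%2), trunc16(%4)   ; low 16 bits of each product
///     %hi = pmulhuw trunc16(%2), trunc16(%4)   ; high 16 bits of each product
///     %5  = punpcklwd/punpckhwd %lo, %hi       ; reassembled <N x i32> result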
42852static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
42853                               const X86Subtarget &Subtarget) {
42854  // Check for legality
42855  // pmullw/pmulhw require at least SSE2.
42856  if (!Subtarget.hasSSE2())
42857    return SDValue();
42858
42859  // Check for profitability
42860  // pmulld is supported since SSE4.1. It is better to use pmulld
42861  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
42862  // the expansion.
42863  bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
42864  if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
42865    return SDValue();
42866
42867  ShrinkMode Mode;
42868  if (!canReduceVMulWidth(N, DAG, Mode))
42869    return SDValue();
42870
42871  SDLoc DL(N);
42872  SDValue N0 = N->getOperand(0);
42873  SDValue N1 = N->getOperand(1);
42874  EVT VT = N->getOperand(0).getValueType();
42875  unsigned NumElts = VT.getVectorNumElements();
42876  if ((NumElts % 2) != 0)
42877    return SDValue();
42878
42879  EVT ReducedVT = VT.changeVectorElementType(MVT::i16);
42880
42881  // Shrink the operands of mul.
42882  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
42883  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
42884
42885  // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
42886  // lower part is needed.
42887  SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
42888  if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
42889    return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
42890                                                   : ISD::SIGN_EXTEND,
42891                       DL, VT, MulLo);
42892
42893  EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
42894  // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
42895  // the higher part is also needed.
42896  SDValue MulHi =
42897      DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
42898                  ReducedVT, NewN0, NewN1);
42899
42900  // Repack the lower and higher parts of the mul result into a wider
42901  // result.
42902  // Generate shuffle functioning as punpcklwd.
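  // e.g. for NumElts == 8 the mask is {0, 8, 1, 9, 2, 10, 3, 11}.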
42903  SmallVector<int, 16> ShuffleMask(NumElts);
42904  for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
42905    ShuffleMask[2 * i] = i;
42906    ShuffleMask[2 * i + 1] = i + NumElts;
42907  }
42908  SDValue ResLo =
42909      DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
42910  ResLo = DAG.getBitcast(ResVT, ResLo);
42911  // Generate shuffle functioning as punpckhwd.
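  // e.g. for NumElts == 8 the mask is {4, 12, 5, 13, 6, 14, 7, 15}.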
42912  for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
42913    ShuffleMask[2 * i] = i + NumElts / 2;
42914    ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
42915  }
42916  SDValue ResHi =
42917      DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
42918  ResHi = DAG.getBitcast(ResVT, ResHi);
42919  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
42920}
42921
42922static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
42923                                 EVT VT, const SDLoc &DL) {
42924
42925  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
42926    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
42927                                 DAG.getConstant(Mult, DL, VT));
42928    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
42929                         DAG.getConstant(Shift, DL, MVT::i8));
42930    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
42931                         N->getOperand(0));
42932    return Result;
42933  };
42934
42935  auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
42936    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
42937                                 DAG.getConstant(Mul1, DL, VT));
42938    Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
42939                         DAG.getConstant(Mul2, DL, VT));
42940    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
42941                         N->getOperand(0));
42942    return Result;
42943  };
42944
42945  switch (MulAmt) {
42946  default:
42947    break;
42948  case 11:
42949    // mul x, 11 => add ((shl (mul x, 5), 1), x)
42950    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
42951  case 21:
42952    // mul x, 21 => add ((shl (mul x, 5), 2), x)
42953    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
42954  case 41:
42955    // mul x, 41 => add ((shl (mul x, 5), 3), x)
42956    return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
42957  case 22:
42958    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
42959    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
42960                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
42961  case 19:
42962    // mul x, 19 => add ((shl (mul x, 9), 1), x)
42963    return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
42964  case 37:
42965    // mul x, 37 => add ((shl (mul x, 9), 2), x)
42966    return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
42967  case 73:
42968    // mul x, 73 => add ((shl (mul x, 9), 3), x)
42969    return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
42970  case 13:
42971    // mul x, 13 => add ((shl (mul x, 3), 2), x)
42972    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
42973  case 23:
42974    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
42975    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
42976  case 26:
42977    // mul x, 26 => add ((mul (mul x, 5), 5), x)
42978    return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
42979  case 28:
42980    // mul x, 28 => add ((mul (mul x, 9), 3), x)
42981    return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
42982  case 29:
42983    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
42984    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
42985                       combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
42986  }
42987
42988  // Another trick. If this is a power of 2 plus 2, 4 or 8, we can use a shift
42989  // followed by a single LEA.
42990  // First check if this is a sum of two powers of 2 because that's easy. Then
42991  // count the trailing zeros to find the smaller power of 2.
42992  // TODO: We can do this even without LEA at a cost of two shifts and an add.
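  // e.g. mul x, 20 => add (shl x, 4), (shl x, 2), where the smaller shift can
  // be folded into the LEA scale.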
42993  if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
42994    unsigned ScaleShift = countTrailingZeros(MulAmt);
42995    if (ScaleShift >= 1 && ScaleShift < 4) {
42996      unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
42997      SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
42998                                   DAG.getConstant(ShiftAmt, DL, MVT::i8));
42999      SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43000                                   DAG.getConstant(ScaleShift, DL, MVT::i8));
43001      return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
43002    }
43003  }
43004
43005  return SDValue();
43006}
43007
43008// If the upper 17 bits of each element are zero then we can use PMADDWD,
43009// which is always at least as quick as PMULLD, except on KNL.
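// With the upper 17 bits of each element known to be zero, every i32 element
// splits into a non-negative low i16 half and a zero high half, so the pair
// of i16 products that PMADDWD sums per element is a_lo * b_lo + 0 * 0, i.e.
// exactly the full 32-bit product of the original operands.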
43010static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
43011                                   const X86Subtarget &Subtarget) {
43012  if (!Subtarget.hasSSE2())
43013    return SDValue();
43014
43015  if (Subtarget.isPMADDWDSlow())
43016    return SDValue();
43017
43018  EVT VT = N->getValueType(0);
43019
43020  // Only support vXi32 vectors.
43021  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
43022    return SDValue();
43023
43024  // Make sure the type is legal or will be widened to a legal type.
43025  if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
43026    return SDValue();
43027
43028  MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
43029
43030  // Without BWI, we would need to split v32i16.
43031  if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
43032    return SDValue();
43033
43034  SDValue N0 = N->getOperand(0);
43035  SDValue N1 = N->getOperand(1);
43036
43037  // If we are zero-extending in two steps without SSE4.1, it's better to
43038  // reduce the vmul width instead.
43039  if (!Subtarget.hasSSE41() &&
43040      (N0.getOpcode() == ISD::ZERO_EXTEND &&
43041       N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
43042      (N1.getOpcode() == ISD::ZERO_EXTEND &&
43043       N1.getOperand(0).getScalarValueSizeInBits() <= 8))
43044    return SDValue();
43045
43046  APInt Mask17 = APInt::getHighBitsSet(32, 17);
43047  if (!DAG.MaskedValueIsZero(N1, Mask17) ||
43048      !DAG.MaskedValueIsZero(N0, Mask17))
43049    return SDValue();
43050
43051  // Use SplitOpsAndApply to handle AVX splitting.
43052  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43053                           ArrayRef<SDValue> Ops) {
43054    MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43055    return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
43056  };
43057  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
43058                          { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
43059                          PMADDWDBuilder);
43060}
43061
43062static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
43063                                  const X86Subtarget &Subtarget) {
43064  if (!Subtarget.hasSSE2())
43065    return SDValue();
43066
43067  EVT VT = N->getValueType(0);
43068
43069  // Only support vXi64 vectors.
43070  if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
43071      VT.getVectorNumElements() < 2 ||
43072      !isPowerOf2_32(VT.getVectorNumElements()))
43073    return SDValue();
43074
43075  SDValue N0 = N->getOperand(0);
43076  SDValue N1 = N->getOperand(1);
43077
43078  // PMULDQ returns the 64-bit result of the signed multiplication of the
43079  // lower 32 bits. We can lower with this if the sign bits stretch that far.
43080  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
43081      DAG.ComputeNumSignBits(N1) > 32) {
43082    auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43083                            ArrayRef<SDValue> Ops) {
43084      return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
43085    };
43086    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43087                            PMULDQBuilder, /*CheckBWI*/false);
43088  }
43089
43090  // If the upper bits are zero we can use a single pmuludq.
43091  APInt Mask = APInt::getHighBitsSet(64, 32);
43092  if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
43093    auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43094                             ArrayRef<SDValue> Ops) {
43095      return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
43096    };
43097    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43098                            PMULUDQBuilder, /*CheckBWI*/false);
43099  }
43100
43101  return SDValue();
43102}
43103
43104/// Optimize a single multiply with constant into two operations in order to
43105/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
43106static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
43107                          TargetLowering::DAGCombinerInfo &DCI,
43108                          const X86Subtarget &Subtarget) {
43109  EVT VT = N->getValueType(0);
43110
43111  if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
43112    return V;
43113
43114  if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
43115    return V;
43116
43117  if (DCI.isBeforeLegalize() && VT.isVector())
43118    return reduceVMULWidth(N, DAG, Subtarget);
43119
43120  if (!MulConstantOptimization)
43121    return SDValue();
43122  // An imul is usually smaller than the alternative sequence.
43123  if (DAG.getMachineFunction().getFunction().hasMinSize())
43124    return SDValue();
43125
43126  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
43127    return SDValue();
43128
43129  if (VT != MVT::i64 && VT != MVT::i32)
43130    return SDValue();
43131
43132  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
43133  if (!C)
43134    return SDValue();
43135  if (isPowerOf2_64(C->getZExtValue()))
43136    return SDValue();
43137
43138  int64_t SignMulAmt = C->getSExtValue();
43139  assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
43140  uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
43141
43142  SDLoc DL(N);
43143  if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
43144    SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43145                                 DAG.getConstant(AbsMulAmt, DL, VT));
43146    if (SignMulAmt < 0)
43147      NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43148                           NewMul);
43149
43150    return NewMul;
43151  }
43152
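  // Try to factor the multiply amount as (3, 5 or 9) * MulAmt2 so that the
  // first factor maps to a single LEA, e.g. 45 = 9 * 5 (two LEAs) and
  // 40 = 5 * 8 (LEA + SHL).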
43153  uint64_t MulAmt1 = 0;
43154  uint64_t MulAmt2 = 0;
43155  if ((AbsMulAmt % 9) == 0) {
43156    MulAmt1 = 9;
43157    MulAmt2 = AbsMulAmt / 9;
43158  } else if ((AbsMulAmt % 5) == 0) {
43159    MulAmt1 = 5;
43160    MulAmt2 = AbsMulAmt / 5;
43161  } else if ((AbsMulAmt % 3) == 0) {
43162    MulAmt1 = 3;
43163    MulAmt2 = AbsMulAmt / 3;
43164  }
43165
43166  SDValue NewMul;
43167  // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
43168  if (MulAmt2 &&
43169      (isPowerOf2_64(MulAmt2) ||
43170       (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
43171
43172    if (isPowerOf2_64(MulAmt2) &&
43173        !(SignMulAmt >= 0 && N->hasOneUse() &&
43174          N->use_begin()->getOpcode() == ISD::ADD))
43175      // If the second multiplier is a power of 2, issue it first. We want the
43176      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
43177      // the lone use is an add. Only do this for positive multiply amounts
43178      // since the negate would prevent its use as an address mode anyway.
43179      std::swap(MulAmt1, MulAmt2);
43180
43181    if (isPowerOf2_64(MulAmt1))
43182      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43183                           DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
43184    else
43185      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43186                           DAG.getConstant(MulAmt1, DL, VT));
43187
43188    if (isPowerOf2_64(MulAmt2))
43189      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
43190                           DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
43191    else
43192      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
43193                           DAG.getConstant(MulAmt2, DL, VT));
43194
43195    // Negate the result.
43196    if (SignMulAmt < 0)
43197      NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43198                           NewMul);
43199  } else if (!Subtarget.slowLEA())
43200    NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
43201
43202  if (!NewMul) {
43203    assert(C->getZExtValue() != 0 &&
43204           C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
43205           "Both cases that could cause potential overflows should have "
43206           "already been handled.");
43207    if (isPowerOf2_64(AbsMulAmt - 1)) {
43208      // (mul x, 2^N + 1) => (add (shl x, N), x)
43209      NewMul = DAG.getNode(
43210          ISD::ADD, DL, VT, N->getOperand(0),
43211          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43212                      DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
43213                                      MVT::i8)));
43214      // To negate, subtract the number from zero.
43215      if (SignMulAmt < 0)
43216        NewMul = DAG.getNode(ISD::SUB, DL, VT,
43217                             DAG.getConstant(0, DL, VT), NewMul);
43218    } else if (isPowerOf2_64(AbsMulAmt + 1)) {
43219      // (mul x, 2^N - 1) => (sub (shl x, N), x)
43220      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43221                           DAG.getConstant(Log2_64(AbsMulAmt + 1),
43222                                           DL, MVT::i8));
43223      // To negate, reverse the operands of the subtract.
43224      if (SignMulAmt < 0)
43225        NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
43226      else
43227        NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43228    } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
43229      // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
43230      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43231                           DAG.getConstant(Log2_64(AbsMulAmt - 2),
43232                                           DL, MVT::i8));
43233      NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43234      NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43235    } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
43236      // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
43237      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43238                           DAG.getConstant(Log2_64(AbsMulAmt + 2),
43239                                           DL, MVT::i8));
43240      NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43241      NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43242    }
43243  }
43244
43245  return NewMul;
43246}
43247
43248// Try to form a MULHU or MULHS node by looking for
43249// (srl (mul ext, ext), 16)
43250// TODO: This is X86 specific because we want to be able to handle wide types
43251// before type legalization. But we can only do it if the vector will be
43252// legalized via widening/splitting. Type legalization can't handle promotion
43253// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
43254// combiner.
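//
// For example, with zero-extended i16 inputs and a logical shift right:
//   (srl (mul (zext <8 x i16> %a to <8 x i32>),
//             (zext <8 x i16> %b to <8 x i32>)), 16)
//     --> (zext (mulhu <8 x i16> %a, %b) to <8 x i32>)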
43255static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
43256                                   const X86Subtarget &Subtarget) {
43257  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
43258           "SRL or SRA node is required here!");
43259  SDLoc DL(N);
43260
43261  // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
43262  // the multiply.
43263  if (!Subtarget.hasSSE41())
43264    return SDValue();
43265
43266  // The operation feeding into the shift must be a multiply.
43267  SDValue ShiftOperand = N->getOperand(0);
43268  if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
43269    return SDValue();
43270
43271  // Input type should be at least vXi32.
43272  EVT VT = N->getValueType(0);
43273  if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
43274    return SDValue();
43275
43276  // Need a shift by 16.
43277  APInt ShiftAmt;
43278  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
43279      ShiftAmt != 16)
43280    return SDValue();
43281
43282  SDValue LHS = ShiftOperand.getOperand(0);
43283  SDValue RHS = ShiftOperand.getOperand(1);
43284
43285  unsigned ExtOpc = LHS.getOpcode();
43286  if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
43287      RHS.getOpcode() != ExtOpc)
43288    return SDValue();
43289
43290  // Peek through the extends.
43291  LHS = LHS.getOperand(0);
43292  RHS = RHS.getOperand(0);
43293
43294  // Ensure the input types match.
43295  EVT MulVT = LHS.getValueType();
43296  if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
43297    return SDValue();
43298
43299  unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
43300  SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
43301
43302  ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
43303  return DAG.getNode(ExtOpc, DL, VT, Mulh);
43304}
43305
43306static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
43307  SDValue N0 = N->getOperand(0);
43308  SDValue N1 = N->getOperand(1);
43309  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
43310  EVT VT = N0.getValueType();
43311
43312  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
43313  // since the result of setcc_c is all zeros or all ones.
43314  if (VT.isInteger() && !VT.isVector() &&
43315      N1C && N0.getOpcode() == ISD::AND &&
43316      N0.getOperand(1).getOpcode() == ISD::Constant) {
43317    SDValue N00 = N0.getOperand(0);
43318    APInt Mask = N0.getConstantOperandAPInt(1);
43319    Mask <<= N1C->getAPIntValue();
43320    bool MaskOK = false;
43321    // We can handle cases concerning bit-widening nodes containing setcc_c if
43322    // we carefully interrogate the mask to make sure we are semantics
43323    // preserving.
43324    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
43325    // of the underlying setcc_c operation if the setcc_c was zero extended.
43326    // Consider the following example:
43327    //   zext(setcc_c)                 -> i32 0x0000FFFF
43328    //   c1                            -> i32 0x0000FFFF
43329    //   c2                            -> i32 0x00000001
43330    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
43331    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
43332    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
43333      MaskOK = true;
43334    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
43335               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43336      MaskOK = true;
43337    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
43338                N00.getOpcode() == ISD::ANY_EXTEND) &&
43339               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43340      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
43341    }
43342    if (MaskOK && Mask != 0) {
43343      SDLoc DL(N);
43344      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
43345    }
43346  }
43347
43348  // Hardware support for vector shifts is sparse, which makes us scalarize
43349  // the vector operations in many cases. Also, on Sandy Bridge ADD is faster
43350  // than SHL.
43351  // (shl V, 1) -> add V,V
43352  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
43353    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
43354      assert(N0.getValueType().isVector() && "Invalid vector shift type");
43355      // We shift all of the values by one. In many cases we do not have
43356      // hardware support for this operation. This is better expressed as an ADD
43357      // of two values.
43358      if (N1SplatC->isOne())
43359        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
43360    }
43361
43362  return SDValue();
43363}
43364
43365static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
43366                                           const X86Subtarget &Subtarget) {
43367  SDValue N0 = N->getOperand(0);
43368  SDValue N1 = N->getOperand(1);
43369  EVT VT = N0.getValueType();
43370  unsigned Size = VT.getSizeInBits();
43371
43372  if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43373    return V;
43374
43375  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
43376  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
43377  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
43378  // depending on sign of (SarConst - [56,48,32,24,16])
43379
43380  // sexts on X86 are MOVs. The MOVs have the same code size
43381  // as the SHIFTs above (only a SHIFT by 1 has smaller code size).
43382  // However, the MOVs have two advantages over a SHIFT:
43383  // 1. MOVs can write to a register that differs from the source.
43384  // 2. MOVs accept memory operands.
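  //
  // e.g. (sra (shl X, 56), 60) --> (sra (sext_inreg X, i8), 4).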
43385
43386  if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
43387      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
43388      N0.getOperand(1).getOpcode() != ISD::Constant)
43389    return SDValue();
43390
43391  SDValue N00 = N0.getOperand(0);
43392  SDValue N01 = N0.getOperand(1);
43393  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
43394  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
43395  EVT CVT = N1.getValueType();
43396
43397  if (SarConst.isNegative())
43398    return SDValue();
43399
43400  for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
43401    unsigned ShiftSize = SVT.getSizeInBits();
43402    // Skip types without a corresponding sext/zext and ShlConst values
43403    // that are not one of [56,48,32,24,16].
43404    if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
43405      continue;
43406    SDLoc DL(N);
43407    SDValue NN =
43408        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
43409    SarConst = SarConst - (Size - ShiftSize);
43410    if (SarConst == 0)
43411      return NN;
43412    else if (SarConst.isNegative())
43413      return DAG.getNode(ISD::SHL, DL, VT, NN,
43414                         DAG.getConstant(-SarConst, DL, CVT));
43415    else
43416      return DAG.getNode(ISD::SRA, DL, VT, NN,
43417                         DAG.getConstant(SarConst, DL, CVT));
43418  }
43419  return SDValue();
43420}
43421
43422static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
43423                                        TargetLowering::DAGCombinerInfo &DCI,
43424                                        const X86Subtarget &Subtarget) {
43425  SDValue N0 = N->getOperand(0);
43426  SDValue N1 = N->getOperand(1);
43427  EVT VT = N0.getValueType();
43428
43429  if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43430    return V;
43431
43432  // Only do this on the last DAG combine as it can interfere with other
43433  // combines.
43434  if (!DCI.isAfterLegalizeDAG())
43435    return SDValue();
43436
43437  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
43438  // TODO: This is a generic DAG combine that became an x86-only combine to
43439  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
43440  // and-not ('andn').
43441  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
43442    return SDValue();
43443
43444  auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
43445  auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
43446  if (!ShiftC || !AndC)
43447    return SDValue();
43448
43449  // If we can shrink the constant mask below 8 bits or 32 bits, then this
43450  // transform should reduce code size. It may also enable secondary transforms
43451  // from improved known-bits analysis or instruction selection.
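  // e.g. srl (and X, 0x7f0), 4 --> and (srl X, 4), 0x7f (fits an 8-bit imm).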
43452  APInt MaskVal = AndC->getAPIntValue();
43453
43454  // If this can be matched by a zero extend, don't optimize.
43455  if (MaskVal.isMask()) {
43456    unsigned TO = MaskVal.countTrailingOnes();
43457    if (TO >= 8 && isPowerOf2_32(TO))
43458      return SDValue();
43459  }
43460
43461  APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
43462  unsigned OldMaskSize = MaskVal.getMinSignedBits();
43463  unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
43464  if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
43465      (OldMaskSize > 32 && NewMaskSize <= 32)) {
43466    // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
43467    SDLoc DL(N);
43468    SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
43469    SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
43470    return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
43471  }
43472  return SDValue();
43473}
43474
43475static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
43476                                         const X86Subtarget &Subtarget) {
43477  unsigned Opcode = N->getOpcode();
43478  assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
43479
43480  SDLoc DL(N);
43481  EVT VT = N->getValueType(0);
43482  SDValue N0 = N->getOperand(0);
43483  SDValue N1 = N->getOperand(1);
43484  EVT SrcVT = N0.getValueType();
43485
43486  SDValue BC0 =
43487      N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
43488  SDValue BC1 =
43489      N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
43490
43491  // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
43492  // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
43493  // truncation trees that help us avoid lane crossing shuffles.
43494  // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
43495  // TODO: We don't handle vXf64 shuffles yet.
43496  if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
43497      BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43498      BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43499      BC0.getOperand(0) == BC1.getOperand(0) &&
43500      BC0.getOperand(0).getValueType().is256BitVector() &&
43501      BC0.getConstantOperandAPInt(1) == 0 &&
43502      BC1.getConstantOperandAPInt(1) ==
43503          BC0.getValueType().getVectorNumElements()) {
43504    SmallVector<SDValue> ShuffleOps;
43505    SmallVector<int> ShuffleMask, ScaledMask;
43506    SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
43507    if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
43508      resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
43509      // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
43510      // shuffle to a v4x64 width - we can probably relax this in the future.
43511      if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
43512          ShuffleOps[0].getValueType().is256BitVector() &&
43513          scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
43514        SDValue Lo, Hi;
43515        MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43516        std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
43517        Lo = DAG.getBitcast(SrcVT, Lo);
43518        Hi = DAG.getBitcast(SrcVT, Hi);
43519        SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
43520        Res = DAG.getBitcast(ShufVT, Res);
43521        Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
43522        return DAG.getBitcast(VT, Res);
43523      }
43524    }
43525  }
43526
43527  // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
43528  if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
43529    // If either/both ops are a shuffle that can scale to v2x64,
43530    // then see if we can perform this as a v4x32 post shuffle.
43531    SmallVector<SDValue> Ops0, Ops1;
43532    SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
43533    bool IsShuf0 =
43534        getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43535        scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43536        all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43537    bool IsShuf1 =
43538        getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43539        scaleShuffleElements(Mask1, 2, ScaledMask1) &&
43540        all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43541    if (IsShuf0 || IsShuf1) {
43542      if (!IsShuf0) {
43543        Ops0.assign({BC0});
43544        ScaledMask0.assign({0, 1});
43545      }
43546      if (!IsShuf1) {
43547        Ops1.assign({BC1});
43548        ScaledMask1.assign({0, 1});
43549      }
43550
43551      SDValue LHS, RHS;
43552      int PostShuffle[4] = {-1, -1, -1, -1};
43553      auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
43554        if (M < 0)
43555          return true;
43556        Idx = M % 2;
43557        SDValue Src = Ops[M / 2];
43558        if (!LHS || LHS == Src) {
43559          LHS = Src;
43560          return true;
43561        }
43562        if (!RHS || RHS == Src) {
43563          Idx += 2;
43564          RHS = Src;
43565          return true;
43566        }
43567        return false;
43568      };
43569      if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
43570          FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
43571          FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
43572          FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
43573        LHS = DAG.getBitcast(SrcVT, LHS);
43574        RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
43575        MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43576        SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
43577        Res = DAG.getBitcast(ShufVT, Res);
43578        Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
43579        return DAG.getBitcast(VT, Res);
43580      }
43581    }
43582  }
43583
43584  // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
43585  if (VT.is256BitVector() && Subtarget.hasInt256()) {
43586    SmallVector<int> Mask0, Mask1;
43587    SmallVector<SDValue> Ops0, Ops1;
43588    SmallVector<int, 2> ScaledMask0, ScaledMask1;
43589    if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43590        getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43591        !Ops0.empty() && !Ops1.empty() &&
43592        all_of(Ops0,
43593               [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43594        all_of(Ops1,
43595               [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43596        scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43597        scaleShuffleElements(Mask1, 2, ScaledMask1)) {
43598      SDValue Op00 = peekThroughBitcasts(Ops0.front());
43599      SDValue Op10 = peekThroughBitcasts(Ops1.front());
43600      SDValue Op01 = peekThroughBitcasts(Ops0.back());
43601      SDValue Op11 = peekThroughBitcasts(Ops1.back());
43602      if ((Op00 == Op11) && (Op01 == Op10)) {
43603        std::swap(Op10, Op11);
43604        ShuffleVectorSDNode::commuteMask(ScaledMask1);
43605      }
43606      if ((Op00 == Op10) && (Op01 == Op11)) {
43607        SmallVector<int, 4> ShuffleMask;
43608        ShuffleMask.append(ScaledMask0.begin(), ScaledMask0.end());
43609        ShuffleMask.append(ScaledMask1.begin(), ScaledMask1.end());
43610        MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
43611        SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
43612                                  DAG.getBitcast(SrcVT, Op01));
43613        Res = DAG.getBitcast(ShufVT, Res);
43614        Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
43615        return DAG.getBitcast(VT, Res);
43616      }
43617    }
43618  }
43619
43620  return SDValue();
43621}
43622
43623static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
43624                                 TargetLowering::DAGCombinerInfo &DCI,
43625                                 const X86Subtarget &Subtarget) {
43626  unsigned Opcode = N->getOpcode();
43627  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
43628         "Unexpected pack opcode");
43629
43630  EVT VT = N->getValueType(0);
43631  SDValue N0 = N->getOperand(0);
43632  SDValue N1 = N->getOperand(1);
43633  unsigned NumDstElts = VT.getVectorNumElements();
43634  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
43635  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
43636  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
43637         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
43638         "Unexpected PACKSS/PACKUS input type");
43639
43640  bool IsSigned = (X86ISD::PACKSS == Opcode);
43641
43642  // Constant Folding.
43643  APInt UndefElts0, UndefElts1;
43644  SmallVector<APInt, 32> EltBits0, EltBits1;
43645  if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
43646      (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
43647      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
43648      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
43649    unsigned NumLanes = VT.getSizeInBits() / 128;
43650    unsigned NumSrcElts = NumDstElts / 2;
43651    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
43652    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
43653
43654    APInt Undefs(NumDstElts, 0);
43655    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
43656    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
43657      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
43658        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
43659        auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
43660        auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
43661
43662        if (UndefElts[SrcIdx]) {
43663          Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
43664          continue;
43665        }
43666
43667        APInt &Val = EltBits[SrcIdx];
43668        if (IsSigned) {
43669          // PACKSS: Truncate signed value with signed saturation.
43670          // Source values less than dst minint are saturated to minint.
43671          // Source values greater than dst maxint are saturated to maxint.
43672          if (Val.isSignedIntN(DstBitsPerElt))
43673            Val = Val.trunc(DstBitsPerElt);
43674          else if (Val.isNegative())
43675            Val = APInt::getSignedMinValue(DstBitsPerElt);
43676          else
43677            Val = APInt::getSignedMaxValue(DstBitsPerElt);
43678        } else {
43679          // PACKUS: Truncate signed value with unsigned saturation.
43680          // Source values less than zero are saturated to zero.
43681          // Source values greater than dst maxuint are saturated to maxuint.
43682          if (Val.isIntN(DstBitsPerElt))
43683            Val = Val.trunc(DstBitsPerElt);
43684          else if (Val.isNegative())
43685            Val = APInt::getNullValue(DstBitsPerElt);
43686          else
43687            Val = APInt::getAllOnesValue(DstBitsPerElt);
43688        }
43689        Bits[Lane * NumDstEltsPerLane + Elt] = Val;
43690      }
43691    }
43692
43693    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
43694  }
43695
43696  // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
43697  if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
43698    return V;
43699
43700  // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
43701  // truncate to create a larger truncate.
43702  if (Subtarget.hasAVX512() &&
43703      N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
43704      N0.getOperand(0).getValueType() == MVT::v8i32) {
43705    if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
43706        (!IsSigned &&
43707         DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
43708      if (Subtarget.hasVLX())
43709        return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
43710
43711      // Widen input to v16i32 so we can truncate that.
43712      SDLoc dl(N);
43713      SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
43714                                   N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
43715      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
43716    }
43717  }
43718
43719  // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
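  // e.g. PACKUSWB(zext(v8i8 X), zext(v8i8 Y)) --> CONCAT(X, Y) as v16i8.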
43720  if (VT.is128BitVector()) {
43721    unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
43722    SDValue Src0, Src1;
43723    if (N0.getOpcode() == ExtOpc &&
43724        N0.getOperand(0).getValueType().is64BitVector() &&
43725        N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
43726      Src0 = N0.getOperand(0);
43727    }
43728    if (N1.getOpcode() == ExtOpc &&
43729        N1.getOperand(0).getValueType().is64BitVector() &&
43730        N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
43731      Src1 = N1.getOperand(0);
43732    }
43733    if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
43734      assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
43735      Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
43736      Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
43737      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
43738    }
43739  }
43740
43741  // Attempt to combine as shuffle.
43742  SDValue Op(N, 0);
43743  if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43744    return Res;
43745
43746  return SDValue();
43747}
43748
43749static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
43750                                    TargetLowering::DAGCombinerInfo &DCI,
43751                                    const X86Subtarget &Subtarget) {
43752  assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
43753          X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
43754         "Unexpected horizontal add/sub opcode");
43755
43756  if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
43757    // For slow-hop targets, if we have a HOP with repeated operands, see if
43758    // another HOP user of that operand exists whose result we can shuffle.
43759    MVT VT = N->getSimpleValueType(0);
43760    SDValue LHS = N->getOperand(0);
43761    SDValue RHS = N->getOperand(1);
43762    if (VT.is128BitVector() && LHS == RHS) {
43763      for (SDNode *User : LHS->uses()) {
43764        if (User != N && User->getOpcode() == N->getOpcode()) {
43765          MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43766          if (User->getOperand(0) == LHS && !User->getOperand(1).isUndef()) {
43767            return DAG.getBitcast(
43768                VT,
43769                DAG.getVectorShuffle(ShufVT, SDLoc(N),
43770                                     DAG.getBitcast(ShufVT, SDValue(User, 0)),
43771                                     DAG.getUNDEF(ShufVT), {0, 1, 0, 1}));
43772          }
43773          if (User->getOperand(1) == LHS && !User->getOperand(0).isUndef()) {
43774            return DAG.getBitcast(
43775                VT,
43776                DAG.getVectorShuffle(ShufVT, SDLoc(N),
43777                                     DAG.getBitcast(ShufVT, SDValue(User, 0)),
43778                                     DAG.getUNDEF(ShufVT), {2, 3, 2, 3}));
43779          }
43780        }
43781      }
43782    }
43783
43784    // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
43785    if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
43786        LHS.getOpcode() == RHS.getOpcode() &&
43787        LHS.getValueType() == RHS.getValueType()) {
43788      SDValue LHS0 = LHS.getOperand(0);
43789      SDValue RHS0 = LHS.getOperand(1);
43790      SDValue LHS1 = RHS.getOperand(0);
43791      SDValue RHS1 = RHS.getOperand(1);
43792      if ((LHS0 == RHS0 || LHS0.isUndef() || RHS0.isUndef()) &&
43793          (LHS1 == RHS1 || LHS1.isUndef() || RHS1.isUndef())) {
43794        SDLoc DL(N);
43795        SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
43796                                  LHS0.isUndef() ? RHS0 : LHS0,
43797                                  LHS1.isUndef() ? RHS1 : LHS1);
43798        MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
43799        Res = DAG.getBitcast(ShufVT, Res);
43800        SDValue NewLHS =
43801            DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
43802                        getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
43803        SDValue NewRHS =
43804            DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
43805                        getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
43806        DAG.ReplaceAllUsesOfValueWith(LHS, DAG.getBitcast(VT, NewLHS));
43807        DAG.ReplaceAllUsesOfValueWith(RHS, DAG.getBitcast(VT, NewRHS));
43808        return SDValue(N, 0);
43809      }
43810    }
43811  }
43812
43813  // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
43814  if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
43815    return V;
43816
43817  return SDValue();
43818}
43819
43820static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
43821                                     TargetLowering::DAGCombinerInfo &DCI,
43822                                     const X86Subtarget &Subtarget) {
43823  assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
43824          X86ISD::VSRL == N->getOpcode()) &&
43825         "Unexpected shift opcode");
43826  EVT VT = N->getValueType(0);
43827  SDValue N0 = N->getOperand(0);
43828  SDValue N1 = N->getOperand(1);
43829
43830  // Shift zero -> zero.
43831  if (ISD::isBuildVectorAllZeros(N0.getNode()))
43832    return DAG.getConstant(0, SDLoc(N), VT);
43833
43834  // Detect constant shift amounts.
43835  APInt UndefElts;
43836  SmallVector<APInt, 32> EltBits;
43837  if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
43838    unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
43839    return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
43840                                      EltBits[0].getZExtValue(), DAG);
43841  }
43842
43843  APInt KnownUndef, KnownZero;
43844  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43845  APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
43846  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
43847                                     KnownZero, DCI))
43848    return SDValue(N, 0);
43849
43850  return SDValue();
43851}
43852
43853static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
43854                                     TargetLowering::DAGCombinerInfo &DCI,
43855                                     const X86Subtarget &Subtarget) {
43856  unsigned Opcode = N->getOpcode();
43857  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
43858          X86ISD::VSRLI == Opcode) &&
43859         "Unexpected shift opcode");
43860  bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
43861  EVT VT = N->getValueType(0);
43862  SDValue N0 = N->getOperand(0);
43863  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
43864  assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
43865         "Unexpected value type");
43866  assert(N->getOperand(1).getValueType() == MVT::i8 &&
43867         "Unexpected shift amount type");
43868
43869  // Out of range logical bit shifts are guaranteed to be zero.
43870  // Out of range arithmetic bit shifts splat the sign bit.
43871  unsigned ShiftVal = N->getConstantOperandVal(1);
43872  if (ShiftVal >= NumBitsPerElt) {
43873    if (LogicalShift)
43874      return DAG.getConstant(0, SDLoc(N), VT);
43875    ShiftVal = NumBitsPerElt - 1;
43876  }
43877
43878  // (shift X, 0) -> X
43879  if (!ShiftVal)
43880    return N0;
43881
43882  // (shift 0, C) -> 0
43883  if (ISD::isBuildVectorAllZeros(N0.getNode()))
43884    // N0 is all zeros or undef. We guarantee that the bits shifted into the
43885    // result are all zeros, not undef.
43886    return DAG.getConstant(0, SDLoc(N), VT);
43887
43888  // (VSRAI -1, C) -> -1
43889  if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
43890    // N0 is all ones or undef. We guarantee that the bits shifted into the
43891    // result are all ones, not undef.
43892    return DAG.getConstant(-1, SDLoc(N), VT);
43893
43894  // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
43895  if (Opcode == N0.getOpcode()) {
43896    unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
43897    unsigned NewShiftVal = ShiftVal + ShiftVal2;
43898    if (NewShiftVal >= NumBitsPerElt) {
43899      // Out of range logical bit shifts are guaranteed to be zero.
43900      // Out of range arithmetic bit shifts splat the sign bit.
43901      if (LogicalShift)
43902        return DAG.getConstant(0, SDLoc(N), VT);
43903      NewShiftVal = NumBitsPerElt - 1;
43904    }
43905    return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
43906                       DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
43907  }
43908
43909  // We can decode 'whole byte' logical bit shifts as shuffles.
43910  if (LogicalShift && (ShiftVal % 8) == 0) {
43911    SDValue Op(N, 0);
43912    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43913      return Res;
43914  }
43915
43916  // Constant Folding.
43917  APInt UndefElts;
43918  SmallVector<APInt, 32> EltBits;
43919  if (N->isOnlyUserOf(N0.getNode()) &&
43920      getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
43921    assert(EltBits.size() == VT.getVectorNumElements() &&
43922           "Unexpected shift value type");
    // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
    // created an undef input due to no input bits being demanded, but the
    // user still expects 0 in the other bits.
43926    for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
43927      APInt &Elt = EltBits[i];
43928      if (UndefElts[i])
43929        Elt = 0;
43930      else if (X86ISD::VSHLI == Opcode)
43931        Elt <<= ShiftVal;
43932      else if (X86ISD::VSRAI == Opcode)
43933        Elt.ashrInPlace(ShiftVal);
43934      else
43935        Elt.lshrInPlace(ShiftVal);
43936    }
43937    // Reset undef elements since they were zeroed above.
43938    UndefElts = 0;
43939    return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
43940  }
43941
43942  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43943  if (TLI.SimplifyDemandedBits(SDValue(N, 0),
43944                               APInt::getAllOnesValue(NumBitsPerElt), DCI))
43945    return SDValue(N, 0);
43946
43947  return SDValue();
43948}
43949
43950static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
43951                                   TargetLowering::DAGCombinerInfo &DCI,
43952                                   const X86Subtarget &Subtarget) {
43953  EVT VT = N->getValueType(0);
43954  assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
43955          (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
43956          N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
43957         "Unexpected vector insertion");
43958
43959  if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
43960    unsigned NumBitsPerElt = VT.getScalarSizeInBits();
43961    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43962    if (TLI.SimplifyDemandedBits(SDValue(N, 0),
43963                                 APInt::getAllOnesValue(NumBitsPerElt), DCI))
43964      return SDValue(N, 0);
43965  }
43966
43967  // Attempt to combine insertion patterns to a shuffle.
43968  if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
43969    SDValue Op(N, 0);
43970    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43971      return Res;
43972  }
43973
43974  return SDValue();
43975}
43976
43977/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
43978/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
43979/// OR -> CMPNEQSS.
43980static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
43981                                   TargetLowering::DAGCombinerInfo &DCI,
43982                                   const X86Subtarget &Subtarget) {
43983  unsigned opcode;
43984
43985  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
43986  // we're requiring SSE2 for both.
43987  if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
43988    SDValue N0 = N->getOperand(0);
43989    SDValue N1 = N->getOperand(1);
43990    SDValue CMP0 = N0.getOperand(1);
43991    SDValue CMP1 = N1.getOperand(1);
43992    SDLoc DL(N);
43993
43994    // The SETCCs should both refer to the same CMP.
43995    if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
43996      return SDValue();
43997
43998    SDValue CMP00 = CMP0->getOperand(0);
43999    SDValue CMP01 = CMP0->getOperand(1);
44000    EVT     VT    = CMP00.getValueType();
44001
44002    if (VT == MVT::f32 || VT == MVT::f64) {
44003      bool ExpectingFlags = false;
44004      // Check for any users that want flags:
44005      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
44006           !ExpectingFlags && UI != UE; ++UI)
44007        switch (UI->getOpcode()) {
44008        default:
44009        case ISD::BR_CC:
44010        case ISD::BRCOND:
44011        case ISD::SELECT:
44012          ExpectingFlags = true;
44013          break;
44014        case ISD::CopyToReg:
44015        case ISD::SIGN_EXTEND:
44016        case ISD::ZERO_EXTEND:
44017        case ISD::ANY_EXTEND:
44018          break;
44019        }
44020
44021      if (!ExpectingFlags) {
44022        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
44023        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
44024
44025        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
44026          X86::CondCode tmp = cc0;
44027          cc0 = cc1;
44028          cc1 = tmp;
44029        }
44030
44031        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
44032            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
44033          // FIXME: need symbolic constants for these magic numbers.
44034          // See X86ATTInstPrinter.cpp:printSSECC().
44035          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
44036          if (Subtarget.hasAVX512()) {
44037            SDValue FSetCC =
44038                DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
44039                            DAG.getTargetConstant(x86cc, DL, MVT::i8));
44040            // Need to fill with zeros to ensure the bitcast will produce zeroes
44041            // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
44042            SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
44043                                      DAG.getConstant(0, DL, MVT::v16i1),
44044                                      FSetCC, DAG.getIntPtrConstant(0, DL));
44045            return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
44046                                      N->getSimpleValueType(0));
44047          }
44048          SDValue OnesOrZeroesF =
44049              DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
44050                          CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
44051
44052          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
44053          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
44054
44055          if (is64BitFP && !Subtarget.is64Bit()) {
44056            // On a 32-bit target, we cannot bitcast the 64-bit float to a
44057            // 64-bit integer, since that's not a legal type. Since
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
44059            // bits, but can do this little dance to extract the lowest 32 bits
44060            // and work with those going forward.
44061            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
44062                                           OnesOrZeroesF);
44063            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
44064            OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
44065                                        Vector32, DAG.getIntPtrConstant(0, DL));
44066            IntVT = MVT::i32;
44067          }
44068
44069          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
44070          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
44071                                      DAG.getConstant(1, DL, IntVT));
44072          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
44073                                              ANDed);
44074          return OneBitOfTruth;
44075        }
44076      }
44077    }
44078  }
44079  return SDValue();
44080}
44081
44082/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
44083static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
44084  assert(N->getOpcode() == ISD::AND);
44085
44086  MVT VT = N->getSimpleValueType(0);
44087  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
44088    return SDValue();
44089
44090  SDValue X, Y;
44091  SDValue N0 = N->getOperand(0);
44092  SDValue N1 = N->getOperand(1);
44093
44094  auto GetNot = [&VT, &DAG](SDValue V) {
44095    // Basic X = NOT(Y) detection.
44096    if (SDValue Not = IsNOT(V, DAG))
44097      return Not;
44098    // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
44099    if (V.getOpcode() == X86ISD::VBROADCAST) {
44100      SDValue Src = V.getOperand(0);
44101      EVT SrcVT = Src.getValueType();
44102      if (!SrcVT.isVector())
44103        return SDValue();
44104      if (SDValue Not = IsNOT(Src, DAG))
44105        return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
44106                           DAG.getBitcast(SrcVT, Not));
44107    }
44108    return SDValue();
44109  };
44110
44111  if (SDValue Not = GetNot(N0)) {
44112    X = Not;
44113    Y = N1;
44114  } else if (SDValue Not = GetNot(N1)) {
44115    X = Not;
44116    Y = N0;
44117  } else
44118    return SDValue();
44119
44120  X = DAG.getBitcast(VT, X);
44121  Y = DAG.getBitcast(VT, Y);
44122  return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
44123}
44124
44125// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
44126// logical operations, like in the example below.
44127//   or (and (truncate x, truncate y)),
44128//      (xor (truncate z, build_vector (constants)))
44129// Given a target type \p VT, we generate
44130//   or (and x, y), (xor z, zext(build_vector (constants)))
// provided x, y and z are of type \p VT. We can do so if each operand is
// either a truncate from a VT-typed value, can be recursively promoted, or,
// for the second operand only, is a vector of constants.
44134static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
44135                                     unsigned Depth) {
44136  // Limit recursion to avoid excessive compile times.
44137  if (Depth >= SelectionDAG::MaxRecursionDepth)
44138    return SDValue();
44139
44140  if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
44141      N->getOpcode() != ISD::OR)
44142    return SDValue();
44143
44144  SDValue N0 = N->getOperand(0);
44145  SDValue N1 = N->getOperand(1);
44146  SDLoc DL(N);
44147
44148  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44149  if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
44150    return SDValue();
44151
44152  if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
44153    N0 = NN0;
44154  else {
    // The left side has to be a trunc.
44156    if (N0.getOpcode() != ISD::TRUNCATE)
44157      return SDValue();
44158
44159    // The type of the truncated inputs.
44160    if (N0.getOperand(0).getValueType() != VT)
44161      return SDValue();
44162
44163    N0 = N0.getOperand(0);
44164  }
44165
44166  if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
44167    N1 = NN1;
44168  else {
44169    // The right side has to be a 'trunc' or a constant vector.
44170    bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
44171                    N1.getOperand(0).getValueType() == VT;
44172    if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
44173      return SDValue();
44174
44175    if (RHSTrunc)
44176      N1 = N1.getOperand(0);
44177    else
44178      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
44179  }
44180
44181  return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
44182}
44183
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
44185// register. In most cases we actually compare or select YMM-sized registers
44186// and mixing the two types creates horrible code. This method optimizes
44187// some of the transition sequences.
44188// Even with AVX-512 this is still useful for removing casts around logical
44189// operations on vXi1 mask types.
44190static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44191                                     const X86Subtarget &Subtarget) {
44192  EVT VT = N->getValueType(0);
44193  assert(VT.isVector() && "Expected vector type");
44194
44195  SDLoc DL(N);
44196  assert((N->getOpcode() == ISD::ANY_EXTEND ||
44197          N->getOpcode() == ISD::ZERO_EXTEND ||
44198          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
44199
44200  SDValue Narrow = N->getOperand(0);
44201  EVT NarrowVT = Narrow.getValueType();
44202
44203  // Generate the wide operation.
44204  SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
44205  if (!Op)
44206    return SDValue();
44207  switch (N->getOpcode()) {
44208  default: llvm_unreachable("Unexpected opcode");
44209  case ISD::ANY_EXTEND:
44210    return Op;
44211  case ISD::ZERO_EXTEND:
44212    return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
44213  case ISD::SIGN_EXTEND:
44214    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
44215                       Op, DAG.getValueType(NarrowVT));
44216  }
44217}
44218
44219static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
44220  unsigned FPOpcode;
44221  switch (Opcode) {
44222  default: llvm_unreachable("Unexpected input node for FP logic conversion");
44223  case ISD::AND: FPOpcode = X86ISD::FAND; break;
44224  case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
44225  case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44226  }
44227  return FPOpcode;
44228}
44229
44230/// If both input operands of a logic op are being cast from floating point
44231/// types, try to convert this into a floating point logic node to avoid
44232/// unnecessary moves from SSE to integer registers.
44233static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
44234                                        const X86Subtarget &Subtarget) {
44235  EVT VT = N->getValueType(0);
44236  SDValue N0 = N->getOperand(0);
44237  SDValue N1 = N->getOperand(1);
44238  SDLoc DL(N);
44239
44240  if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
44241    return SDValue();
44242
44243  SDValue N00 = N0.getOperand(0);
44244  SDValue N10 = N1.getOperand(0);
44245  EVT N00Type = N00.getValueType();
44246  EVT N10Type = N10.getValueType();
44247
44248  // Ensure that both types are the same and are legal scalar fp types.
44249  if (N00Type != N10Type ||
44250      !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
44251        (Subtarget.hasSSE2() && N00Type == MVT::f64)))
44252    return SDValue();
44253
44254  unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
44255  SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
44256  return DAG.getBitcast(VT, FPLogic);
44257}
44258
44259// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
44260// to reduce XMM->GPR traffic.
44261static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
44262  unsigned Opc = N->getOpcode();
44263  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
44264         "Unexpected bit opcode");
44265
44266  SDValue N0 = N->getOperand(0);
44267  SDValue N1 = N->getOperand(1);
44268
44269  // Both operands must be single use MOVMSK.
44270  if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
44271      N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
44272    return SDValue();
44273
44274  SDValue Vec0 = N0.getOperand(0);
44275  SDValue Vec1 = N1.getOperand(0);
44276  EVT VecVT0 = Vec0.getValueType();
44277  EVT VecVT1 = Vec1.getValueType();
44278
  // Both MOVMSK operands must be from vectors of the same size and same
  // element size, but it's OK for there to be an fp/int type difference.
44281  if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
44282      VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
44283    return SDValue();
44284
44285  SDLoc DL(N);
44286  unsigned VecOpc =
44287      VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
44288  SDValue Result =
44289      DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
44290  return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
44291}
44292
/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
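/// For example, with v4i32 elements known to be all-zeros or all-ones and a
/// splat mask of 1, (and X, 1) is equivalent to (srl X, 31): both yield 1 for
/// an all-ones element and 0 for a zero element.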
44296static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
44297                                     const X86Subtarget &Subtarget) {
44298  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
44299  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
44300  EVT VT0 = Op0.getValueType();
44301  EVT VT1 = Op1.getValueType();
44302
44303  if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
44304    return SDValue();
44305
44306  APInt SplatVal;
44307  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
44308      !SplatVal.isMask())
44309    return SDValue();
44310
44311  // Don't prevent creation of ANDN.
44312  if (isBitwiseNot(Op0))
44313    return SDValue();
44314
44315  if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
44316    return SDValue();
44317
44318  unsigned EltBitWidth = VT0.getScalarSizeInBits();
44319  if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
44320    return SDValue();
44321
44322  SDLoc DL(N);
44323  unsigned ShiftVal = SplatVal.countTrailingOnes();
44324  SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
44325  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
44326  return DAG.getBitcast(N->getValueType(0), Shift);
44327}
44328
44329// Get the index node from the lowered DAG of a GEP IR instruction with one
44330// indexing dimension.
44331static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
44332  if (Ld->isIndexed())
44333    return SDValue();
44334
44335  SDValue Base = Ld->getBasePtr();
44336
44337  if (Base.getOpcode() != ISD::ADD)
44338    return SDValue();
44339
44340  SDValue ShiftedIndex = Base.getOperand(0);
44341
44342  if (ShiftedIndex.getOpcode() != ISD::SHL)
44343    return SDValue();
44344
44345  return ShiftedIndex.getOperand(0);
}
44348
44349static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
44350  if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
44351    switch (VT.getSizeInBits()) {
44352    default: return false;
    case 64: return Subtarget.is64Bit();
44354    case 32: return true;
44355    }
44356  }
44357  return false;
44358}
44359
// This function recognizes cases where the X86 bzhi instruction can replace an
// 'and-load' sequence.
// When an integer value is loaded from an array of constants defined as
// follows:
//
//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// and the result is then and-ed with another input, the whole sequence is
// equivalent to performing bzhi (zero high bits) on that input, using the
// same index as the load.
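// For example, for a 32-bit element type and an in-range index idx:
//   array[idx] == (1u << idx) - 1
// so (array[idx] & x) keeps only the low idx bits of x, which is exactly
// bzhi(x, idx).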
44370static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
44371                                    const X86Subtarget &Subtarget) {
44372  MVT VT = Node->getSimpleValueType(0);
44373  SDLoc dl(Node);
44374
44375  // Check if subtarget has BZHI instruction for the node's type
44376  if (!hasBZHI(Subtarget, VT))
44377    return SDValue();
44378
44379  // Try matching the pattern for both operands.
44380  for (unsigned i = 0; i < 2; i++) {
44381    SDValue N = Node->getOperand(i);
44382    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
44383
    // Bail out if the operand is not a load instruction.
44385    if (!Ld)
44386      return SDValue();
44387
44388    const Value *MemOp = Ld->getMemOperand()->getValue();
44389
44390    if (!MemOp)
44391      return SDValue();
44392
44393    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
44394      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
44395        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
44396
44397          Constant *Init = GV->getInitializer();
44398          Type *Ty = Init->getType();
44399          if (!isa<ConstantDataArray>(Init) ||
44400              !Ty->getArrayElementType()->isIntegerTy() ||
44401              Ty->getArrayElementType()->getScalarSizeInBits() !=
44402                  VT.getSizeInBits() ||
44403              Ty->getArrayNumElements() >
44404                  Ty->getArrayElementType()->getScalarSizeInBits())
44405            continue;
44406
44407          // Check if the array's constant elements are suitable to our case.
44408          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
44409          bool ConstantsMatch = true;
44410          for (uint64_t j = 0; j < ArrayElementCount; j++) {
44411            auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
44412            if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
44413              ConstantsMatch = false;
44414              break;
44415            }
44416          }
44417          if (!ConstantsMatch)
44418            continue;
44419
          // Do the transformation (for a 32-bit type):
          //   (and (load arr[idx]), inp)
          // becomes
          //   (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
          // which will then be selected as a single bzhi instruction.
44424          SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
44425          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
44426
44427          // Get the Node which indexes into the array.
44428          SDValue Index = getIndexFromUnindexedLoad(Ld);
44429          if (!Index)
44430            return SDValue();
44431          Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
44432
44433          SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
44434          Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
44435
44436          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
44437          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
44438
44439          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
44440        }
44441      }
44442    }
44443  }
44444  return SDValue();
44445}
44446
44447// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
// where C is a mask containing the same number of bits as the setcc and
// where the setcc will freely zero the upper bits of the k-register. We can
// replace the undefs in the concat with 0s and remove the AND. This mainly
// helps with v2i1/v4i1 setccs being cast to scalar.
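// E.g. for (and (bitcast (v16i1 (concat_vectors (v4i1 setcc), undef, undef,
// undef)) to i16), 0xF), the AND only keeps the four setcc bits, so the undef
// subvectors can be rebuilt as zeros and the AND removed.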
44452static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
44453                                             const X86Subtarget &Subtarget) {
44454  assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
44455
44456  EVT VT = N->getValueType(0);
44457
  // Make sure this is an AND with a constant. We will check the value of the
44459  // constant later.
44460  if (!isa<ConstantSDNode>(N->getOperand(1)))
44461    return SDValue();
44462
44463  // This is implied by the ConstantSDNode.
44464  assert(!VT.isVector() && "Expected scalar VT!");
44465
44466  if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
44467      !N->getOperand(0).hasOneUse() ||
44468      !N->getOperand(0).getOperand(0).hasOneUse())
44469    return SDValue();
44470
44471  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44472  SDValue Src = N->getOperand(0).getOperand(0);
44473  EVT SrcVT = Src.getValueType();
44474  if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
44475      !TLI.isTypeLegal(SrcVT))
44476    return SDValue();
44477
44478  if (Src.getOpcode() != ISD::CONCAT_VECTORS)
44479    return SDValue();
44480
44481  // We only care about the first subvector of the concat, we expect the
44482  // other subvectors to be ignored due to the AND if we make the change.
44483  SDValue SubVec = Src.getOperand(0);
44484  EVT SubVecVT = SubVec.getValueType();
44485
44486  // First subvector should be a setcc with a legal result type. The RHS of the
44487  // AND should be a mask with this many bits.
44488  if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
44489      !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
44490    return SDValue();
44491
44492  EVT SetccVT = SubVec.getOperand(0).getValueType();
44493  if (!TLI.isTypeLegal(SetccVT) ||
44494      !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
44495    return SDValue();
44496
44497  if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
44498    return SDValue();
44499
44500  // We passed all the checks. Rebuild the concat_vectors with zeroes
44501  // and cast it back to VT.
44502  SDLoc dl(N);
44503  SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
44504                              DAG.getConstant(0, dl, SubVecVT));
44505  Ops[0] = SubVec;
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, Ops);
44508  return DAG.getBitcast(VT, Concat);
44509}
44510
44511static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
44512                          TargetLowering::DAGCombinerInfo &DCI,
44513                          const X86Subtarget &Subtarget) {
44514  EVT VT = N->getValueType(0);
44515
44516  // If this is SSE1 only convert to FAND to avoid scalarization.
44517  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
44518    return DAG.getBitcast(
44519        MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
44520                                DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
44521                                DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
44522  }
44523
44524  // Use a 32-bit and+zext if upper bits known zero.
44525  if (VT == MVT::i64 && Subtarget.is64Bit() &&
44526      !isa<ConstantSDNode>(N->getOperand(1))) {
44527    APInt HiMask = APInt::getHighBitsSet(64, 32);
44528    if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
44529        DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
44530      SDLoc dl(N);
44531      SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
44532      SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
44533      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
44534                         DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
44535    }
44536  }
44537
44538  // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
44539  // TODO: Support multiple SrcOps.
44540  if (VT == MVT::i1) {
44541    SmallVector<SDValue, 2> SrcOps;
44542    SmallVector<APInt, 2> SrcPartials;
44543    if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
44544        SrcOps.size() == 1) {
44545      SDLoc dl(N);
44546      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44547      unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
44548      EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44549      SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
44550      if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
44551        Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
44552      if (Mask) {
44553        assert(SrcPartials[0].getBitWidth() == NumElts &&
44554               "Unexpected partial reduction mask");
44555        SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
44556        Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
44557        return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
44558      }
44559    }
44560  }
44561
44562  if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
44563    return V;
44564
44565  if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
44566    return R;
44567
44568  if (DCI.isBeforeLegalizeOps())
44569    return SDValue();
44570
44571  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
44572    return R;
44573
44574  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
44575    return FPLogic;
44576
44577  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
44578    return R;
44579
44580  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
44581    return ShiftRight;
44582
44583  if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
44584    return R;
44585
44586  // Attempt to recursively combine a bitmask AND with shuffles.
44587  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
44588    SDValue Op(N, 0);
44589    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44590      return Res;
44591  }
44592
44593  // Attempt to combine a scalar bitmask AND with an extracted shuffle.
44594  if ((VT.getScalarSizeInBits() % 8) == 0 &&
44595      N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44596      isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
44597    SDValue BitMask = N->getOperand(1);
44598    SDValue SrcVec = N->getOperand(0).getOperand(0);
44599    EVT SrcVecVT = SrcVec.getValueType();
44600
44601    // Check that the constant bitmask masks whole bytes.
44602    APInt UndefElts;
44603    SmallVector<APInt, 64> EltBits;
44604    if (VT == SrcVecVT.getScalarType() &&
44605        N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
44606        getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
44607        llvm::all_of(EltBits, [](const APInt &M) {
44608          return M.isNullValue() || M.isAllOnesValue();
44609        })) {
44610      unsigned NumElts = SrcVecVT.getVectorNumElements();
44611      unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
44612      unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
44613
44614      // Create a root shuffle mask from the byte mask and the extracted index.
44615      SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
44616      for (unsigned i = 0; i != Scale; ++i) {
44617        if (UndefElts[i])
44618          continue;
44619        int VecIdx = Scale * Idx + i;
44620        ShuffleMask[VecIdx] =
44621            EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
44622      }
44623
44624      if (SDValue Shuffle = combineX86ShufflesRecursively(
44625              {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
44626              X86::MaxShuffleCombineDepth,
44627              /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
44628        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
44629                           N->getOperand(0).getOperand(1));
44630    }
44631  }
44632
44633  return SDValue();
44634}
44635
44636// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
44637static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
44638                                     const X86Subtarget &Subtarget) {
44639  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
44640
44641  MVT VT = N->getSimpleValueType(0);
44642  if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
44643    return SDValue();
44644
44645  SDValue N0 = peekThroughBitcasts(N->getOperand(0));
44646  SDValue N1 = peekThroughBitcasts(N->getOperand(1));
44647  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
44648    return SDValue();
44649
44650  // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
44651  // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
44652  bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
44653                      Subtarget.hasVLX();
44654  if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
44655        !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
44656    return SDValue();
44657
44658  // Attempt to extract constant byte masks.
44659  APInt UndefElts0, UndefElts1;
44660  SmallVector<APInt, 32> EltBits0, EltBits1;
44661  if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
44662                                     false, false))
44663    return SDValue();
44664  if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
44665                                     false, false))
44666    return SDValue();
44667
44668  for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
44669    // TODO - add UNDEF elts support.
44670    if (UndefElts0[i] || UndefElts1[i])
44671      return SDValue();
44672    if (EltBits0[i] != ~EltBits1[i])
44673      return SDValue();
44674  }
44675
44676  SDLoc DL(N);
44677
44678  if (UseVPTERNLOG) {
44679    // Emit a VPTERNLOG node directly.
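    // The 0xCA immediate encodes the ternary function A ? B : C, i.e.
    // (A & B) | (~A & C); with A as the mask this selects B where the mask
    // bits are set and C where they are clear, matching the OR(AND,AND)
    // pattern above.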
44680    SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
44681    SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
44682    SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
44683    SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
44684    return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
44685  }
44686
44687  SDValue X = N->getOperand(0);
44688  SDValue Y =
44689      DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
44690                  DAG.getBitcast(VT, N1.getOperand(0)));
44691  return DAG.getNode(ISD::OR, DL, VT, X, Y);
44692}
44693
44694// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
44695static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
44696  if (N->getOpcode() != ISD::OR)
44697    return false;
44698
44699  SDValue N0 = N->getOperand(0);
44700  SDValue N1 = N->getOperand(1);
44701
44702  // Canonicalize AND to LHS.
44703  if (N1.getOpcode() == ISD::AND)
44704    std::swap(N0, N1);
44705
44706  // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
44707  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
44708    return false;
44709
44710  Mask = N1.getOperand(0);
44711  X = N1.getOperand(1);
44712
44713  // Check to see if the mask appeared in both the AND and ANDNP.
44714  if (N0.getOperand(0) == Mask)
44715    Y = N0.getOperand(1);
44716  else if (N0.getOperand(1) == Mask)
44717    Y = N0.getOperand(0);
44718  else
44719    return false;
44720
  // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
  // ANDNP combine allows other combines to happen that prevent matching.
44723  return true;
44724}
44725
44726// Try to fold:
44727//   (or (and (m, y), (pandn m, x)))
44728// into:
44729//   (vselect m, x, y)
44730// As a special case, try to fold:
44731//   (or (and (m, (sub 0, x)), (pandn m, x)))
44732// into:
44733//   (sub (xor X, M), M)
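// The special case holds because M is all-ones or all-zeros per element when
// this fires: if M == -1 then (xor X, M) - M == ~X + 1 == -X (i.e. sub 0, x),
// and if M == 0 it is simply X - 0 == X.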
44734static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
44735                                            const X86Subtarget &Subtarget) {
44736  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
44737
44738  EVT VT = N->getValueType(0);
44739  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
44740        (VT.is256BitVector() && Subtarget.hasInt256())))
44741    return SDValue();
44742
44743  SDValue X, Y, Mask;
44744  if (!matchLogicBlend(N, X, Y, Mask))
44745    return SDValue();
44746
44747  // Validate that X, Y, and Mask are bitcasts, and see through them.
44748  Mask = peekThroughBitcasts(Mask);
44749  X = peekThroughBitcasts(X);
44750  Y = peekThroughBitcasts(Y);
44751
44752  EVT MaskVT = Mask.getValueType();
44753  unsigned EltBits = MaskVT.getScalarSizeInBits();
44754
44755  // TODO: Attempt to handle floating point cases as well?
44756  if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
44757    return SDValue();
44758
44759  SDLoc DL(N);
44760
44761  // Attempt to combine to conditional negate: (sub (xor X, M), M)
44762  if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
44763                                                           DAG, Subtarget))
44764    return Res;
44765
44766  // PBLENDVB is only available on SSE 4.1.
44767  if (!Subtarget.hasSSE41())
44768    return SDValue();
44769
44770  // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
44771  if (Subtarget.hasVLX())
44772    return SDValue();
44773
44774  MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
44775
44776  X = DAG.getBitcast(BlendVT, X);
44777  Y = DAG.getBitcast(BlendVT, Y);
44778  Mask = DAG.getBitcast(BlendVT, Mask);
44779  Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
44780  return DAG.getBitcast(VT, Mask);
44781}
44782
44783// Helper function for combineOrCmpEqZeroToCtlzSrl
44784// Transforms:
44785//   seteq(cmp x, 0)
44786//   into:
44787//   srl(ctlz x), log2(bitsize(x))
44788// Input pattern is checked by caller.
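// E.g. for a 32-bit x, ctlz(x) == 32 iff x == 0, so shifting the ctlz result
// right by log2(32) == 5 produces exactly the seteq(x, 0) value (1 or 0).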
44789static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
44790                                          SelectionDAG &DAG) {
44791  SDValue Cmp = Op.getOperand(1);
44792  EVT VT = Cmp.getOperand(0).getValueType();
44793  unsigned Log2b = Log2_32(VT.getSizeInBits());
44794  SDLoc dl(Op);
44795  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
44796  // The result of the shift is true or false, and on X86, the 32-bit
44797  // encoding of shr and lzcnt is more desirable.
44798  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
44799  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
44800                            DAG.getConstant(Log2b, dl, MVT::i8));
44801  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
44802}
44803
44804// Try to transform:
44805//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
44806//   into:
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
// Will also attempt to match more generic cases, e.g.:
44809//   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
44810// Only applies if the target supports the FastLZCNT feature.
44811static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
44812                                           TargetLowering::DAGCombinerInfo &DCI,
44813                                           const X86Subtarget &Subtarget) {
44814  if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
44815    return SDValue();
44816
44817  auto isORCandidate = [](SDValue N) {
44818    return (N->getOpcode() == ISD::OR && N->hasOneUse());
44819  };
44820
  // Check that the zero extend is extending to 32 bits or more. The code
  // generated by srl(ctlz) for 16-bit or narrower variants of the pattern
  // would require extra instructions to clear the upper bits.
44824  if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
44825      !isORCandidate(N->getOperand(0)))
44826    return SDValue();
44827
44828  // Check the node matches: setcc(eq, cmp 0)
44829  auto isSetCCCandidate = [](SDValue N) {
44830    return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
44831           X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
44832           N->getOperand(1).getOpcode() == X86ISD::CMP &&
44833           isNullConstant(N->getOperand(1).getOperand(1)) &&
44834           N->getOperand(1).getValueType().bitsGE(MVT::i32);
44835  };
44836
44837  SDNode *OR = N->getOperand(0).getNode();
44838  SDValue LHS = OR->getOperand(0);
44839  SDValue RHS = OR->getOperand(1);
44840
44841  // Save nodes matching or(or, setcc(eq, cmp 0)).
44842  SmallVector<SDNode *, 2> ORNodes;
44843  while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
44844          (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
44845    ORNodes.push_back(OR);
44846    OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
44847    LHS = OR->getOperand(0);
44848    RHS = OR->getOperand(1);
44849  }
44850
44851  // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
44852  if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
44853      !isORCandidate(SDValue(OR, 0)))
44854    return SDValue();
44855
  // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
  // to or(srl(ctlz), srl(ctlz)).
44859  // The dag combiner can then fold it into:
44860  // srl(or(ctlz, ctlz)).
44861  EVT VT = OR->getValueType(0);
44862  SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
44863  SDValue Ret, NewRHS;
44864  if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
44865    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
44866
44867  if (!Ret)
44868    return SDValue();
44869
44870  // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
44871  while (ORNodes.size() > 0) {
44872    OR = ORNodes.pop_back_val();
44873    LHS = OR->getOperand(0);
44874    RHS = OR->getOperand(1);
44875    // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
44876    if (RHS->getOpcode() == ISD::OR)
44877      std::swap(LHS, RHS);
44878    NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
44879    if (!NewRHS)
44880      return SDValue();
44881    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
44882  }
44883
44884  if (Ret)
44885    Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
44886
44887  return Ret;
44888}
44889
44890static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
44891                         TargetLowering::DAGCombinerInfo &DCI,
44892                         const X86Subtarget &Subtarget) {
44893  SDValue N0 = N->getOperand(0);
44894  SDValue N1 = N->getOperand(1);
44895  EVT VT = N->getValueType(0);
44896
44897  // If this is SSE1 only convert to FOR to avoid scalarization.
44898  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
44899    return DAG.getBitcast(MVT::v4i32,
44900                          DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
44901                                      DAG.getBitcast(MVT::v4f32, N0),
44902                                      DAG.getBitcast(MVT::v4f32, N1)));
44903  }
44904
44905  // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
44906  // TODO: Support multiple SrcOps.
44907  if (VT == MVT::i1) {
44908    SmallVector<SDValue, 2> SrcOps;
44909    SmallVector<APInt, 2> SrcPartials;
44910    if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
44911        SrcOps.size() == 1) {
44912      SDLoc dl(N);
44913      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44914      unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
44915      EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44916      SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
44917      if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
44918        Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
44919      if (Mask) {
44920        assert(SrcPartials[0].getBitWidth() == NumElts &&
44921               "Unexpected partial reduction mask");
44922        SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
44923        SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
44924        Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
44925        return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
44926      }
44927    }
44928  }
44929
44930  if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
44931    return R;
44932
44933  if (DCI.isBeforeLegalizeOps())
44934    return SDValue();
44935
44936  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
44937    return R;
44938
44939  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
44940    return FPLogic;
44941
44942  if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
44943    return R;
44944
44945  if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
44946    return R;
44947
44948  // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
44949  // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
44950  // iff the upper elements of the non-shifted arg are zero.
  // KUNPCK requires 16+ bool vector elements.
44952  if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
44953    unsigned NumElts = VT.getVectorNumElements();
44954    unsigned HalfElts = NumElts / 2;
44955    APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
44956    if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
44957        N1.getConstantOperandAPInt(1) == HalfElts &&
44958        DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
44959      SDLoc dl(N);
44960      return DAG.getNode(
44961          ISD::CONCAT_VECTORS, dl, VT,
44962          extractSubVector(N0, 0, DAG, dl, HalfElts),
44963          extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
44964    }
44965    if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
44966        N0.getConstantOperandAPInt(1) == HalfElts &&
44967        DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
44968      SDLoc dl(N);
44969      return DAG.getNode(
44970          ISD::CONCAT_VECTORS, dl, VT,
44971          extractSubVector(N1, 0, DAG, dl, HalfElts),
44972          extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
44973    }
44974  }
44975
44976  // Attempt to recursively combine an OR of shuffles.
44977  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
44978    SDValue Op(N, 0);
44979    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44980      return Res;
44981  }
44982
44983  return SDValue();
44984}
44985
44986/// Try to turn tests against the signbit in the form of:
44987///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
44988/// into:
44989///   SETGT(X, -1)
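/// The SRL moves the sign bit of X into bit 0, so after the XOR with 1 the
/// result is 1 iff X is non-negative, which is exactly X > -1.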
44990static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
44991  // This is only worth doing if the output type is i8 or i1.
44992  EVT ResultType = N->getValueType(0);
44993  if (ResultType != MVT::i8 && ResultType != MVT::i1)
44994    return SDValue();
44995
44996  SDValue N0 = N->getOperand(0);
44997  SDValue N1 = N->getOperand(1);
44998
44999  // We should be performing an xor against a truncated shift.
45000  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
45001    return SDValue();
45002
45003  // Make sure we are performing an xor against one.
45004  if (!isOneConstant(N1))
45005    return SDValue();
45006
45007  // SetCC on x86 zero extends so only act on this if it's a logical shift.
45008  SDValue Shift = N0.getOperand(0);
45009  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
45010    return SDValue();
45011
45012  // Make sure we are truncating from one of i16, i32 or i64.
45013  EVT ShiftTy = Shift.getValueType();
45014  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
45015    return SDValue();
45016
45017  // Make sure the shift amount extracts the sign bit.
45018  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
45019      Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
45020    return SDValue();
45021
45022  // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical-looking
  // comparison; using SETGT matches up with what TranslateX86CC does.
45025  SDLoc DL(N);
45026  SDValue ShiftOp = Shift.getOperand(0);
45027  EVT ShiftOpTy = ShiftOp.getValueType();
45028  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45029  EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
45030                                               *DAG.getContext(), ResultType);
45031  SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
45032                              DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
45033  if (SetCCResultType != ResultType)
45034    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
45035  return Cond;
45036}
45037
45038/// Turn vector tests of the signbit in the form of:
45039///   xor (sra X, elt_size(X)-1), -1
45040/// into:
45041///   pcmpgt X, -1
45042///
45043/// This should be called before type legalization because the pattern may not
45044/// persist after that.
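/// The sra smears the sign bit across each element (0 for non-negative, -1
/// for negative), and the xor with -1 inverts that, which is exactly what
/// (pcmpgt X, -1) computes.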
45045static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
45046                                         const X86Subtarget &Subtarget) {
45047  EVT VT = N->getValueType(0);
45048  if (!VT.isSimple())
45049    return SDValue();
45050
45051  switch (VT.getSimpleVT().SimpleTy) {
45052  default: return SDValue();
45053  case MVT::v16i8:
45054  case MVT::v8i16:
45055  case MVT::v4i32:
45056  case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
45057  case MVT::v32i8:
45058  case MVT::v16i16:
45059  case MVT::v8i32:
45060  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
45061  }
45062
45063  // There must be a shift right algebraic before the xor, and the xor must be a
45064  // 'not' operation.
45065  SDValue Shift = N->getOperand(0);
45066  SDValue Ones = N->getOperand(1);
45067  if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
45068      !ISD::isBuildVectorAllOnes(Ones.getNode()))
45069    return SDValue();
45070
45071  // The shift should be smearing the sign bit across each vector element.
45072  auto *ShiftAmt =
45073      isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
45074  if (!ShiftAmt ||
45075      ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
45076    return SDValue();
45077
45078  // Create a greater-than comparison against -1. We don't use the more obvious
45079  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
45080  return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
45081}
45082
45083/// Detect patterns of truncation with unsigned saturation:
45084///
45085/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
45086///   Return the source value x to be truncated or SDValue() if the pattern was
45087///   not matched.
45088///
45089/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
45090///   where C1 >= 0 and C2 is unsigned max of destination type.
45091///
45092///    (truncate (smax (smin (x, C2), C1)) to dest_type)
45093///   where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
45094///
45095///   These two patterns are equivalent to:
45096///   (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
45097///   So return the smax(x, C1) value to be truncated or SDValue() if the
45098///   pattern was not matched.
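/// For example, when truncating v8i32 to v8i16, pattern 1 is
///   (truncate (umin x, 65535))
/// and x itself is returned.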
45099static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45100                                 const SDLoc &DL) {
45101  EVT InVT = In.getValueType();
45102
45103  // Saturation with truncation. We truncate from InVT to VT.
45104  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
45105         "Unexpected types for truncate operation");
45106
45107  // Match min/max and return limit value as a parameter.
45108  auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
45109    if (V.getOpcode() == Opcode &&
45110        ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
45111      return V.getOperand(0);
45112    return SDValue();
45113  };
45114
45115  APInt C1, C2;
45116  if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
    // the element size of the destination type.
45119    if (C2.isMask(VT.getScalarSizeInBits()))
45120      return UMin;
45121
45122  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
45123    if (MatchMinMax(SMin, ISD::SMAX, C1))
45124      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
45125        return SMin;
45126
45127  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
45128    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
45129      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
45130          C2.uge(C1)) {
45131        return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
45132      }
45133
45134  return SDValue();
45135}
45136
45137/// Detect patterns of truncation with signed saturation:
45138/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
45139///                  signed_max_of_dest_type)) to dest_type)
45140/// or:
45141/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
45142///                  signed_min_of_dest_type)) to dest_type).
45143/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
45144/// Return the source value to be truncated or SDValue() if the pattern was not
45145/// matched.
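/// For example, when truncating v8i16 to v8i8 this matches
///   (truncate (smin (smax x, -128), 127))
/// and returns x; with MatchPackUS the clamp range is [0, 255] instead.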
45146static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
45147  unsigned NumDstBits = VT.getScalarSizeInBits();
45148  unsigned NumSrcBits = In.getScalarValueSizeInBits();
45149  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
45150
45151  auto MatchMinMax = [](SDValue V, unsigned Opcode,
45152                        const APInt &Limit) -> SDValue {
45153    APInt C;
45154    if (V.getOpcode() == Opcode &&
45155        ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
45156      return V.getOperand(0);
45157    return SDValue();
45158  };
45159
45160  APInt SignedMax, SignedMin;
45161  if (MatchPackUS) {
45162    SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
45163    SignedMin = APInt(NumSrcBits, 0);
45164  } else {
45165    SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
45166    SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
45167  }
45168
45169  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
45170    if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
45171      return SMax;
45172
45173  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
45174    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
45175      return SMin;
45176
45177  return SDValue();
45178}
45179
45180static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
45181                                      SelectionDAG &DAG,
45182                                      const X86Subtarget &Subtarget) {
45183  if (!Subtarget.hasSSE2() || !VT.isVector())
45184    return SDValue();
45185
45186  EVT SVT = VT.getVectorElementType();
45187  EVT InVT = In.getValueType();
45188  EVT InSVT = InVT.getVectorElementType();
45189
45190  // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
45192  // and concatenate at the same time. Then we can use a final vpmovuswb to
45193  // clip to 0-255.
45194  if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
45195      InVT == MVT::v16i32 && VT == MVT::v16i8) {
45196    if (auto USatVal = detectSSatPattern(In, VT, true)) {
45197      // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
45198      SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
45199                                           DL, DAG, Subtarget);
45200      assert(Mid && "Failed to pack!");
45201      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
45202    }
45203  }
45204
45205  // vXi32 truncate instructions are available with AVX512F.
45206  // vXi16 truncate instructions are only available with AVX512BW.
45207  // For 256-bit or smaller vectors, we require VLX.
45208  // FIXME: We could widen truncates to 512 to remove the VLX restriction.
  // If the result type is 256 bits or larger and we have disabled 512-bit
45210  // registers, we should go ahead and use the pack instructions if possible.
45211  bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
45212                       (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
45213                      (InVT.getSizeInBits() > 128) &&
45214                      (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
45215                      !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
45216
45217  if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
45218      VT.getSizeInBits() >= 64 &&
45219      (SVT == MVT::i8 || SVT == MVT::i16) &&
45220      (InSVT == MVT::i16 || InSVT == MVT::i32)) {
45221    if (auto USatVal = detectSSatPattern(In, VT, true)) {
45222      // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
      // Only do this when the result is at least 64 bits or we'll be leaving
45224      // dangling PACKSSDW nodes.
45225      if (SVT == MVT::i8 && InSVT == MVT::i32) {
45226        EVT MidVT = VT.changeVectorElementType(MVT::i16);
45227        SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
45228                                             DAG, Subtarget);
45229        assert(Mid && "Failed to pack!");
45230        SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
45231                                           Subtarget);
45232        assert(V && "Failed to pack!");
45233        return V;
45234      } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
45235        return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
45236                                      Subtarget);
45237    }
45238    if (auto SSatVal = detectSSatPattern(In, VT))
45239      return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
45240                                    Subtarget);
45241  }
45242
45243  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45244  if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
45245      Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
45246    unsigned TruncOpc = 0;
45247    SDValue SatVal;
45248    if (auto SSatVal = detectSSatPattern(In, VT)) {
45249      SatVal = SSatVal;
45250      TruncOpc = X86ISD::VTRUNCS;
45251    } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
45252      SatVal = USatVal;
45253      TruncOpc = X86ISD::VTRUNCUS;
45254    }
45255    if (SatVal) {
45256      unsigned ResElts = VT.getVectorNumElements();
45257      // If the input type is less than 512 bits and we don't have VLX, we need
45258      // to widen to 512 bits.
45259      if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
45260        unsigned NumConcats = 512 / InVT.getSizeInBits();
45261        ResElts *= NumConcats;
45262        SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
45263        ConcatOps[0] = SatVal;
45264        InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
45265                                NumConcats * InVT.getVectorNumElements());
45266        SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
45267      }
      // Widen the result if it's narrower than 128 bits.
45269      if (ResElts * SVT.getSizeInBits() < 128)
45270        ResElts = 128 / SVT.getSizeInBits();
45271      EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
45272      SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
45273      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45274                         DAG.getIntPtrConstant(0, DL));
45275    }
45276  }
45277
45278  return SDValue();
45279}
45280
45281/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the
/// efficient X86ISD::AVG instruction.
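/// For example, with a = 30 and b = 41 the result is (30 + 41 + 1) / 2 = 36,
/// matching the rounding behavior of PAVGB/PAVGW.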
45284static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45285                                const X86Subtarget &Subtarget,
45286                                const SDLoc &DL) {
45287  if (!VT.isVector())
45288    return SDValue();
45289  EVT InVT = In.getValueType();
45290  unsigned NumElems = VT.getVectorNumElements();
45291
45292  EVT ScalarVT = VT.getVectorElementType();
45293  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
45294    return SDValue();
45295
  // InScalarVT is the intermediate type in the AVG pattern and it should be
  // wider than the original input type (i8/i16).
45298  EVT InScalarVT = InVT.getVectorElementType();
45299  if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
45300    return SDValue();
45301
45302  if (!Subtarget.hasSSE2())
45303    return SDValue();
45304
45305  // Detect the following pattern:
45306  //
45307  //   %1 = zext <N x i8> %a to <N x i32>
45308  //   %2 = zext <N x i8> %b to <N x i32>
45309  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
45310  //   %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
45312  //   %6 = trunc <N x i32> %5 to <N x i8>
45313  //
45314  // In AVX512, the last instruction can also be a trunc store.
45315  if (In.getOpcode() != ISD::SRL)
45316    return SDValue();
45317
  // A lambda that checks whether the given SDValue is a constant vector and
  // each element is in the range [Min, Max].
45320  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
45321    return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
45322      return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
45323    });
45324  };
45325
45326  // Check if each element of the vector is right-shifted by one.
45327  SDValue LHS = In.getOperand(0);
45328  SDValue RHS = In.getOperand(1);
45329  if (!IsConstVectorInRange(RHS, 1, 1))
45330    return SDValue();
45331  if (LHS.getOpcode() != ISD::ADD)
45332    return SDValue();
45333
45334  // Detect a pattern of a + b + 1 where the order doesn't matter.
45335  SDValue Operands[3];
45336  Operands[0] = LHS.getOperand(0);
45337  Operands[1] = LHS.getOperand(1);
45338
45339  auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45340                       ArrayRef<SDValue> Ops) {
45341    return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
45342  };
45343
45344  auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
45345    // Pad to a power-of-2 vector, split+apply and extract the original vector.
45346    unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
45347    EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
45348    if (NumElemsPow2 != NumElems) {
45349      SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45350      SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45351      for (unsigned i = 0; i != NumElems; ++i) {
45352        SDValue Idx = DAG.getIntPtrConstant(i, DL);
45353        Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
45354        Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
45355      }
45356      Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
45357      Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
45358    }
45359    SDValue Res =
45360        SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
45361    if (NumElemsPow2 == NumElems)
45362      return Res;
45363    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45364                       DAG.getIntPtrConstant(0, DL));
45365  };
45366
  // Take care of the case when one of the operands is a constant vector whose
  // elements are in the range [1, 256] (or [1, 65536] for i16).
45369  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
45370      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
45371      Operands[0].getOperand(0).getValueType() == VT) {
45372    // The pattern is detected. Subtract one from the constant vector, then
    // demote it and emit an X86ISD::AVG instruction.
45374    SDValue VecOnes = DAG.getConstant(1, DL, InVT);
45375    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
45376    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
45377    return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
45378  }
45379
  // Matches 'add-like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
  // Match the 'or' case only if it's add-like, i.e. the or can be replaced by
  // an add because its operands have no common bits set.
45382  auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
45383    if (ISD::ADD == V.getOpcode()) {
45384      Op0 = V.getOperand(0);
45385      Op1 = V.getOperand(1);
45386      return true;
45387    }
45388    if (ISD::ZERO_EXTEND != V.getOpcode())
45389      return false;
45390    V = V.getOperand(0);
45391    if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
45392        !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
45393      return false;
45394    Op0 = V.getOperand(0);
45395    Op1 = V.getOperand(1);
45396    return true;
45397  };
45398
45399  SDValue Op0, Op1;
45400  if (FindAddLike(Operands[0], Op0, Op1))
45401    std::swap(Operands[0], Operands[1]);
45402  else if (!FindAddLike(Operands[1], Op0, Op1))
45403    return SDValue();
45404  Operands[2] = Op0;
45405  Operands[1] = Op1;
45406
45407  // Now we have three operands of two additions. Check that one of them is a
  // constant vector of ones, and the other two can be promoted from i8/i16.
45409  for (int i = 0; i < 3; ++i) {
45410    if (!IsConstVectorInRange(Operands[i], 1, 1))
45411      continue;
45412    std::swap(Operands[i], Operands[2]);
45413
45414    // Check if Operands[0] and Operands[1] are results of type promotion.
45415    for (int j = 0; j < 2; ++j)
45416      if (Operands[j].getValueType() != VT) {
45417        if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
45418            Operands[j].getOperand(0).getValueType() != VT)
45419          return SDValue();
45420        Operands[j] = Operands[j].getOperand(0);
45421      }
45422
45423    // The pattern is detected, emit X86ISD::AVG instruction(s).
45424    return AVGSplitter(Operands[0], Operands[1]);
45425  }
45426
45427  return SDValue();
45428}
45429
45430static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
45431                           TargetLowering::DAGCombinerInfo &DCI,
45432                           const X86Subtarget &Subtarget) {
45433  LoadSDNode *Ld = cast<LoadSDNode>(N);
45434  EVT RegVT = Ld->getValueType(0);
45435  EVT MemVT = Ld->getMemoryVT();
45436  SDLoc dl(Ld);
45437  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45438
45439  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
45440  // into two 16-byte operations. Also split non-temporal aligned loads on
45441  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
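  // For example, a 32-byte v8f32 load becomes two 16-byte v4f32 loads at
  // offsets 0 and 16 that are rejoined with a CONCAT_VECTORS node.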
45442  ISD::LoadExtType Ext = Ld->getExtensionType();
45443  bool Fast;
45444  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
45445      Ext == ISD::NON_EXTLOAD &&
45446      ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
45447        Ld->getAlignment() >= 16) ||
45448       (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
45449                               *Ld->getMemOperand(), &Fast) &&
45450        !Fast))) {
45451    unsigned NumElems = RegVT.getVectorNumElements();
45452    if (NumElems < 2)
45453      return SDValue();
45454
45455    unsigned HalfOffset = 16;
45456    SDValue Ptr1 = Ld->getBasePtr();
45457    SDValue Ptr2 =
45458        DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
45459    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
45460                                  NumElems / 2);
45461    SDValue Load1 =
45462        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
45463                    Ld->getOriginalAlign(),
45464                    Ld->getMemOperand()->getFlags());
45465    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
45466                                Ld->getPointerInfo().getWithOffset(HalfOffset),
45467                                Ld->getOriginalAlign(),
45468                                Ld->getMemOperand()->getFlags());
45469    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
45470                             Load1.getValue(1), Load2.getValue(1));
45471
45472    SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
45473    return DCI.CombineTo(N, NewVec, TF, true);
45474  }
45475
45476  // Bool vector load - attempt to cast to an integer, as we have good
45477  // (vXiY *ext(vXi1 bitcast(iX))) handling.
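  // For example, a v16i1 load is replaced by an i16 load and a bitcast back
  // to v16i1.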
45478  if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
45479      RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
45480    unsigned NumElts = RegVT.getVectorNumElements();
45481    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45482    if (TLI.isTypeLegal(IntVT)) {
45483      SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
45484                                    Ld->getPointerInfo(),
45485                                    Ld->getOriginalAlign(),
45486                                    Ld->getMemOperand()->getFlags());
45487      SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
45488      return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
45489    }
45490  }
45491
45492  // If we also broadcast this as a subvector to a wider type, then just extract
45493  // the lowest subvector.
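  // For example, if this v4f32 load shares its pointer and chain with a wider
  // X86ISD::SUBV_BROADCAST_LOAD of the same memory, reuse that broadcast and
  // take its low 128 bits instead of issuing a second load.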
45494  if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
45495      (RegVT.is128BitVector() || RegVT.is256BitVector())) {
45496    SDValue Ptr = Ld->getBasePtr();
45497    SDValue Chain = Ld->getChain();
45498    for (SDNode *User : Ptr->uses()) {
45499      if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
45500          cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
45501          cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
45502          cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
45503              MemVT.getSizeInBits() &&
45504          !User->hasAnyUseOfValue(1) &&
45505          User->getValueSizeInBits(0).getFixedSize() >
45506              RegVT.getFixedSizeInBits()) {
45507        SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
45508                                           RegVT.getSizeInBits());
45509        Extract = DAG.getBitcast(RegVT, Extract);
45510        return DCI.CombineTo(N, Extract, SDValue(User, 1));
45511      }
45512    }
45513  }
45514
45515  // Cast ptr32 and ptr64 pointers to the default address space before a load.
45516  unsigned AddrSpace = Ld->getAddressSpace();
45517  if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
45518      AddrSpace == X86AS::PTR32_UPTR) {
45519    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
45520    if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
45521      SDValue Cast =
45522          DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
45523      return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
45524                         Ld->getOriginalAlign(),
45525                         Ld->getMemOperand()->getFlags());
45526    }
45527  }
45528
45529  return SDValue();
45530}
45531
45532/// If V is a build vector of boolean constants and exactly one of those
45533/// constants is true, return the operand index of that true element.
45534/// Otherwise, return -1.
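/// For example, <i1 0, i1 0, i1 1, i1 0> returns 2, while a mask with zero or
/// more than one true element returns -1.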
45535static int getOneTrueElt(SDValue V) {
45536  // This needs to be a build vector of booleans.
45537  // TODO: Checking for the i1 type matches the IR definition for the mask,
45538  // but the mask check could be loosened to i8 or other types. That might
  // also require checking more than 'allOnesValue'; e.g., the x86 HW
45540  // instructions only require that the MSB is set for each mask element.
45541  // The ISD::MSTORE comments/definition do not specify how the mask operand
45542  // is formatted.
45543  auto *BV = dyn_cast<BuildVectorSDNode>(V);
45544  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
45545    return -1;
45546
45547  int TrueIndex = -1;
45548  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
45549  for (unsigned i = 0; i < NumElts; ++i) {
45550    const SDValue &Op = BV->getOperand(i);
45551    if (Op.isUndef())
45552      continue;
45553    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
45554    if (!ConstNode)
45555      return -1;
45556    if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
45557      // If we already found a one, this is too many.
45558      if (TrueIndex >= 0)
45559        return -1;
45560      TrueIndex = i;
45561    }
45562  }
45563  return TrueIndex;
45564}
45565
45566/// Given a masked memory load/store operation, return true if it has one mask
45567/// bit set. If it has one mask bit set, then also return the memory address of
45568/// the scalar element to load/store, the vector index to insert/extract that
45569/// scalar element, and the alignment for the scalar memory access.
45570static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
45571                                         SelectionDAG &DAG, SDValue &Addr,
45572                                         SDValue &Index, Align &Alignment,
45573                                         unsigned &Offset) {
45574  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
45575  if (TrueMaskElt < 0)
45576    return false;
45577
45578  // Get the address of the one scalar element that is specified by the mask
45579  // using the appropriate offset from the base pointer.
45580  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
45581  Offset = 0;
45582  Addr = MaskedOp->getBasePtr();
45583  if (TrueMaskElt != 0) {
45584    Offset = TrueMaskElt * EltVT.getStoreSize();
45585    Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
45586                                    SDLoc(MaskedOp));
45587  }
45588
45589  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
45590  Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
45591                              EltVT.getStoreSize());
45592  return true;
45593}
45594
45595/// If exactly one element of the mask is set for a non-extending masked load,
/// reduce it to a scalar load and a vector insert.
45597/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
45598/// mask have already been optimized in IR, so we don't bother with those here.
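/// For example, a masked load of v4i32 whose mask is <0,0,1,0> becomes a
/// scalar i32 load from (base + 8) inserted into the pass-through vector at
/// index 2.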
45599static SDValue
45600reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45601                             TargetLowering::DAGCombinerInfo &DCI,
45602                             const X86Subtarget &Subtarget) {
45603  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45604  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
45605  // However, some target hooks may need to be added to know when the transform
45606  // is profitable. Endianness would also have to be considered.
45607
45608  SDValue Addr, VecIndex;
45609  Align Alignment;
45610  unsigned Offset;
45611  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
45612    return SDValue();
45613
45614  // Load the one scalar element that is specified by the mask using the
45615  // appropriate offset from the base pointer.
45616  SDLoc DL(ML);
45617  EVT VT = ML->getValueType(0);
45618  EVT EltVT = VT.getVectorElementType();
45619
45620  EVT CastVT = VT;
45621  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
45622    EltVT = MVT::f64;
45623    CastVT = VT.changeVectorElementType(EltVT);
45624  }
45625
45626  SDValue Load =
45627      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
45628                  ML->getPointerInfo().getWithOffset(Offset),
45629                  Alignment, ML->getMemOperand()->getFlags());
45630
45631  SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
45632
45633  // Insert the loaded element into the appropriate place in the vector.
45634  SDValue Insert =
45635      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
45636  Insert = DAG.getBitcast(VT, Insert);
45637  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
45638}
45639
45640static SDValue
45641combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45642                              TargetLowering::DAGCombinerInfo &DCI) {
45643  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45644  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
45645    return SDValue();
45646
45647  SDLoc DL(ML);
45648  EVT VT = ML->getValueType(0);
45649
45650  // If we are loading the first and last elements of a vector, it is safe and
45651  // always faster to load the whole vector. Replace the masked load with a
45652  // vector load and select.
45653  unsigned NumElts = VT.getVectorNumElements();
45654  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
45655  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
45656  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
45657  if (LoadFirstElt && LoadLastElt) {
45658    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
45659                                ML->getMemOperand());
45660    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
45661                                  ML->getPassThru());
45662    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
45663  }
45664
45665  // Convert a masked load with a constant mask into a masked load and a select.
45666  // This allows the select operation to use a faster kind of select instruction
45667  // (for example, vblendvps -> vblendps).
45668
45669  // Don't try this if the pass-through operand is already undefined. That would
45670  // cause an infinite loop because that's what we're about to create.
45671  if (ML->getPassThru().isUndef())
45672    return SDValue();
45673
45674  if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
45675    return SDValue();
45676
45677  // The new masked load has an undef pass-through operand. The select uses the
45678  // original pass-through operand.
45679  SDValue NewML = DAG.getMaskedLoad(
45680      VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
45681      DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
45682      ML->getAddressingMode(), ML->getExtensionType());
45683  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
45684                                ML->getPassThru());
45685
45686  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
45687}
45688
45689static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
45690                                 TargetLowering::DAGCombinerInfo &DCI,
45691                                 const X86Subtarget &Subtarget) {
45692  auto *Mld = cast<MaskedLoadSDNode>(N);
45693
45694  // TODO: Expanding load with constant mask may be optimized as well.
45695  if (Mld->isExpandingLoad())
45696    return SDValue();
45697
45698  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
45699    if (SDValue ScalarLoad =
45700            reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
45701      return ScalarLoad;
45702
45703    // TODO: Do some AVX512 subsets benefit from this transform?
45704    if (!Subtarget.hasAVX512())
45705      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
45706        return Blend;
45707  }
45708
45709  // If the mask value has been legalized to a non-boolean vector, try to
45710  // simplify ops leading up to it. We only demand the MSB of each lane.
45711  SDValue Mask = Mld->getMask();
45712  if (Mask.getScalarValueSizeInBits() != 1) {
45713    EVT VT = Mld->getValueType(0);
45714    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45715    APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
45716    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
45717      if (N->getOpcode() != ISD::DELETED_NODE)
45718        DCI.AddToWorklist(N);
45719      return SDValue(N, 0);
45720    }
45721    if (SDValue NewMask =
45722            TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
45723      return DAG.getMaskedLoad(
45724          VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
45725          NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
45726          Mld->getAddressingMode(), Mld->getExtensionType());
45727  }
45728
45729  return SDValue();
45730}
45731
45732/// If exactly one element of the mask is set for a non-truncating masked store,
/// reduce it to a vector extract and a scalar store.
45734/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
45735/// mask have already been optimized in IR, so we don't bother with those here.
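/// For example, a masked store of v4i32 whose mask is <0,1,0,0> becomes an
/// extract of element 1 followed by a scalar i32 store to (base + 4).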
45736static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
45737                                              SelectionDAG &DAG,
45738                                              const X86Subtarget &Subtarget) {
45739  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
45740  // However, some target hooks may need to be added to know when the transform
45741  // is profitable. Endianness would also have to be considered.
45742
45743  SDValue Addr, VecIndex;
45744  Align Alignment;
45745  unsigned Offset;
45746  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
45747    return SDValue();
45748
45749  // Extract the one scalar element that is actually being stored.
45750  SDLoc DL(MS);
45751  SDValue Value = MS->getValue();
45752  EVT VT = Value.getValueType();
45753  EVT EltVT = VT.getVectorElementType();
45754  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
45755    EltVT = MVT::f64;
45756    EVT CastVT = VT.changeVectorElementType(EltVT);
45757    Value = DAG.getBitcast(CastVT, Value);
45758  }
45759  SDValue Extract =
45760      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
45761
45762  // Store that element at the appropriate offset from the base pointer.
45763  return DAG.getStore(MS->getChain(), DL, Extract, Addr,
45764                      MS->getPointerInfo().getWithOffset(Offset),
45765                      Alignment, MS->getMemOperand()->getFlags());
45766}
45767
45768static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
45769                                  TargetLowering::DAGCombinerInfo &DCI,
45770                                  const X86Subtarget &Subtarget) {
45771  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
45772  if (Mst->isCompressingStore())
45773    return SDValue();
45774
45775  EVT VT = Mst->getValue().getValueType();
45776  SDLoc dl(Mst);
45777  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45778
45779  if (Mst->isTruncatingStore())
45780    return SDValue();
45781
45782  if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
45783    return ScalarStore;
45784
45785  // If the mask value has been legalized to a non-boolean vector, try to
45786  // simplify ops leading up to it. We only demand the MSB of each lane.
45787  SDValue Mask = Mst->getMask();
45788  if (Mask.getScalarValueSizeInBits() != 1) {
45789    APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
45790    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
45791      if (N->getOpcode() != ISD::DELETED_NODE)
45792        DCI.AddToWorklist(N);
45793      return SDValue(N, 0);
45794    }
45795    if (SDValue NewMask =
45796            TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
45797      return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
45798                                Mst->getBasePtr(), Mst->getOffset(), NewMask,
45799                                Mst->getMemoryVT(), Mst->getMemOperand(),
45800                                Mst->getAddressingMode());
45801  }
45802
45803  SDValue Value = Mst->getValue();
45804  if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
45805      TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
45806                            Mst->getMemoryVT())) {
45807    return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
45808                              Mst->getBasePtr(), Mst->getOffset(), Mask,
45809                              Mst->getMemoryVT(), Mst->getMemOperand(),
45810                              Mst->getAddressingMode(), true);
45811  }
45812
45813  return SDValue();
45814}
45815
45816static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
45817                            TargetLowering::DAGCombinerInfo &DCI,
45818                            const X86Subtarget &Subtarget) {
45819  StoreSDNode *St = cast<StoreSDNode>(N);
45820  EVT StVT = St->getMemoryVT();
45821  SDLoc dl(St);
45822  SDValue StoredVal = St->getValue();
45823  EVT VT = StoredVal.getValueType();
45824  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45825
45826  // Convert a store of vXi1 into a store of iX and a bitcast.
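  // For example, a store of v8i1 becomes a bitcast to i8 followed by a scalar
  // i8 store.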
45827  if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
45828      VT.getVectorElementType() == MVT::i1) {
45829
45830    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
45831    StoredVal = DAG.getBitcast(NewVT, StoredVal);
45832
45833    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
45834                        St->getPointerInfo(), St->getOriginalAlign(),
45835                        St->getMemOperand()->getFlags());
45836  }
45837
45838  // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
45839  // This will avoid a copy to k-register.
45840  if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
45841      StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
45842      StoredVal.getOperand(0).getValueType() == MVT::i8) {
45843    SDValue Val = StoredVal.getOperand(0);
45844    // We must store zeros to the unused bits.
45845    Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
45846    return DAG.getStore(St->getChain(), dl, Val,
45847                        St->getBasePtr(), St->getPointerInfo(),
45848                        St->getOriginalAlign(),
45849                        St->getMemOperand()->getFlags());
45850  }
45851
45852  // Widen v2i1/v4i1 stores to v8i1.
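  // For example, a v2i1 store is concatenated with three zero v2i1 vectors
  // and stored as a single v8i1 value.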
45853  if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
45854      Subtarget.hasAVX512()) {
45855    unsigned NumConcats = 8 / VT.getVectorNumElements();
45856    // We must store zeros to the unused bits.
45857    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
45858    Ops[0] = StoredVal;
45859    StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45860    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
45861                        St->getPointerInfo(), St->getOriginalAlign(),
45862                        St->getMemOperand()->getFlags());
45863  }
45864
45865  // Turn vXi1 stores of constants into a scalar store.
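  // For example, storing the constant v8i1 <1,0,1,1,0,0,0,0> becomes a store
  // of the i8 value 0x0D (element i maps to bit i).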
45866  if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
45867       VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
45868      ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
    // If it's a v64i1 store without 64-bit support, we need two stores.
45870    if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
45871      SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
45872                                      StoredVal->ops().slice(0, 32));
45873      Lo = combinevXi1ConstantToInteger(Lo, DAG);
45874      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
45875                                      StoredVal->ops().slice(32, 32));
45876      Hi = combinevXi1ConstantToInteger(Hi, DAG);
45877
45878      SDValue Ptr0 = St->getBasePtr();
45879      SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
45880
45881      SDValue Ch0 =
45882          DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
45883                       St->getOriginalAlign(),
45884                       St->getMemOperand()->getFlags());
45885      SDValue Ch1 =
45886          DAG.getStore(St->getChain(), dl, Hi, Ptr1,
45887                       St->getPointerInfo().getWithOffset(4),
45888                       St->getOriginalAlign(),
45889                       St->getMemOperand()->getFlags());
45890      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
45891    }
45892
45893    StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
45894    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
45895                        St->getPointerInfo(), St->getOriginalAlign(),
45896                        St->getMemOperand()->getFlags());
45897  }
45898
45899  // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
45900  // Sandy Bridge, perform two 16-byte stores.
45901  bool Fast;
45902  if (VT.is256BitVector() && StVT == VT &&
45903      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
45904                             *St->getMemOperand(), &Fast) &&
45905      !Fast) {
45906    unsigned NumElems = VT.getVectorNumElements();
45907    if (NumElems < 2)
45908      return SDValue();
45909
45910    return splitVectorStore(St, DAG);
45911  }
45912
45913  // Split under-aligned vector non-temporal stores.
45914  if (St->isNonTemporal() && StVT == VT &&
45915      St->getAlignment() < VT.getStoreSize()) {
45916    // ZMM/YMM nt-stores - either it can be stored as a series of shorter
45917    // vectors or the legalizer can scalarize it to use MOVNTI.
45918    if (VT.is256BitVector() || VT.is512BitVector()) {
45919      unsigned NumElems = VT.getVectorNumElements();
45920      if (NumElems < 2)
45921        return SDValue();
45922      return splitVectorStore(St, DAG);
45923    }
45924
45925    // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
45926    // to use MOVNTI.
45927    if (VT.is128BitVector() && Subtarget.hasSSE2()) {
45928      MVT NTVT = Subtarget.hasSSE4A()
45929                     ? MVT::v2f64
45930                     : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
45931      return scalarizeVectorStore(St, NTVT, DAG);
45932    }
45933  }
45934
  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
  // supported but AVX512F is, by extending to v16i32 and truncating.
45937  if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
45938      St->getValue().getOpcode() == ISD::TRUNCATE &&
45939      St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
45940      TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
45941      St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
45942    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
45943    return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
45944                             MVT::v16i8, St->getMemOperand());
45945  }
45946
45947  // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
45948  if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
45949      (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
45950       StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
45951      TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
45952    bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
45953    return EmitTruncSStore(IsSigned, St->getChain(),
45954                           dl, StoredVal.getOperand(0), St->getBasePtr(),
45955                           VT, St->getMemOperand(), DAG);
45956  }
45957
  // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
45959  if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
45960    auto IsExtractedElement = [](SDValue V) {
45961      if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
45962        V = V.getOperand(0);
45963      unsigned Opc = V.getOpcode();
45964      if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
45965        if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
45966          return V.getOperand(0);
45967      }
45968      return SDValue();
45969    };
45970    if (SDValue Extract = IsExtractedElement(StoredVal)) {
45971      SDValue Trunc = peekThroughOneUseBitcasts(Extract);
45972      if (Trunc.getOpcode() == X86ISD::VTRUNC) {
45973        SDValue Src = Trunc.getOperand(0);
45974        MVT DstVT = Trunc.getSimpleValueType();
45975        MVT SrcVT = Src.getSimpleValueType();
45976        unsigned NumSrcElts = SrcVT.getVectorNumElements();
45977        unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
45978        MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
45979        if (NumTruncBits == VT.getSizeInBits() &&
45980            TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
45981          return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
45982                                   TruncVT, St->getMemOperand());
45983        }
45984      }
45985    }
45986  }
45987
45988  // Optimize trunc store (of multiple scalars) to shuffle and store.
45989  // First, pack all of the elements in one place. Next, store to memory
45990  // in fewer chunks.
45991  if (St->isTruncatingStore() && VT.isVector()) {
45992    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store with a normal store of the result of an
    // X86ISD::AVG instruction.
45995    if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
45996      if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
45997                                         Subtarget, dl))
45998        return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
45999                            St->getPointerInfo(), St->getOriginalAlign(),
46000                            St->getMemOperand()->getFlags());
46001
46002    if (TLI.isTruncStoreLegal(VT, StVT)) {
46003      if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
46004        return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
46005                               dl, Val, St->getBasePtr(),
46006                               St->getMemoryVT(), St->getMemOperand(), DAG);
46007      if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
46008                                          DAG, dl))
46009        return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
46010                               dl, Val, St->getBasePtr(),
46011                               St->getMemoryVT(), St->getMemOperand(), DAG);
46012    }
46013
46014    return SDValue();
46015  }
46016
46017  // Cast ptr32 and ptr64 pointers to the default address space before a store.
46018  unsigned AddrSpace = St->getAddressSpace();
46019  if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
46020      AddrSpace == X86AS::PTR32_UPTR) {
46021    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
46022    if (PtrVT != St->getBasePtr().getSimpleValueType()) {
46023      SDValue Cast =
46024          DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
46025      return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
46026                          St->getPointerInfo(), St->getOriginalAlign(),
46027                          St->getMemOperand()->getFlags(), St->getAAInfo());
46028    }
46029  }
46030
46031  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
46032  // the FP state in cases where an emms may be missing.
46033  // A preferable solution to the general problem is to figure out the right
46034  // places to insert EMMS.  This qualifies as a quick hack.
46035
46036  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
46037  if (VT.getSizeInBits() != 64)
46038    return SDValue();
46039
46040  const Function &F = DAG.getMachineFunction().getFunction();
46041  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
46042  bool F64IsLegal =
46043      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
46044  if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
46045      isa<LoadSDNode>(St->getValue()) &&
46046      cast<LoadSDNode>(St->getValue())->isSimple() &&
46047      St->getChain().hasOneUse() && St->isSimple()) {
46048    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
46049
46050    if (!ISD::isNormalLoad(Ld))
46051      return SDValue();
46052
46053    // Avoid the transformation if there are multiple uses of the loaded value.
46054    if (!Ld->hasNUsesOfValue(1, 0))
46055      return SDValue();
46056
46057    SDLoc LdDL(Ld);
46058    SDLoc StDL(N);
46059    // Lower to a single movq load/store pair.
46060    SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
46061                                Ld->getBasePtr(), Ld->getMemOperand());
46062
46063    // Make sure new load is placed in same chain order.
46064    DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
46065    return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
46066                        St->getMemOperand());
46067  }
46068
46069  // This is similar to the above case, but here we handle a scalar 64-bit
46070  // integer store that is extracted from a vector on a 32-bit target.
46071  // If we have SSE2, then we can treat it like a floating-point double
46072  // to get past legalization. The execution dependencies fixup pass will
46073  // choose the optimal machine instruction for the store if this really is
46074  // an integer or v2f32 rather than an f64.
46075  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
46076      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
46077    SDValue OldExtract = St->getOperand(1);
46078    SDValue ExtOp0 = OldExtract.getOperand(0);
46079    unsigned VecSize = ExtOp0.getValueSizeInBits();
46080    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
46081    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
46082    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
46083                                     BitCast, OldExtract.getOperand(1));
46084    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
46085                        St->getPointerInfo(), St->getOriginalAlign(),
46086                        St->getMemOperand()->getFlags());
46087  }
46088
46089  return SDValue();
46090}
46091
46092static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
46093                                     TargetLowering::DAGCombinerInfo &DCI,
46094                                     const X86Subtarget &Subtarget) {
46095  auto *St = cast<MemIntrinsicSDNode>(N);
46096
46097  SDValue StoredVal = N->getOperand(1);
46098  MVT VT = StoredVal.getSimpleValueType();
46099  EVT MemVT = St->getMemoryVT();
46100
46101  // Figure out which elements we demand.
46102  unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
46103  APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
46104
46105  APInt KnownUndef, KnownZero;
46106  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46107  if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
46108                                     KnownZero, DCI)) {
46109    if (N->getOpcode() != ISD::DELETED_NODE)
46110      DCI.AddToWorklist(N);
46111    return SDValue(N, 0);
46112  }
46113
46114  return SDValue();
46115}
46116
46117/// Return 'true' if this vector operation is "horizontal"
46118/// and return the operands for the horizontal operation in LHS and RHS.  A
46119/// horizontal operation performs the binary operation on successive elements
46120/// of its first operand, then on successive elements of its second operand,
46121/// returning the resulting values in a vector.  For example, if
46122///   A = < float a0, float a1, float a2, float a3 >
46123/// and
46124///   B = < float b0, float b1, float b2, float b3 >
46125/// then the result of doing a horizontal operation on A and B is
46126///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
46127/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
46128/// A horizontal-op B, for some already available A and B, and if so then LHS is
46129/// set to A, RHS to B, and the routine returns 'true'.
46130static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
46131                              SelectionDAG &DAG, const X86Subtarget &Subtarget,
46132                              bool IsCommutative,
46133                              SmallVectorImpl<int> &PostShuffleMask) {
46134  // If either operand is undef, bail out. The binop should be simplified.
46135  if (LHS.isUndef() || RHS.isUndef())
46136    return false;
46137
46138  // Look for the following pattern:
46139  //   A = < float a0, float a1, float a2, float a3 >
46140  //   B = < float b0, float b1, float b2, float b3 >
46141  // and
46142  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
46143  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
46144  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
46145  // which is A horizontal-op B.
46146
46147  MVT VT = LHS.getSimpleValueType();
46148  assert((VT.is128BitVector() || VT.is256BitVector()) &&
46149         "Unsupported vector type for horizontal add/sub");
46150  unsigned NumElts = VT.getVectorNumElements();
46151
46152  auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
46153                        SmallVectorImpl<int> &ShuffleMask) {
46154    bool UseSubVector = false;
46155    if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46156        Op.getOperand(0).getValueType().is256BitVector() &&
46157        llvm::isNullConstant(Op.getOperand(1))) {
46158      Op = Op.getOperand(0);
46159      UseSubVector = true;
46160    }
46161    SmallVector<SDValue, 2> SrcOps;
46162    SmallVector<int, 16> SrcMask, ScaledMask;
46163    SDValue BC = peekThroughBitcasts(Op);
46164    if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
46165        !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
46166          return Op.getValueSizeInBits() == BC.getValueSizeInBits();
46167        })) {
46168      resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
46169      if (!UseSubVector && SrcOps.size() <= 2 &&
46170          scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
46171        N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
46172        N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
46173        ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
46174      }
46175      if (UseSubVector && SrcOps.size() == 1 &&
46176          scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
46177        std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
46178        ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
46179        ShuffleMask.assign(Mask.begin(), Mask.end());
46180      }
46181    }
46182  };
46183
46184  // View LHS in the form
46185  //   LHS = VECTOR_SHUFFLE A, B, LMask
46186  // If LHS is not a shuffle, then pretend it is the identity shuffle:
46187  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
46188  // NOTE: A default initialized SDValue represents an UNDEF of type VT.
46189  SDValue A, B;
46190  SmallVector<int, 16> LMask;
46191  GetShuffle(LHS, A, B, LMask);
46192
46193  // Likewise, view RHS in the form
46194  //   RHS = VECTOR_SHUFFLE C, D, RMask
46195  SDValue C, D;
46196  SmallVector<int, 16> RMask;
46197  GetShuffle(RHS, C, D, RMask);
46198
46199  // At least one of the operands should be a vector shuffle.
46200  unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
46201  if (NumShuffles == 0)
46202    return false;
46203
46204  if (LMask.empty()) {
46205    A = LHS;
46206    for (unsigned i = 0; i != NumElts; ++i)
46207      LMask.push_back(i);
46208  }
46209
46210  if (RMask.empty()) {
46211    C = RHS;
46212    for (unsigned i = 0; i != NumElts; ++i)
46213      RMask.push_back(i);
46214  }
46215
  // If we have a unary mask, ensure the other op is set to null.
46217  if (isUndefOrInRange(LMask, 0, NumElts))
46218    B = SDValue();
46219  else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
46220    A = SDValue();
46221
46222  if (isUndefOrInRange(RMask, 0, NumElts))
46223    D = SDValue();
46224  else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
46225    C = SDValue();
46226
46227  // If A and B occur in reverse order in RHS, then canonicalize by commuting
46228  // RHS operands and shuffle mask.
46229  if (A != C) {
46230    std::swap(C, D);
46231    ShuffleVectorSDNode::commuteMask(RMask);
46232  }
46233  // Check that the shuffles are both shuffling the same vectors.
46234  if (!(A == C && B == D))
46235    return false;
46236
46237  PostShuffleMask.clear();
46238  PostShuffleMask.append(NumElts, SM_SentinelUndef);
46239
46240  // LHS and RHS are now:
46241  //   LHS = shuffle A, B, LMask
46242  //   RHS = shuffle A, B, RMask
46243  // Check that the masks correspond to performing a horizontal operation.
46244  // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
46245  // so we just repeat the inner loop if this is a 256-bit op.
46246  unsigned Num128BitChunks = VT.getSizeInBits() / 128;
46247  unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
46248  unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
46249  assert((NumEltsPer128BitChunk % 2 == 0) &&
46250         "Vector type should have an even number of elements in each lane");
46251  for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
46252    for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
46253      // Ignore undefined components.
46254      int LIdx = LMask[i + j], RIdx = RMask[i + j];
46255      if (LIdx < 0 || RIdx < 0 ||
46256          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
46257          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
46258        continue;
46259
46260      // Check that successive odd/even elements are being operated on. If not,
46261      // this is not a horizontal operation.
46262      if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
46263          !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
46264        return false;
46265
46266      // Compute the post-shuffle mask index based on where the element
46267      // is stored in the HOP result, and where it needs to be moved to.
46268      int Base = LIdx & ~1u;
46269      int Index = ((Base % NumEltsPer128BitChunk) / 2) +
46270                  ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
46271
      // The low half of the 128-bit result must choose from A.
46273      // The high half of the 128-bit result must choose from B,
46274      // unless B is undef. In that case, we are always choosing from A.
46275      if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
46276        Index += NumEltsPer64BitChunk;
46277      PostShuffleMask[i + j] = Index;
46278    }
46279  }
46280
46281  SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
46282  SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
46283
46284  bool IsIdentityPostShuffle =
46285      isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
46286  if (IsIdentityPostShuffle)
46287    PostShuffleMask.clear();
46288
  // Avoid 128-bit multi-lane shuffles if pre-AVX2 and FP (integer will split).
46290  if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
46291      isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
46292    return false;
46293
46294  // If the source nodes are already used in HorizOps then always accept this.
46295  // Shuffle folding should merge these back together.
46296  bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
46297    return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46298  });
46299  bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
46300    return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46301  });
46302  bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
46303
46304  // Assume a SingleSource HOP if we only shuffle one input and don't need to
46305  // shuffle the result.
46306  if (!ForceHorizOp &&
46307      !shouldUseHorizontalOp(NewLHS == NewRHS &&
46308                                 (NumShuffles < 2 || !IsIdentityPostShuffle),
46309                             DAG, Subtarget))
46310    return false;
46311
46312  LHS = DAG.getBitcast(VT, NewLHS);
46313  RHS = DAG.getBitcast(VT, NewRHS);
46314  return true;
46315}
46316
46317// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
46318static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
46319                                         const X86Subtarget &Subtarget) {
46320  EVT VT = N->getValueType(0);
46321  unsigned Opcode = N->getOpcode();
46322  bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
46323  SmallVector<int, 8> PostShuffleMask;
46324
46325  switch (Opcode) {
46326  case ISD::FADD:
46327  case ISD::FSUB:
46328    if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
46329        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
46330      SDValue LHS = N->getOperand(0);
46331      SDValue RHS = N->getOperand(1);
46332      auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
46333      if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46334                            PostShuffleMask)) {
46335        SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
46336        if (!PostShuffleMask.empty())
46337          HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46338                                            DAG.getUNDEF(VT), PostShuffleMask);
46339        return HorizBinOp;
46340      }
46341    }
46342    break;
46343  case ISD::ADD:
46344  case ISD::SUB:
46345    if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
46346                                 VT == MVT::v16i16 || VT == MVT::v8i32)) {
46347      SDValue LHS = N->getOperand(0);
46348      SDValue RHS = N->getOperand(1);
46349      auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
46350      if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46351                            PostShuffleMask)) {
46352        auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
46353                                        ArrayRef<SDValue> Ops) {
46354          return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
46355        };
46356        SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
46357                                              {LHS, RHS}, HOpBuilder);
46358        if (!PostShuffleMask.empty())
46359          HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46360                                            DAG.getUNDEF(VT), PostShuffleMask);
46361        return HorizBinOp;
46362      }
46363    }
46364    break;
46365  }
46366
46367  return SDValue();
46368}
46369
46370/// Do target-specific dag combines on floating-point adds/subs.
46371static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
46372                               const X86Subtarget &Subtarget) {
46373  if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
46374    return HOp;
46375  return SDValue();
46376}
46377
46378/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
46379/// the codegen.
46380/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
46381/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
46382///       anything that is guaranteed to be transformed by DAGCombiner.
46383static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
46384                                          const X86Subtarget &Subtarget,
46385                                          const SDLoc &DL) {
46386  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
46387  SDValue Src = N->getOperand(0);
46388  unsigned SrcOpcode = Src.getOpcode();
46389  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46390
46391  EVT VT = N->getValueType(0);
46392  EVT SrcVT = Src.getValueType();
46393
46394  auto IsFreeTruncation = [VT](SDValue Op) {
46395    unsigned TruncSizeInBits = VT.getScalarSizeInBits();
46396
46397    // See if this has been extended from a smaller/equal size to
46398    // the truncation size, allowing a truncation to combine with the extend.
46399    unsigned Opcode = Op.getOpcode();
46400    if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
46401         Opcode == ISD::ZERO_EXTEND) &&
46402        Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
46403      return true;
46404
    // See if this is a single-use constant which can be constant folded.
    // NOTE: We don't peek through bitcasts here because there is currently
    // no support for constant folding truncate+bitcast+vector_of_constants. So
    // we'll just end up with a truncate on both operands which will
    // get turned back into (truncate (binop)), causing an infinite loop.
46410    return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
46411  };
46412
46413  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
46414    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
46415    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
46416    return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
46417  };
46418
46419  // Don't combine if the operation has other uses.
46420  if (!Src.hasOneUse())
46421    return SDValue();
46422
46423  // Only support vector truncation for now.
46424  // TODO: i64 scalar math would benefit as well.
46425  if (!VT.isVector())
46426    return SDValue();
46427
  // In most cases it's only worth pre-truncating if we're only facing the cost
  // of one truncation, i.e. if one of the inputs will constant fold or the
  // input is repeated.
46431  switch (SrcOpcode) {
46432  case ISD::MUL:
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
    // it's better to truncate if we have the chance.
46435    if (SrcVT.getScalarType() == MVT::i64 &&
46436        TLI.isOperationLegal(SrcOpcode, VT) &&
46437        !TLI.isOperationLegal(SrcOpcode, SrcVT))
46438      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
46439    LLVM_FALLTHROUGH;
46440  case ISD::AND:
46441  case ISD::XOR:
46442  case ISD::OR:
46443  case ISD::ADD:
46444  case ISD::SUB: {
46445    SDValue Op0 = Src.getOperand(0);
46446    SDValue Op1 = Src.getOperand(1);
46447    if (TLI.isOperationLegal(SrcOpcode, VT) &&
46448        (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
46449      return TruncateArithmetic(Op0, Op1);
46450    break;
46451  }
46452  }
46453
46454  return SDValue();
46455}
46456
46457/// Truncate using ISD::AND mask and X86ISD::PACKUS.
46458/// e.g. trunc <8 x i32> X to <8 x i16> -->
46459/// MaskX = X & 0xffff (clear high bits to prevent saturation)
46460/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
46461static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
46462                                                 const X86Subtarget &Subtarget,
46463                                                 SelectionDAG &DAG) {
46464  SDValue In = N->getOperand(0);
46465  EVT InVT = In.getValueType();
46466  EVT OutVT = N->getValueType(0);
46467
46468  APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
46469                                    OutVT.getScalarSizeInBits());
46470  In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
46471  return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
46472}
46473
46474/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
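/// For illustration, mirroring the PACKUS helper above, the expected shape is:
/// e.g. trunc <8 x i32> X to <8 x i16> -->
/// SExtX = sign_extend_inreg X, i16 (so PACKSS saturation is a no-op)
/// packss (extract_subv SExtX, 0), (extract_subv SExtX, 1)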
46475static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
46476                                                 const X86Subtarget &Subtarget,
46477                                                 SelectionDAG &DAG) {
46478  SDValue In = N->getOperand(0);
46479  EVT InVT = In.getValueType();
46480  EVT OutVT = N->getValueType(0);
46481  In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
46482                   DAG.getValueType(OutVT));
46483  return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
46484}
46485
46486/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
46487/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
46488/// legalization the truncation will be translated into a BUILD_VECTOR with each
46489/// element that is extracted from a vector and then truncated, and it is
46490/// difficult to do this optimization based on them.
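/// For illustration, with SSE4.1 a trunc <8 x i32> X to <8 x i16> becomes a
/// mask-then-PACKUS of the two <4 x i32> halves (see the helpers above) rather
/// than eight per-element extract/truncate steps.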
46491static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
46492                                       const X86Subtarget &Subtarget) {
46493  EVT OutVT = N->getValueType(0);
46494  if (!OutVT.isVector())
46495    return SDValue();
46496
46497  SDValue In = N->getOperand(0);
46498  if (!In.getValueType().isSimple())
46499    return SDValue();
46500
46501  EVT InVT = In.getValueType();
46502  unsigned NumElems = OutVT.getVectorNumElements();
46503
46504  // AVX512 provides fast truncate ops.
46505  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46506    return SDValue();
46507
46508  EVT OutSVT = OutVT.getVectorElementType();
46509  EVT InSVT = InVT.getVectorElementType();
46510  if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
46511        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
46512        NumElems >= 8))
46513    return SDValue();
46514
  // SSSE3's pshufb results in fewer instructions in the cases below.
46516  if (Subtarget.hasSSSE3() && NumElems == 8 &&
46517      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
46518       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
46519    return SDValue();
46520
46521  SDLoc DL(N);
46522  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
46523  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
46524  // truncate 2 x v4i32 to v8i16.
46525  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
46526    return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
46527  if (InSVT == MVT::i32)
46528    return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
46529
46530  return SDValue();
46531}
46532
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
/// X86ISD::PACKSS/PACKUS operations.
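/// For illustration (the bit counts come from ComputeNumSignBits /
/// computeKnownBits below):
///   trunc (v8i32 sext_in_reg X, i16) to v8i16   --> PACKSS on the two halves
///   trunc (v8i32 and X, splat(0xffff)) to v8i16 --> PACKUS on the two halves
///                                                   (needs SSE4.1 here)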
46536static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
46537                                               SelectionDAG &DAG,
46538                                               const X86Subtarget &Subtarget) {
46539  // Requires SSE2.
46540  if (!Subtarget.hasSSE2())
46541    return SDValue();
46542
46543  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
46544    return SDValue();
46545
46546  SDValue In = N->getOperand(0);
46547  if (!In.getValueType().isSimple())
46548    return SDValue();
46549
46550  MVT VT = N->getValueType(0).getSimpleVT();
46551  MVT SVT = VT.getScalarType();
46552
46553  MVT InVT = In.getValueType().getSimpleVT();
46554  MVT InSVT = InVT.getScalarType();
46555
46556  // Check we have a truncation suited for PACKSS/PACKUS.
46557  if (!isPowerOf2_32(VT.getVectorNumElements()))
46558    return SDValue();
46559  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
46560    return SDValue();
46561  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
46562    return SDValue();
46563
46564  // Truncation to sub-128bit vXi32 can be better handled with shuffles.
46565  if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
46566    return SDValue();
46567
46568  // AVX512 has fast truncate, but if the input is already going to be split,
46569  // there's no harm in trying pack.
46570  if (Subtarget.hasAVX512() &&
46571      !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
46572        InVT.is512BitVector())) {
46573    // PACK should still be worth it for 128-bit vectors if the sources were
46574    // originally concatenated from subvectors.
46575    SmallVector<SDValue> ConcatOps;
46576    if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
      return SDValue();
46578  }
46579
46580  unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
46581  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
46582
46583  // Use PACKUS if the input has zero-bits that extend all the way to the
46584  // packed/truncated value. e.g. masks, zext_in_reg, etc.
46585  KnownBits Known = DAG.computeKnownBits(In);
46586  unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
46587  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
46588    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
46589
46590  // Use PACKSS if the input has sign-bits that extend all the way to the
46591  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
46592  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
46593
46594  // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
46595  // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
46596  // on and combines/simplifications can't then use it.
46597  if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
46598    return SDValue();
46599
46600  unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
46601  if (NumSignBits > MinSignBits)
46602    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
46603
46604  // If we have a srl that only generates signbits that we will discard in
46605  // the truncation then we can use PACKSS by converting the srl to a sra.
46606  // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
46607  if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
46608    if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
46609            In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
46610      if (*ShAmt == MinSignBits) {
46611        SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
46612        return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
46613                                      Subtarget);
46614      }
46615    }
46616
46617  return SDValue();
46618}
46619
46620// Try to form a MULHU or MULHS node by looking for
46621// (trunc (srl (mul ext, ext), 16))
46622// TODO: This is X86 specific because we want to be able to handle wide types
46623// before type legalization. But we can only do it if the vector will be
46624// legalized via widening/splitting. Type legalization can't handle promotion
46625// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
46626// combiner.
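// For illustration (v8i16 picked arbitrarily):
//   (v8i16 (trunc (srl (mul (zext v8i16 A to v8i32),
//                           (zext v8i16 B to v8i32)), 16)))
//   --> (v8i16 (mulhu A, B))   [mulhs when the extends are sign_extend]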
46627static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
46628                            SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46629  // First instruction should be a right shift of a multiply.
46630  if (Src.getOpcode() != ISD::SRL ||
46631      Src.getOperand(0).getOpcode() != ISD::MUL)
46632    return SDValue();
46633
46634  if (!Subtarget.hasSSE2())
46635    return SDValue();
46636
46637  // Only handle vXi16 types that are at least 128-bits unless they will be
46638  // widened.
46639  if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
46640    return SDValue();
46641
46642  // Input type should be at least vXi32.
46643  EVT InVT = Src.getValueType();
46644  if (InVT.getVectorElementType().getSizeInBits() < 32)
46645    return SDValue();
46646
46647  // Need a shift by 16.
46648  APInt ShiftAmt;
46649  if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
46650      ShiftAmt != 16)
46651    return SDValue();
46652
46653  SDValue LHS = Src.getOperand(0).getOperand(0);
46654  SDValue RHS = Src.getOperand(0).getOperand(1);
46655
46656  unsigned ExtOpc = LHS.getOpcode();
46657  if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
46658      RHS.getOpcode() != ExtOpc)
46659    return SDValue();
46660
46661  // Peek through the extends.
46662  LHS = LHS.getOperand(0);
46663  RHS = RHS.getOperand(0);
46664
46665  // Ensure the input types match.
46666  if (LHS.getValueType() != VT || RHS.getValueType() != VT)
46667    return SDValue();
46668
46669  unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
46670  return DAG.getNode(Opc, DL, VT, LHS, RHS);
46671}
46672
46673// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
46674// from one vector with signed bytes from another vector, adds together
46675// adjacent pairs of 16-bit products, and saturates the result before
46676// truncating to 16-bits.
46677//
46678// Which looks something like this:
46679// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
//                 (mul (zext (odd elts (i8 A))), (sext (odd elts (i8 B)))))))
46681static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
46682                               const X86Subtarget &Subtarget,
46683                               const SDLoc &DL) {
46684  if (!VT.isVector() || !Subtarget.hasSSSE3())
46685    return SDValue();
46686
46687  unsigned NumElems = VT.getVectorNumElements();
46688  EVT ScalarVT = VT.getVectorElementType();
46689  if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
46690    return SDValue();
46691
46692  SDValue SSatVal = detectSSatPattern(In, VT);
46693  if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
46694    return SDValue();
46695
46696  // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
46697  // of multiplies from even/odd elements.
46698  SDValue N0 = SSatVal.getOperand(0);
46699  SDValue N1 = SSatVal.getOperand(1);
46700
46701  if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
46702    return SDValue();
46703
46704  SDValue N00 = N0.getOperand(0);
46705  SDValue N01 = N0.getOperand(1);
46706  SDValue N10 = N1.getOperand(0);
46707  SDValue N11 = N1.getOperand(1);
46708
46709  // TODO: Handle constant vectors and use knownbits/computenumsignbits?
46710  // Canonicalize zero_extend to LHS.
46711  if (N01.getOpcode() == ISD::ZERO_EXTEND)
46712    std::swap(N00, N01);
46713  if (N11.getOpcode() == ISD::ZERO_EXTEND)
46714    std::swap(N10, N11);
46715
46716  // Ensure we have a zero_extend and a sign_extend.
46717  if (N00.getOpcode() != ISD::ZERO_EXTEND ||
46718      N01.getOpcode() != ISD::SIGN_EXTEND ||
46719      N10.getOpcode() != ISD::ZERO_EXTEND ||
46720      N11.getOpcode() != ISD::SIGN_EXTEND)
46721    return SDValue();
46722
46723  // Peek through the extends.
46724  N00 = N00.getOperand(0);
46725  N01 = N01.getOperand(0);
46726  N10 = N10.getOperand(0);
46727  N11 = N11.getOperand(0);
46728
46729  // Ensure the extend is from vXi8.
46730  if (N00.getValueType().getVectorElementType() != MVT::i8 ||
46731      N01.getValueType().getVectorElementType() != MVT::i8 ||
46732      N10.getValueType().getVectorElementType() != MVT::i8 ||
46733      N11.getValueType().getVectorElementType() != MVT::i8)
46734    return SDValue();
46735
46736  // All inputs should be build_vectors.
46737  if (N00.getOpcode() != ISD::BUILD_VECTOR ||
46738      N01.getOpcode() != ISD::BUILD_VECTOR ||
46739      N10.getOpcode() != ISD::BUILD_VECTOR ||
46740      N11.getOpcode() != ISD::BUILD_VECTOR)
46741    return SDValue();
46742
46743  // N00/N10 are zero extended. N01/N11 are sign extended.
46744
46745  // For each element, we need to ensure we have an odd element from one vector
46746  // multiplied by the odd element of another vector and the even element from
46747  // one of the same vectors being multiplied by the even element from the
  //  other vector. So we need to make sure for each element i, this operation
  //  is being performed:
46750  //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
46751  SDValue ZExtIn, SExtIn;
46752  for (unsigned i = 0; i != NumElems; ++i) {
46753    SDValue N00Elt = N00.getOperand(i);
46754    SDValue N01Elt = N01.getOperand(i);
46755    SDValue N10Elt = N10.getOperand(i);
46756    SDValue N11Elt = N11.getOperand(i);
46757    // TODO: Be more tolerant to undefs.
46758    if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
46759        N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
46760        N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
46761        N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
46762      return SDValue();
46763    auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
46764    auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
46765    auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
46766    auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
46767    if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
46768      return SDValue();
46769    unsigned IdxN00 = ConstN00Elt->getZExtValue();
46770    unsigned IdxN01 = ConstN01Elt->getZExtValue();
46771    unsigned IdxN10 = ConstN10Elt->getZExtValue();
46772    unsigned IdxN11 = ConstN11Elt->getZExtValue();
46773    // Add is commutative so indices can be reordered.
46774    if (IdxN00 > IdxN10) {
46775      std::swap(IdxN00, IdxN10);
46776      std::swap(IdxN01, IdxN11);
46777    }
    // N0 indices must be the even element. N1 indices must be the next odd
    // element.
46779    if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
46780        IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
46781      return SDValue();
46782    SDValue N00In = N00Elt.getOperand(0);
46783    SDValue N01In = N01Elt.getOperand(0);
46784    SDValue N10In = N10Elt.getOperand(0);
46785    SDValue N11In = N11Elt.getOperand(0);
    // The first time we find an input, capture it.
46787    if (!ZExtIn) {
46788      ZExtIn = N00In;
46789      SExtIn = N01In;
46790    }
46791    if (ZExtIn != N00In || SExtIn != N01In ||
46792        ZExtIn != N10In || SExtIn != N11In)
46793      return SDValue();
46794  }
46795
46796  auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46797                         ArrayRef<SDValue> Ops) {
46798    // Shrink by adding truncate nodes and let DAGCombine fold with the
46799    // sources.
46800    EVT InVT = Ops[0].getValueType();
46801    assert(InVT.getScalarType() == MVT::i8 &&
46802           "Unexpected scalar element type");
46803    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
46804    EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
46805                                 InVT.getVectorNumElements() / 2);
46806    return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
46807  };
46808  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
46809                          PMADDBuilder);
46810}
46811
46812static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
46813                               const X86Subtarget &Subtarget) {
46814  EVT VT = N->getValueType(0);
46815  SDValue Src = N->getOperand(0);
46816  SDLoc DL(N);
46817
46818  // Attempt to pre-truncate inputs to arithmetic ops instead.
46819  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
46820    return V;
46821
46822  // Try to detect AVG pattern first.
46823  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
46824    return Avg;
46825
46826  // Try to detect PMADD
46827  if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
46828    return PMAdd;
46829
46830  // Try to combine truncation with signed/unsigned saturation.
46831  if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
46832    return Val;
46833
46834  // Try to combine PMULHUW/PMULHW for vXi16.
46835  if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
46836    return V;
46837
  // Detect truncation to i32 of a bitcast from x86mmx;
  // the bitcast source is a direct mmx result.
46840  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
46841    SDValue BCSrc = Src.getOperand(0);
46842    if (BCSrc.getValueType() == MVT::x86mmx)
46843      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
46844  }
46845
46846  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
46847  if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
46848    return V;
46849
46850  return combineVectorTruncation(N, DAG, Subtarget);
46851}
46852
46853static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
46854                             TargetLowering::DAGCombinerInfo &DCI) {
46855  EVT VT = N->getValueType(0);
46856  SDValue In = N->getOperand(0);
46857  SDLoc DL(N);
46858
46859  if (auto SSatVal = detectSSatPattern(In, VT))
46860    return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
46861  if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
46862    return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
46863
46864  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46865  APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
46866  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
46867    return SDValue(N, 0);
46868
46869  return SDValue();
46870}
46871
46872/// Returns the negated value if the node \p N flips sign of FP value.
46873///
46874/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
/// or FSUB(0, x).
46876/// AVX512F does not have FXOR, so FNEG is lowered as
46877/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
46879/// This also recognizes splat of a negated value and returns the splat of that
46880/// value.
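/// e.g. isFNEG(FXOR(X, splat(0x80000000))) returns X for a v4f32 X.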
46881static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
46882  if (N->getOpcode() == ISD::FNEG)
46883    return N->getOperand(0);
46884
46885  // Don't recurse exponentially.
46886  if (Depth > SelectionDAG::MaxRecursionDepth)
46887    return SDValue();
46888
46889  unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
46890
46891  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
46892  EVT VT = Op->getValueType(0);
46893
46894  // Make sure the element size doesn't change.
46895  if (VT.getScalarSizeInBits() != ScalarSize)
46896    return SDValue();
46897
46898  unsigned Opc = Op.getOpcode();
46899  switch (Opc) {
46900  case ISD::VECTOR_SHUFFLE: {
46901    // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
46902    // of this is VECTOR_SHUFFLE(-VEC1, UNDEF).  The mask can be anything here.
46903    if (!Op.getOperand(1).isUndef())
46904      return SDValue();
46905    if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
46906      if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
46907        return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
46908                                    cast<ShuffleVectorSDNode>(Op)->getMask());
46909    break;
46910  }
46911  case ISD::INSERT_VECTOR_ELT: {
46912    // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
46913    // -V, INDEX).
46914    SDValue InsVector = Op.getOperand(0);
46915    SDValue InsVal = Op.getOperand(1);
46916    if (!InsVector.isUndef())
46917      return SDValue();
46918    if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
46919      if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
46920        return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
46921                           NegInsVal, Op.getOperand(2));
46922    break;
46923  }
46924  case ISD::FSUB:
46925  case ISD::XOR:
46926  case X86ISD::FXOR: {
46927    SDValue Op1 = Op.getOperand(1);
46928    SDValue Op0 = Op.getOperand(0);
46929
46930    // For XOR and FXOR, we want to check if constant
46931    // bits of Op1 are sign bit masks. For FSUB, we
46932    // have to check if constant bits of Op0 are sign
46933    // bit masks and hence we swap the operands.
46934    if (Opc == ISD::FSUB)
46935      std::swap(Op0, Op1);
46936
46937    APInt UndefElts;
46938    SmallVector<APInt, 16> EltBits;
46939    // Extract constant bits and see if they are all
46940    // sign bit masks. Ignore the undef elements.
46941    if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
46942                                      /* AllowWholeUndefs */ true,
46943                                      /* AllowPartialUndefs */ false)) {
46944      for (unsigned I = 0, E = EltBits.size(); I < E; I++)
46945        if (!UndefElts[I] && !EltBits[I].isSignMask())
46946          return SDValue();
46947
46948      return peekThroughBitcasts(Op0);
46949    }
46950  }
46951  }
46952
46953  return SDValue();
46954}
46955
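// Return the FMA-family opcode obtained from \p Opcode by negating the product
// (NegMul), the accumulator (NegAcc) and/or the final result (NegRes).
// e.g. negateFMAOpcode(ISD::FMA, /*NegMul=*/true, /*NegAcc=*/false,
//                      /*NegRes=*/false) returns X86ISD::FNMADD (see the
// tables below).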
46956static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
46957                                bool NegRes) {
46958  if (NegMul) {
46959    switch (Opcode) {
46960    default: llvm_unreachable("Unexpected opcode");
46961    case ISD::FMA:              Opcode = X86ISD::FNMADD;        break;
46962    case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FNMADD; break;
46963    case X86ISD::FMADD_RND:     Opcode = X86ISD::FNMADD_RND;    break;
46964    case X86ISD::FMSUB:         Opcode = X86ISD::FNMSUB;        break;
46965    case X86ISD::STRICT_FMSUB:  Opcode = X86ISD::STRICT_FNMSUB; break;
46966    case X86ISD::FMSUB_RND:     Opcode = X86ISD::FNMSUB_RND;    break;
46967    case X86ISD::FNMADD:        Opcode = ISD::FMA;              break;
46968    case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA;       break;
46969    case X86ISD::FNMADD_RND:    Opcode = X86ISD::FMADD_RND;     break;
46970    case X86ISD::FNMSUB:        Opcode = X86ISD::FMSUB;         break;
46971    case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB;  break;
46972    case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FMSUB_RND;     break;
46973    }
46974  }
46975
46976  if (NegAcc) {
46977    switch (Opcode) {
46978    default: llvm_unreachable("Unexpected opcode");
46979    case ISD::FMA:              Opcode = X86ISD::FMSUB;         break;
46980    case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FMSUB;  break;
46981    case X86ISD::FMADD_RND:     Opcode = X86ISD::FMSUB_RND;     break;
46982    case X86ISD::FMSUB:         Opcode = ISD::FMA;              break;
46983    case X86ISD::STRICT_FMSUB:  Opcode = ISD::STRICT_FMA;       break;
46984    case X86ISD::FMSUB_RND:     Opcode = X86ISD::FMADD_RND;     break;
46985    case X86ISD::FNMADD:        Opcode = X86ISD::FNMSUB;        break;
46986    case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
46987    case X86ISD::FNMADD_RND:    Opcode = X86ISD::FNMSUB_RND;    break;
46988    case X86ISD::FNMSUB:        Opcode = X86ISD::FNMADD;        break;
46989    case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
46990    case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FNMADD_RND;    break;
46991    case X86ISD::FMADDSUB:      Opcode = X86ISD::FMSUBADD;      break;
46992    case X86ISD::FMADDSUB_RND:  Opcode = X86ISD::FMSUBADD_RND;  break;
46993    case X86ISD::FMSUBADD:      Opcode = X86ISD::FMADDSUB;      break;
46994    case X86ISD::FMSUBADD_RND:  Opcode = X86ISD::FMADDSUB_RND;  break;
46995    }
46996  }
46997
46998  if (NegRes) {
46999    switch (Opcode) {
    // For accuracy reasons, we never combine fneg and fma under strict FP.
47001    default: llvm_unreachable("Unexpected opcode");
47002    case ISD::FMA:             Opcode = X86ISD::FNMSUB;       break;
47003    case X86ISD::FMADD_RND:    Opcode = X86ISD::FNMSUB_RND;   break;
47004    case X86ISD::FMSUB:        Opcode = X86ISD::FNMADD;       break;
47005    case X86ISD::FMSUB_RND:    Opcode = X86ISD::FNMADD_RND;   break;
47006    case X86ISD::FNMADD:       Opcode = X86ISD::FMSUB;        break;
47007    case X86ISD::FNMADD_RND:   Opcode = X86ISD::FMSUB_RND;    break;
47008    case X86ISD::FNMSUB:       Opcode = ISD::FMA;             break;
47009    case X86ISD::FNMSUB_RND:   Opcode = X86ISD::FMADD_RND;    break;
47010    }
47011  }
47012
47013  return Opcode;
47014}
47015
47016/// Do target-specific dag combines on floating point negations.
47017static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
47018                           TargetLowering::DAGCombinerInfo &DCI,
47019                           const X86Subtarget &Subtarget) {
47020  EVT OrigVT = N->getValueType(0);
47021  SDValue Arg = isFNEG(DAG, N);
47022  if (!Arg)
47023    return SDValue();
47024
47025  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47026  EVT VT = Arg.getValueType();
47027  EVT SVT = VT.getScalarType();
47028  SDLoc DL(N);
47029
47030  // Let legalize expand this if it isn't a legal type yet.
47031  if (!TLI.isTypeLegal(VT))
47032    return SDValue();
47033
47034  // If we're negating a FMUL node on a target with FMA, then we can avoid the
47035  // use of a constant by performing (-0 - A*B) instead.
47036  // FIXME: Check rounding control flags as well once it becomes available.
47037  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
47038      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
47039    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
47040    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
47041                                  Arg.getOperand(1), Zero);
47042    return DAG.getBitcast(OrigVT, NewNode);
47043  }
47044
47045  bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
47046  bool LegalOperations = !DCI.isBeforeLegalizeOps();
47047  if (SDValue NegArg =
47048          TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
47049    return DAG.getBitcast(OrigVT, NegArg);
47050
47051  return SDValue();
47052}
47053
47054SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
47055                                                bool LegalOperations,
47056                                                bool ForCodeSize,
47057                                                NegatibleCost &Cost,
47058                                                unsigned Depth) const {
47059  // fneg patterns are removable even if they have multiple uses.
47060  if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
47061    Cost = NegatibleCost::Cheaper;
47062    return DAG.getBitcast(Op.getValueType(), Arg);
47063  }
47064
47065  EVT VT = Op.getValueType();
47066  EVT SVT = VT.getScalarType();
47067  unsigned Opc = Op.getOpcode();
47068  SDNodeFlags Flags = Op.getNode()->getFlags();
47069  switch (Opc) {
47070  case ISD::FMA:
47071  case X86ISD::FMSUB:
47072  case X86ISD::FNMADD:
47073  case X86ISD::FNMSUB:
47074  case X86ISD::FMADD_RND:
47075  case X86ISD::FMSUB_RND:
47076  case X86ISD::FNMADD_RND:
47077  case X86ISD::FNMSUB_RND: {
47078    if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
47079        !(SVT == MVT::f32 || SVT == MVT::f64) ||
47080        !isOperationLegal(ISD::FMA, VT))
47081      break;
47082
47083    // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
47084    // if it may have signed zeros.
47085    if (!Flags.hasNoSignedZeros())
47086      break;
47087
47088    // This is always negatible for free but we might be able to remove some
47089    // extra operand negations as well.
47090    SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
47091    for (int i = 0; i != 3; ++i)
47092      NewOps[i] = getCheaperNegatedExpression(
47093          Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
47094
47095    bool NegA = !!NewOps[0];
47096    bool NegB = !!NewOps[1];
47097    bool NegC = !!NewOps[2];
47098    unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
47099
47100    Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
47101                                  : NegatibleCost::Neutral;
47102
47103    // Fill in the non-negated ops with the original values.
47104    for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
47105      if (!NewOps[i])
47106        NewOps[i] = Op.getOperand(i);
47107    return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
47108  }
47109  case X86ISD::FRCP:
47110    if (SDValue NegOp0 =
47111            getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
47112                                 ForCodeSize, Cost, Depth + 1))
47113      return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
47114    break;
47115  }
47116
47117  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
47118                                              ForCodeSize, Cost, Depth);
47119}
47120
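/// Lower the vector FP logic ops X86ISD::FAND/FANDN/FOR/FXOR to the equivalent
/// integer logic ops when integer vector types are available, e.g.
///   (FXOR v4f32 X, Y) --> bitcast (xor v4i32 (bitcast X), (bitcast Y))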
47121static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
47122                                 const X86Subtarget &Subtarget) {
47123  MVT VT = N->getSimpleValueType(0);
47124  // If we have integer vector types available, use the integer opcodes.
47125  if (!VT.isVector() || !Subtarget.hasSSE2())
47126    return SDValue();
47127
47128  SDLoc dl(N);
47129
47130  unsigned IntBits = VT.getScalarSizeInBits();
47131  MVT IntSVT = MVT::getIntegerVT(IntBits);
47132  MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
47133
47134  SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
47135  SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
47136  unsigned IntOpcode;
47137  switch (N->getOpcode()) {
47138  default: llvm_unreachable("Unexpected FP logic op");
47139  case X86ISD::FOR:   IntOpcode = ISD::OR; break;
47140  case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
47141  case X86ISD::FAND:  IntOpcode = ISD::AND; break;
47142  case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
47143  }
47144  SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
47145  return DAG.getBitcast(VT, IntOp);
47146}
47147
47149/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
47150static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
47151  if (N->getOpcode() != ISD::XOR)
47152    return SDValue();
47153
47154  SDValue LHS = N->getOperand(0);
47155  if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
47156    return SDValue();
47157
47158  X86::CondCode NewCC = X86::GetOppositeBranchCondition(
47159      X86::CondCode(LHS->getConstantOperandVal(0)));
47160  SDLoc DL(N);
47161  return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
47162}
47163
47164static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
47165                          TargetLowering::DAGCombinerInfo &DCI,
47166                          const X86Subtarget &Subtarget) {
47167  SDValue N0 = N->getOperand(0);
47168  SDValue N1 = N->getOperand(1);
47169  EVT VT = N->getValueType(0);
47170
47171  // If this is SSE1 only convert to FXOR to avoid scalarization.
47172  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47173    return DAG.getBitcast(MVT::v4i32,
47174                          DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
47175                                      DAG.getBitcast(MVT::v4f32, N0),
47176                                      DAG.getBitcast(MVT::v4f32, N1)));
47177  }
47178
47179  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
47180    return Cmp;
47181
47182  if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47183    return R;
47184
47185  if (DCI.isBeforeLegalizeOps())
47186    return SDValue();
47187
47188  if (SDValue SetCC = foldXor1SetCC(N, DAG))
47189    return SetCC;
47190
47191  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
47192    return RV;
47193
47194  // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
47195  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47196  if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
47197      N0.getOperand(0).getValueType().isVector() &&
47198      N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
47199      TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
47200    return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
47201                                         N0.getOperand(0).getValueType()));
47202  }
47203
47204  // Handle AVX512 mask widening.
47205  // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
47206  if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
47207      VT.getVectorElementType() == MVT::i1 &&
47208      N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
47209      TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
47210    return DAG.getNode(
47211        ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
47212        DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
47213        N0.getOperand(2));
47214  }
47215
47216  // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
47217  // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
47218  // TODO: Under what circumstances could this be performed in DAGCombine?
47219  if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
47220      N0.getOperand(0).getOpcode() == N->getOpcode()) {
47221    SDValue TruncExtSrc = N0.getOperand(0);
47222    auto *N1C = dyn_cast<ConstantSDNode>(N1);
47223    auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
47224    if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
47225      SDLoc DL(N);
47226      SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
47227      SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
47228      return DAG.getNode(ISD::XOR, DL, VT, LHS,
47229                         DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
47230    }
47231  }
47232
47233  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
47234    return FPLogic;
47235
47236  return combineFneg(N, DAG, DCI, Subtarget);
47237}
47238
47239static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
47240                            TargetLowering::DAGCombinerInfo &DCI,
47241                            const X86Subtarget &Subtarget) {
47242  EVT VT = N->getValueType(0);
47243  unsigned NumBits = VT.getSizeInBits();
47244
47245  // TODO - Constant Folding.
47246
47247  // Simplify the inputs.
47248  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47249  APInt DemandedMask(APInt::getAllOnesValue(NumBits));
47250  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47251    return SDValue(N, 0);
47252
47253  return SDValue();
47254}
47255
47256static bool isNullFPScalarOrVectorConst(SDValue V) {
47257  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
47258}
47259
47260/// If a value is a scalar FP zero or a vector FP zero (potentially including
47261/// undefined elements), return a zero constant that may be used to fold away
47262/// that value. In the case of a vector, the returned constant will not contain
47263/// undefined elements even if the input parameter does. This makes it suitable
47264/// to be used as a replacement operand with operations (eg, bitwise-and) where
47265/// an undef should not propagate.
47266static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
47267                                        const X86Subtarget &Subtarget) {
47268  if (!isNullFPScalarOrVectorConst(V))
47269    return SDValue();
47270
47271  if (V.getValueType().isVector())
47272    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
47273
47274  return V;
47275}
47276
47277static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
47278                                      const X86Subtarget &Subtarget) {
47279  SDValue N0 = N->getOperand(0);
47280  SDValue N1 = N->getOperand(1);
47281  EVT VT = N->getValueType(0);
47282  SDLoc DL(N);
47283
47284  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
47285  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
47286        (VT == MVT::f64 && Subtarget.hasSSE2()) ||
47287        (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
47288    return SDValue();
47289
47290  auto isAllOnesConstantFP = [](SDValue V) {
47291    if (V.getSimpleValueType().isVector())
47292      return ISD::isBuildVectorAllOnes(V.getNode());
47293    auto *C = dyn_cast<ConstantFPSDNode>(V);
47294    return C && C->getConstantFPValue()->isAllOnesValue();
47295  };
47296
47297  // fand (fxor X, -1), Y --> fandn X, Y
47298  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
47299    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
47300
47301  // fand X, (fxor Y, -1) --> fandn Y, X
47302  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
47303    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
47304
47305  return SDValue();
47306}
47307
47308/// Do target-specific dag combines on X86ISD::FAND nodes.
47309static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
47310                           const X86Subtarget &Subtarget) {
47311  // FAND(0.0, x) -> 0.0
47312  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
47313    return V;
47314
47315  // FAND(x, 0.0) -> 0.0
47316  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47317    return V;
47318
47319  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
47320    return V;
47321
47322  return lowerX86FPLogicOp(N, DAG, Subtarget);
47323}
47324
47325/// Do target-specific dag combines on X86ISD::FANDN nodes.
47326static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
47327                            const X86Subtarget &Subtarget) {
47328  // FANDN(0.0, x) -> x
47329  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47330    return N->getOperand(1);
47331
47332  // FANDN(x, 0.0) -> 0.0
47333  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47334    return V;
47335
47336  return lowerX86FPLogicOp(N, DAG, Subtarget);
47337}
47338
47339/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
47340static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
47341                          TargetLowering::DAGCombinerInfo &DCI,
47342                          const X86Subtarget &Subtarget) {
47343  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
47344
47345  // F[X]OR(0.0, x) -> x
47346  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47347    return N->getOperand(1);
47348
47349  // F[X]OR(x, 0.0) -> x
47350  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
47351    return N->getOperand(0);
47352
47353  if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
47354    return NewVal;
47355
47356  return lowerX86FPLogicOp(N, DAG, Subtarget);
47357}
47358
47359/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
47360static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
47361  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
47362
47363  // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
47364  if (!DAG.getTarget().Options.NoNaNsFPMath ||
47365      !DAG.getTarget().Options.NoSignedZerosFPMath)
47366    return SDValue();
47367
  // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
  // into FMINC and FMAXC, which are commutative operations.
47370  unsigned NewOp = 0;
47371  switch (N->getOpcode()) {
47372    default: llvm_unreachable("unknown opcode");
47373    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
47374    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
47375  }
47376
47377  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
47378                     N->getOperand(0), N->getOperand(1));
47379}
47380
47381static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
47382                                     const X86Subtarget &Subtarget) {
47383  if (Subtarget.useSoftFloat())
47384    return SDValue();
47385
47386  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47387
47388  EVT VT = N->getValueType(0);
47389  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
47390        (Subtarget.hasSSE2() && VT == MVT::f64) ||
47391        (VT.isVector() && TLI.isTypeLegal(VT))))
47392    return SDValue();
47393
47394  SDValue Op0 = N->getOperand(0);
47395  SDValue Op1 = N->getOperand(1);
47396  SDLoc DL(N);
47397  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
47398
47399  // If we don't have to respect NaN inputs, this is a direct translation to x86
47400  // min/max instructions.
47401  if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
47402    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47403
47404  // If one of the operands is known non-NaN use the native min/max instructions
47405  // with the non-NaN input as second operand.
47406  if (DAG.isKnownNeverNaN(Op1))
47407    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47408  if (DAG.isKnownNeverNaN(Op0))
47409    return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
47410
47411  // If we have to respect NaN inputs, this takes at least 3 instructions.
47412  // Favor a library call when operating on a scalar and minimizing code size.
47413  if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
47414    return SDValue();
47415
47416  EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
47417                                         VT);
47418
47419  // There are 4 possibilities involving NaN inputs, and these are the required
47420  // outputs:
47421  //                   Op1
47422  //               Num     NaN
47423  //            ----------------
47424  //       Num  |  Max  |  Op0 |
47425  // Op0        ----------------
47426  //       NaN  |  Op1  |  NaN |
47427  //            ----------------
47428  //
47429  // The SSE FP max/min instructions were not designed for this case, but rather
47430  // to implement:
47431  //   Min = Op1 < Op0 ? Op1 : Op0
47432  //   Max = Op1 > Op0 ? Op1 : Op0
47433  //
47434  // So they always return Op0 if either input is a NaN. However, we can still
47435  // use those instructions for fmaxnum by selecting away a NaN input.
47436
47437  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
47438  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
47439  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
47440
47441  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
47442  // are NaN, the NaN value of Op1 is the result.
47443  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
47444}
47445
47446static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
47447                                   TargetLowering::DAGCombinerInfo &DCI) {
47448  EVT VT = N->getValueType(0);
47449  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47450
47451  APInt KnownUndef, KnownZero;
47452  APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
47453  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
47454                                     KnownZero, DCI))
47455    return SDValue(N, 0);
47456
47457  // Convert a full vector load into vzload when not all bits are needed.
47458  SDValue In = N->getOperand(0);
47459  MVT InVT = In.getSimpleValueType();
47460  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47461      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47462    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47463    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
47464    unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47465    MVT MemVT = MVT::getIntegerVT(NumBits);
47466    MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
47467    if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47468      SDLoc dl(N);
47469      SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
47470                                    DAG.getBitcast(InVT, VZLoad));
47471      DCI.CombineTo(N, Convert);
47472      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47473      DCI.recursivelyDeleteUnusedNodes(LN);
47474      return SDValue(N, 0);
47475    }
47476  }
47477
47478  return SDValue();
47479}
47480
47481static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
47482                                     TargetLowering::DAGCombinerInfo &DCI) {
47483  bool IsStrict = N->isTargetStrictFPOpcode();
47484  EVT VT = N->getValueType(0);
47485
47486  // Convert a full vector load into vzload when not all bits are needed.
47487  SDValue In = N->getOperand(IsStrict ? 1 : 0);
47488  MVT InVT = In.getSimpleValueType();
47489  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47490      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47491    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47492    LoadSDNode *LN = cast<LoadSDNode>(In);
47493    unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47494    MVT MemVT = MVT::getFloatingPointVT(NumBits);
47495    MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
47496    if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47497      SDLoc dl(N);
47498      if (IsStrict) {
47499        SDValue Convert =
47500            DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
47501                        {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
47502        DCI.CombineTo(N, Convert, Convert.getValue(1));
47503      } else {
47504        SDValue Convert =
47505            DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
47506        DCI.CombineTo(N, Convert);
47507      }
47508      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47509      DCI.recursivelyDeleteUnusedNodes(LN);
47510      return SDValue(N, 0);
47511    }
47512  }
47513
47514  return SDValue();
47515}
47516
47517/// Do target-specific dag combines on X86ISD::ANDNP nodes.
47518static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
47519                            TargetLowering::DAGCombinerInfo &DCI,
47520                            const X86Subtarget &Subtarget) {
47521  MVT VT = N->getSimpleValueType(0);
47522
47523  // ANDNP(0, x) -> x
47524  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
47525    return N->getOperand(1);
47526
47527  // ANDNP(x, 0) -> 0
47528  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
47529    return DAG.getConstant(0, SDLoc(N), VT);
47530
47531  // Turn ANDNP back to AND if input is inverted.
47532  if (SDValue Not = IsNOT(N->getOperand(0), DAG))
47533    return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
47534                       N->getOperand(1));
47535
47536  // Attempt to recursively combine a bitmask ANDNP with shuffles.
47537  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47538    SDValue Op(N, 0);
47539    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47540      return Res;
47541  }
47542
47543  return SDValue();
47544}
47545
47546static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
47547                         TargetLowering::DAGCombinerInfo &DCI) {
47548  SDValue N1 = N->getOperand(1);
47549
47550  // BT ignores high bits in the bit index operand.
47551  unsigned BitWidth = N1.getValueSizeInBits();
47552  APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
47553  if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
47554    if (N->getOpcode() != ISD::DELETED_NODE)
47555      DCI.AddToWorklist(N);
47556    return SDValue(N, 0);
47557  }
47558
47559  return SDValue();
47560}
47561
47562static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
47563                               TargetLowering::DAGCombinerInfo &DCI) {
47564  bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
47565  SDValue Src = N->getOperand(IsStrict ? 1 : 0);
47566
47567  if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
47568    APInt KnownUndef, KnownZero;
47569    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47570    APInt DemandedElts = APInt::getLowBitsSet(8, 4);
47571    if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
47572                                       DCI)) {
47573      if (N->getOpcode() != ISD::DELETED_NODE)
47574        DCI.AddToWorklist(N);
47575      return SDValue(N, 0);
47576    }
47577
47578    // Convert a full vector load into vzload when not all bits are needed.
47579    if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
47580      LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
47581      if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
47582        SDLoc dl(N);
47583        if (IsStrict) {
47584          SDValue Convert = DAG.getNode(
47585              N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
47586              {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
47587          DCI.CombineTo(N, Convert, Convert.getValue(1));
47588        } else {
47589          SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
47590                                        DAG.getBitcast(MVT::v8i16, VZLoad));
47591          DCI.CombineTo(N, Convert);
47592        }
47593
47594        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47595        DCI.recursivelyDeleteUnusedNodes(LN);
47596        return SDValue(N, 0);
47597      }
47598    }
47599  }
47600
47601  return SDValue();
47602}
47603
47604// Try to combine sext_in_reg of a cmov of constants by extending the constants.
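// For illustration (constants picked arbitrarily):
//   (sext_in_reg (i32 X86ISD::CMOV 7, 200, cc, eflags), i8)
//   --> (i32 X86ISD::CMOV 7, -56, cc, eflags)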
47605static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
47606  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47607
47608  EVT DstVT = N->getValueType(0);
47609
47610  SDValue N0 = N->getOperand(0);
47611  SDValue N1 = N->getOperand(1);
47612  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47613
47614  if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
47615    return SDValue();
47616
47617  // Look through single use any_extends / truncs.
47618  SDValue IntermediateBitwidthOp;
47619  if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
47620      N0.hasOneUse()) {
47621    IntermediateBitwidthOp = N0;
47622    N0 = N0.getOperand(0);
47623  }
47624
47625  // See if we have a single use cmov.
47626  if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
47627    return SDValue();
47628
47629  SDValue CMovOp0 = N0.getOperand(0);
47630  SDValue CMovOp1 = N0.getOperand(1);
47631
47632  // Make sure both operands are constants.
47633  if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
47634      !isa<ConstantSDNode>(CMovOp1.getNode()))
47635    return SDValue();
47636
47637  SDLoc DL(N);
47638
  // If we looked through an any_extend/trunc above, apply the same
  // any_extend/trunc to the constants as well.
47640  if (IntermediateBitwidthOp) {
47641    unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
47642    CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
47643    CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
47644  }
47645
47646  CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
47647  CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
47648
47649  EVT CMovVT = DstVT;
47650  // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
47651  if (DstVT == MVT::i16) {
47652    CMovVT = MVT::i32;
47653    CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
47654    CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
47655  }
47656
47657  SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
47658                             N0.getOperand(2), N0.getOperand(3));
47659
47660  if (CMovVT != DstVT)
47661    CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
47662
47663  return CMov;
47664}
47665
47666static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
47667                                      const X86Subtarget &Subtarget) {
47668  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47669
47670  if (SDValue V = combineSextInRegCmov(N, DAG))
47671    return V;
47672
47673  EVT VT = N->getValueType(0);
47674  SDValue N0 = N->getOperand(0);
47675  SDValue N1 = N->getOperand(1);
47676  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47677  SDLoc dl(N);
47678
  // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
  // both SSE and AVX2 since there is no sign-extended shift right
  // operation on a vector with 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
47684  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
47685                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
47686    SDValue N00 = N0.getOperand(0);
47687
47688    // EXTLOAD has a better solution on AVX2,
47689    // it may be replaced with X86ISD::VSEXT node.
47690    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
47691      if (!ISD::isNormalLoad(N00.getNode()))
47692        return SDValue();
47693
47694    // Attempt to promote any comparison mask ops before moving the
47695    // SIGN_EXTEND_INREG in the way.
47696    if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
47697      return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
47698
47699    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
47700      SDValue Tmp =
47701          DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
47702      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
47703    }
47704  }
47705  return SDValue();
47706}
47707
47708/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
47709/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
47710/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
47711/// opportunities to combine math ops, use an LEA, or use a complex addressing
47712/// mode. This can eliminate extend, add, and shift instructions.
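/// For illustration, on i64 (the only type handled here):
///   (i64 sext (i32 add nsw X, 37)) --> (add nsw (i64 sext X), 37)
/// so a later add/shl user can fold the whole expression into an LEA.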
47713static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
47714                                   const X86Subtarget &Subtarget) {
47715  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
47716      Ext->getOpcode() != ISD::ZERO_EXTEND)
47717    return SDValue();
47718
47719  // TODO: This should be valid for other integer types.
47720  EVT VT = Ext->getValueType(0);
47721  if (VT != MVT::i64)
47722    return SDValue();
47723
47724  SDValue Add = Ext->getOperand(0);
47725  if (Add.getOpcode() != ISD::ADD)
47726    return SDValue();
47727
47728  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
47729  bool NSW = Add->getFlags().hasNoSignedWrap();
47730  bool NUW = Add->getFlags().hasNoUnsignedWrap();
47731
47732  // We need an 'add nsw' feeding into the 'sext', or an 'add nuw' feeding
47733  // into the 'zext'.
47734  if ((Sext && !NSW) || (!Sext && !NUW))
47735    return SDValue();
47736
47737  // Having a constant operand to the 'add' ensures that we are not increasing
47738  // the instruction count because the constant is extended for free below.
47739  // A constant operand can also become the displacement field of an LEA.
47740  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
47741  if (!AddOp1)
47742    return SDValue();
47743
47744  // Don't make the 'add' bigger if there's no hope of combining it with some
47745  // other 'add' or 'shl' instruction.
47746  // TODO: It may be profitable to generate simpler LEA instructions in place
47747  // of single 'add' instructions, but the cost model for selecting an LEA
47748  // currently has a high threshold.
47749  bool HasLEAPotential = false;
47750  for (auto *User : Ext->uses()) {
47751    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
47752      HasLEAPotential = true;
47753      break;
47754    }
47755  }
47756  if (!HasLEAPotential)
47757    return SDValue();
47758
47759  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
47760  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
47761  SDValue AddOp0 = Add.getOperand(0);
47762  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
47763  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
47764
47765  // The wider add is guaranteed not to wrap because both operands are
47766  // extended (sign- or zero-extended to match the preserved wrap flag).
47767  SDNodeFlags Flags;
47768  Flags.setNoSignedWrap(NSW);
47769  Flags.setNoUnsignedWrap(NUW);
47770  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
47771}
47772
47773// If an {ANY,SIGN,ZERO}_EXTEND is applied to a CMOV with constant
47774// operands and the result of the CMOV is not used anywhere else, promote
47775// the CMOV itself instead of promoting its result. This can be beneficial
47776// because:
47777//     1) X86TargetLowering::EmitLoweredSelect can later merge two (or
47778//        more) pseudo-CMOVs only when they appear one after another, and
47779//        removing the result-extension code after the CMOV helps that.
47780//     2) Promotion of constant CMOV arguments is free, so the
47781//        {ANY,SIGN,ZERO}_EXTEND will simply be deleted.
47782//     3) A 16-bit CMOV encoding is 4 bytes and a 32-bit CMOV is 3 bytes,
47783//        so this promotion also helps code size. (A 64-bit CMOV is 4
47784//        bytes, which is why we don't do 32-bit => 64-bit promotion.)
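// Illustrative sketch (constants and condition chosen arbitrarily): instead
// of
//   t1: i16 = X86ISD::CMOV Constant:i16<42>, Constant:i16<7>, cc, flags
//   t2: i32 = zero_extend t1
// we emit
//   t2: i32 = X86ISD::CMOV Constant:i32<42>, Constant:i32<7>, cc, flags
// so the result needs no extension and the shorter 32-bit CMOV encoding is
// used.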
47785static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
47786  SDValue CMovN = Extend->getOperand(0);
47787  if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
47788    return SDValue();
47789
47790  EVT TargetVT = Extend->getValueType(0);
47791  unsigned ExtendOpcode = Extend->getOpcode();
47792  SDLoc DL(Extend);
47793
47794  EVT VT = CMovN.getValueType();
47795  SDValue CMovOp0 = CMovN.getOperand(0);
47796  SDValue CMovOp1 = CMovN.getOperand(1);
47797
47798  if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
47799      !isa<ConstantSDNode>(CMovOp1.getNode()))
47800    return SDValue();
47801
47802  // Only extend to i32 or i64.
47803  if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
47804    return SDValue();
47805
47806  // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from
47807  // i32 are free.
47808  if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
47809    return SDValue();
47810
47811  // If this is a zero extend to i64, we should only extend to i32 and use a
47812  // free zero extend to finish.
47813  EVT ExtendVT = TargetVT;
47814  if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
47815    ExtendVT = MVT::i32;
47816
47817  CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
47818  CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
47819
47820  SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
47821                            CMovN.getOperand(2), CMovN.getOperand(3));
47822
47823  // Finish extending if needed.
47824  if (ExtendVT != TargetVT)
47825    Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
47826
47827  return Res;
47828}
47829
47830// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47831// This is more or less the reverse of combineBitcastvxi1.
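// Illustrative sketch (assuming an i8 mask extended to v8i16): for
//   t1: v8i1  = bitcast i8 %x
//   t2: v8i16 = sign_extend t1
// we broadcast %x into all eight lanes, AND each lane with its bit mask
// <1, 2, 4, ..., 128>, compare the result for equality against that same
// mask, and sign-extend the compare. For a zero/any extend, the sign-extended
// result is then logically shifted right by 15 to leave 0 or 1 per lane.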
47832static SDValue
47833combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
47834                               TargetLowering::DAGCombinerInfo &DCI,
47835                               const X86Subtarget &Subtarget) {
47836  unsigned Opcode = N->getOpcode();
47837  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47838      Opcode != ISD::ANY_EXTEND)
47839    return SDValue();
47840  if (!DCI.isBeforeLegalizeOps())
47841    return SDValue();
47842  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47843    return SDValue();
47844
47845  SDValue N0 = N->getOperand(0);
47846  EVT VT = N->getValueType(0);
47847  EVT SVT = VT.getScalarType();
47848  EVT InSVT = N0.getValueType().getScalarType();
47849  unsigned EltSizeInBits = SVT.getSizeInBits();
47850
47851  // The input must be a bool vector (bitcast from a scalar integer) that is
47852  // being extended to legal integer vector types.
47853  if (!VT.isVector())
47854    return SDValue();
47855  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47856    return SDValue();
47857  if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47858    return SDValue();
47859
47860  SDValue N00 = N0.getOperand(0);
47861  EVT SclVT = N0.getOperand(0).getValueType();
47862  if (!SclVT.isScalarInteger())
47863    return SDValue();
47864
47865  SDLoc DL(N);
47866  SDValue Vec;
47867  SmallVector<int, 32> ShuffleMask;
47868  unsigned NumElts = VT.getVectorNumElements();
47869  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47870
47871  // Broadcast the scalar integer to the vector elements.
47872  if (NumElts > EltSizeInBits) {
47873    // If the scalar integer is wider than the vector element size, then we
47874    // must split it down into sub-sections for broadcasting. For example:
47875    //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47876    //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47877    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47878    unsigned Scale = NumElts / EltSizeInBits;
47879    EVT BroadcastVT =
47880        EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47881    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47882    Vec = DAG.getBitcast(VT, Vec);
47883
47884    for (unsigned i = 0; i != Scale; ++i)
47885      ShuffleMask.append(EltSizeInBits, i);
47886    Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47887  } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47888             (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47889    // If we have register broadcast instructions, use the scalar size as the
47890    // element type for the shuffle. Then cast to the wider element type. The
47891    // widened bits won't be used, and this might allow the use of a broadcast
47892    // load.
47893    assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47894    unsigned Scale = EltSizeInBits / NumElts;
47895    EVT BroadcastVT =
47896        EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
47897    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47898    ShuffleMask.append(NumElts * Scale, 0);
47899    Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
47900    Vec = DAG.getBitcast(VT, Vec);
47901  } else {
47902    // For a smaller scalar integer, we can simply any-extend it to the vector
47903    // element size (the upper bits don't matter) and broadcast it to all
47904    // elements.
47905    SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
47906    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
47907    ShuffleMask.append(NumElts, 0);
47908    Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47909  }
47910
47911  // Now, mask the relevant bit in each element.
47912  SmallVector<SDValue, 32> Bits;
47913  for (unsigned i = 0; i != NumElts; ++i) {
47914    int BitIdx = (i % EltSizeInBits);
47915    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47916    Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47917  }
47918  SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47919  Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47920
47921  // Compare against the bitmask and extend the result.
47922  EVT CCVT = VT.changeVectorElementType(MVT::i1);
47923  Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47924  Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47925
47926  // For SEXT, this is now done; otherwise, shift the result down for
47927  // zero-extension.
47928  if (Opcode == ISD::SIGN_EXTEND)
47929    return Vec;
47930  return DAG.getNode(ISD::SRL, DL, VT, Vec,
47931                     DAG.getConstant(EltSizeInBits - 1, DL, VT));
47932}
47933
47934// Attempt to combine a (sext/zext (setcc)) into a setcc with an xmm/ymm/zmm
47935// result type.
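// Illustrative sketch (AVX512 without 512-bit ops in play; types arbitrary):
//   t1: v8i1  = setcc t2, t3, setgt     ; t2, t3: v8i32
//   t4: v8i32 = sign_extend t1
// is rewritten as
//   t4: v8i32 = setcc t2, t3, setgt
// which lowers to a plain VPCMPGTD producing the all-ones/all-zeros result
// directly in a ymm register instead of a mask register plus an extend.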
47936static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
47937                               const X86Subtarget &Subtarget) {
47938  SDValue N0 = N->getOperand(0);
47939  EVT VT = N->getValueType(0);
47940  SDLoc dl(N);
47941
47942  // Only do this combine with AVX512 for vector extends.
47943  if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
47944    return SDValue();
47945
47946  // Only combine legal element types.
47947  EVT SVT = VT.getVectorElementType();
47948  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
47949      SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
47950    return SDValue();
47951
47952  // We can only do this if the vector size is 256 bits or less.
47953  unsigned Size = VT.getSizeInBits();
47954  if (Size > 256 && Subtarget.useAVX512Regs())
47955    return SDValue();
47956
47957  // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
47958  // those are the only integer compares we have.
47959  ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
47960  if (ISD::isUnsignedIntSetCC(CC))
47961    return SDValue();
47962
47963  // Only do this combine if the extension will be fully consumed by the setcc.
47964  EVT N00VT = N0.getOperand(0).getValueType();
47965  EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
47966  if (Size != MatchingVecType.getSizeInBits())
47967    return SDValue();
47968
47969  SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
47970
47971  if (N->getOpcode() == ISD::ZERO_EXTEND)
47972    Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
47973
47974  return Res;
47975}
47976
47977static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
47978                           TargetLowering::DAGCombinerInfo &DCI,
47979                           const X86Subtarget &Subtarget) {
47980  SDValue N0 = N->getOperand(0);
47981  EVT VT = N->getValueType(0);
47982  SDLoc DL(N);
47983
47984  // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
47985  if (!DCI.isBeforeLegalizeOps() &&
47986      N0.getOpcode() == X86ISD::SETCC_CARRY) {
47987    SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
47988                                 N0->getOperand(1));
47989    bool ReplaceOtherUses = !N0.hasOneUse();
47990    DCI.CombineTo(N, Setcc);
47991    // Replace other uses with a truncate of the widened setcc_carry.
47992    if (ReplaceOtherUses) {
47993      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
47994                                  N0.getValueType(), Setcc);
47995      DCI.CombineTo(N0.getNode(), Trunc);
47996    }
47997
47998    return SDValue(N, 0);
47999  }
48000
48001  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48002    return NewCMov;
48003
48004  if (!DCI.isBeforeLegalizeOps())
48005    return SDValue();
48006
48007  if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48008    return V;
48009
48010  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48011    return V;
48012
48013  if (VT.isVector()) {
48014    if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48015      return R;
48016
48017    if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
48018      return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
48019  }
48020
48021  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48022    return NewAdd;
48023
48024  return SDValue();
48025}
48026
48027static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
48028                          TargetLowering::DAGCombinerInfo &DCI,
48029                          const X86Subtarget &Subtarget) {
48030  SDLoc dl(N);
48031  EVT VT = N->getValueType(0);
48032  bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
48033
48034  // Let legalize expand this if it isn't a legal type yet.
48035  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48036  if (!TLI.isTypeLegal(VT))
48037    return SDValue();
48038
48039  SDValue A = N->getOperand(IsStrict ? 1 : 0);
48040  SDValue B = N->getOperand(IsStrict ? 2 : 1);
48041  SDValue C = N->getOperand(IsStrict ? 3 : 2);
48042
48043  // If the operation allows fast-math and the target does not support FMA,
48044  // split this into mul+add to avoid libcall(s).
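  // For instance (a sketch): (fma float %a, %b, %c) with reassociation
  // allowed, on a target where FMA would be expanded, becomes
  // (fadd (fmul %a, %b), %c), i.e. MULSS + ADDSS rather than a libcall to
  // fmaf.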
48045  SDNodeFlags Flags = N->getFlags();
48046  if (!IsStrict && Flags.hasAllowReassociation() &&
48047      TLI.isOperationExpand(ISD::FMA, VT)) {
48048    SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
48049    return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
48050  }
48051
48052  EVT ScalarVT = VT.getScalarType();
48053  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
48054    return SDValue();
48055
48056  auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
48057    bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48058    bool LegalOperations = !DCI.isBeforeLegalizeOps();
48059    if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
48060                                                       CodeSize)) {
48061      V = NegV;
48062      return true;
48063    }
48064    // Look through extract_vector_elts. If it comes from an FNEG, create a
48065    // new extract from the FNEG input.
48066    if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
48067        isNullConstant(V.getOperand(1))) {
48068      SDValue Vec = V.getOperand(0);
48069      if (SDValue NegV = TLI.getCheaperNegatedExpression(
48070              Vec, DAG, LegalOperations, CodeSize)) {
48071        V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
48072                        NegV, V.getOperand(1));
48073        return true;
48074      }
48075    }
48076
48077    return false;
48078  };
48079
48080  // Do not convert the passthru input of scalar intrinsics.
48081  // FIXME: We could allow negations of the lower element only.
48082  bool NegA = invertIfNegative(A);
48083  bool NegB = invertIfNegative(B);
48084  bool NegC = invertIfNegative(C);
48085
48086  if (!NegA && !NegB && !NegC)
48087    return SDValue();
48088
48089  unsigned NewOpcode =
48090      negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
48091
48092  // Propagate fast-math-flags to new FMA node.
48093  SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
48094  if (IsStrict) {
48095    assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
48096    return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
48097                       {N->getOperand(0), A, B, C});
48098  } else {
48099    if (N->getNumOperands() == 4)
48100      return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
48101    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
48102  }
48103}
48104
48105// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
48106// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
48107static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
48108                               TargetLowering::DAGCombinerInfo &DCI) {
48109  SDLoc dl(N);
48110  EVT VT = N->getValueType(0);
48111  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48112  bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48113  bool LegalOperations = !DCI.isBeforeLegalizeOps();
48114
48115  SDValue N2 = N->getOperand(2);
48116
48117  SDValue NegN2 =
48118      TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
48119  if (!NegN2)
48120    return SDValue();
48121  unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
48122
48123  if (N->getNumOperands() == 4)
48124    return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48125                       NegN2, N->getOperand(3));
48126  return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48127                     NegN2);
48128}
48129
48130static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
48131                           TargetLowering::DAGCombinerInfo &DCI,
48132                           const X86Subtarget &Subtarget) {
48133  SDLoc dl(N);
48134  SDValue N0 = N->getOperand(0);
48135  EVT VT = N->getValueType(0);
48136
48137  // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
48138  // FIXME: Is this needed? We don't seem to have any tests for it.
48139  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
48140      N0.getOpcode() == X86ISD::SETCC_CARRY) {
48141    SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
48142                                 N0->getOperand(1));
48143    bool ReplaceOtherUses = !N0.hasOneUse();
48144    DCI.CombineTo(N, Setcc);
48145    // Replace other uses with a truncate of the widened setcc_carry.
48146    if (ReplaceOtherUses) {
48147      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
48148                                  N0.getValueType(), Setcc);
48149      DCI.CombineTo(N0.getNode(), Trunc);
48150    }
48151
48152    return SDValue(N, 0);
48153  }
48154
48155  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48156    return NewCMov;
48157
48158  if (DCI.isBeforeLegalizeOps())
48159    if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48160      return V;
48161
48162  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48163    return V;
48164
48165  if (VT.isVector())
48166    if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48167      return R;
48168
48169  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48170    return NewAdd;
48171
48172  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
48173    return R;
48174
48175  // TODO: Combine with any target/faux shuffle.
48176  if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
48177      VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
48178    SDValue N00 = N0.getOperand(0);
48179    SDValue N01 = N0.getOperand(1);
48180    unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
48181    APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
48182    if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
48183        (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
48184      return concatSubVectors(N00, N01, DAG, dl);
48185    }
48186  }
48187
48188  return SDValue();
48189}
48190
48191/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
48192/// recognizable memcmp expansion.
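/// For example (a sketch of what the memcmp expansion produces for a 32-byte
/// compare folded into a compare against zero):
///   (or (xor i128 %a0, %b0), (xor i128 %a1, %b1))
/// is such a tree: an OR root whose leaves are XORs of the chunks being
/// compared.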
48193static bool isOrXorXorTree(SDValue X, bool Root = true) {
48194  if (X.getOpcode() == ISD::OR)
48195    return isOrXorXorTree(X.getOperand(0), false) &&
48196           isOrXorXorTree(X.getOperand(1), false);
48197  if (Root)
48198    return false;
48199  return X.getOpcode() == ISD::XOR;
48200}
48201
48202/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
48203/// expansion.
48204template<typename F>
48205static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
48206                                EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
48207  SDValue Op0 = X.getOperand(0);
48208  SDValue Op1 = X.getOperand(1);
48209  if (X.getOpcode() == ISD::OR) {
48210    SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48211    SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48212    if (VecVT != CmpVT)
48213      return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
48214    if (HasPT)
48215      return DAG.getNode(ISD::OR, DL, VecVT, A, B);
48216    return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
48217  } else if (X.getOpcode() == ISD::XOR) {
48218    SDValue A = SToV(Op0);
48219    SDValue B = SToV(Op1);
48220    if (VecVT != CmpVT)
48221      return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
48222    if (HasPT)
48223      return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
48224    return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
48225  }
48226  llvm_unreachable("Impossible");
48227}
48228
48229/// Try to map a 128-bit or larger integer comparison to vector instructions
48230/// before type legalization splits it up into chunks.
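/// A minimal sketch of the pre-SSE4.1 path: (seteq i128 %x, %y) becomes,
/// roughly,
///   pcmpeqb  %xmm1, %xmm0
///   pmovmskb %xmm0, %eax
///   cmpl     $0xffff, %eax
/// i.e. a byte-wise vector compare whose mask must be all ones for equality.
/// With SSE4.1 a PTEST of the XORed operands is used instead, and with
/// AVX-512 mask registers a vector compare plus KORTEST.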
48231static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
48232                                               const X86Subtarget &Subtarget) {
48233  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
48234  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
48235
48236  // We're looking for an oversized integer equality comparison.
48237  SDValue X = SetCC->getOperand(0);
48238  SDValue Y = SetCC->getOperand(1);
48239  EVT OpVT = X.getValueType();
48240  unsigned OpSize = OpVT.getSizeInBits();
48241  if (!OpVT.isScalarInteger() || OpSize < 128)
48242    return SDValue();
48243
48244  // Ignore a comparison with zero because that gets special treatment in
48245  // EmitTest(). But make an exception for the special case of a pair of
48246  // logically-combined vector-sized operands compared to zero. This pattern may
48247  // be generated by the memcmp expansion pass with oversized integer compares
48248  // (see PR33325).
48249  bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
48250  if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
48251    return SDValue();
48252
48253  // Don't perform this combine if constructing the vector will be expensive.
48254  auto IsVectorBitCastCheap = [](SDValue X) {
48255    X = peekThroughBitcasts(X);
48256    return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
48257           X.getOpcode() == ISD::LOAD;
48258  };
48259  if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
48260      !IsOrXorXorTreeCCZero)
48261    return SDValue();
48262
48263  EVT VT = SetCC->getValueType(0);
48264  SDLoc DL(SetCC);
48265
48266  // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
48267  // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
48268  // Otherwise use PCMPEQ (plus AND) and mask testing.
48269  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
48270      (OpSize == 256 && Subtarget.hasAVX()) ||
48271      (OpSize == 512 && Subtarget.useAVX512Regs())) {
48272    bool HasPT = Subtarget.hasSSE41();
48273
48274    // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and
48275    // widened vector registers are essentially free. (Technically, widening
48276    // registers prevents load folding, but the tradeoff is worth it.)
48277    bool PreferKOT = Subtarget.preferMaskRegisters();
48278    bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
48279
48280    EVT VecVT = MVT::v16i8;
48281    EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
48282    if (OpSize == 256) {
48283      VecVT = MVT::v32i8;
48284      CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
48285    }
48286    EVT CastVT = VecVT;
48287    bool NeedsAVX512FCast = false;
48288    if (OpSize == 512 || NeedZExt) {
48289      if (Subtarget.hasBWI()) {
48290        VecVT = MVT::v64i8;
48291        CmpVT = MVT::v64i1;
48292        if (OpSize == 512)
48293          CastVT = VecVT;
48294      } else {
48295        VecVT = MVT::v16i32;
48296        CmpVT = MVT::v16i1;
48297        CastVT = OpSize == 512 ? VecVT :
48298                 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
48299        NeedsAVX512FCast = true;
48300      }
48301    }
48302
48303    auto ScalarToVector = [&](SDValue X) -> SDValue {
48304      bool TmpZext = false;
48305      EVT TmpCastVT = CastVT;
48306      if (X.getOpcode() == ISD::ZERO_EXTEND) {
48307        SDValue OrigX = X.getOperand(0);
48308        unsigned OrigSize = OrigX.getScalarValueSizeInBits();
48309        if (OrigSize < OpSize) {
48310          if (OrigSize == 128) {
48311            TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
48312            X = OrigX;
48313            TmpZext = true;
48314          } else if (OrigSize == 256) {
48315            TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
48316            X = OrigX;
48317            TmpZext = true;
48318          }
48319        }
48320      }
48321      X = DAG.getBitcast(TmpCastVT, X);
48322      if (!NeedZExt && !TmpZext)
48323        return X;
48324      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
48325                         DAG.getConstant(0, DL, VecVT), X,
48326                         DAG.getVectorIdxConstant(0, DL));
48327    };
48328
48329    SDValue Cmp;
48330    if (IsOrXorXorTreeCCZero) {
48331      // This is a bitwise-combined equality comparison of 2 pairs of vectors:
48332      // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
48333      // Use 2 vector equality compares and 'and' the results before doing a
48334      // MOVMSK.
48335      Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
48336    } else {
48337      SDValue VecX = ScalarToVector(X);
48338      SDValue VecY = ScalarToVector(Y);
48339      if (VecVT != CmpVT) {
48340        Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
48341      } else if (HasPT) {
48342        Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
48343      } else {
48344        Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
48345      }
48346    }
48347    // AVX512 should emit a setcc that will lower to kortest.
48348    if (VecVT != CmpVT) {
48349      EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
48350                   CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
48351      return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
48352                          DAG.getConstant(0, DL, KRegVT), CC);
48353    }
48354    if (HasPT) {
48355      SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
48356                                     Cmp);
48357      SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
48358      X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
48359      SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
48360      return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
48361    }
48362    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
48363    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
48364    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
48365    assert(Cmp.getValueType() == MVT::v16i8 &&
48366           "Non 128-bit vector on pre-SSE41 target");
48367    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
48368    SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
48369    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
48370  }
48371
48372  return SDValue();
48373}
48374
48375static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
48376                            TargetLowering::DAGCombinerInfo &DCI,
48377                            const X86Subtarget &Subtarget) {
48378  const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
48379  const SDValue LHS = N->getOperand(0);
48380  const SDValue RHS = N->getOperand(1);
48381  EVT VT = N->getValueType(0);
48382  EVT OpVT = LHS.getValueType();
48383  SDLoc DL(N);
48384
48385  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
48386    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
48387      return V;
48388
48389    if (VT == MVT::i1 && isNullConstant(RHS)) {
48390      SDValue X86CC;
48391      if (SDValue V =
48392              MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
48393        return DAG.getNode(ISD::TRUNCATE, DL, VT,
48394                           DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
48395    }
48396
48397    if (OpVT.isScalarInteger()) {
48398      // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
48399      // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
48400      auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
48401        if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
48402          if (N0.getOperand(0) == N1)
48403            return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48404                               N0.getOperand(1));
48405          if (N0.getOperand(1) == N1)
48406            return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48407                               N0.getOperand(0));
48408        }
48409        return SDValue();
48410      };
48411      if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
48412        return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48413      if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
48414        return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48415
48416      // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
48417      // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
48418      auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
48419        if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
48420          if (N0.getOperand(0) == N1)
48421            return DAG.getNode(ISD::AND, DL, OpVT, N1,
48422                               DAG.getNOT(DL, N0.getOperand(1), OpVT));
48423          if (N0.getOperand(1) == N1)
48424            return DAG.getNode(ISD::AND, DL, OpVT, N1,
48425                               DAG.getNOT(DL, N0.getOperand(0), OpVT));
48426        }
48427        return SDValue();
48428      };
48429      if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
48430        return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48431      if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
48432        return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48433
48434      // cmpeq(trunc(x),0) --> cmpeq(x,0)
48435      // cmpne(trunc(x),0) --> cmpne(x,0)
48436      // iff x upper bits are zero.
48437      // TODO: Add support for RHS to be truncate as well?
48438      if (LHS.getOpcode() == ISD::TRUNCATE &&
48439          LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
48440          isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
48441        EVT SrcVT = LHS.getOperand(0).getValueType();
48442        APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
48443                                                OpVT.getScalarSizeInBits());
48444        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48445        if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
48446            TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
48447          return DAG.getSetCC(DL, VT, LHS.getOperand(0),
48448                              DAG.getConstant(0, DL, SrcVT), CC);
48449      }
48450    }
48451  }
48452
48453  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
48454      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
48455    // Using temporaries to avoid messing up operand ordering for later
48456    // transformations if this doesn't work.
48457    SDValue Op0 = LHS;
48458    SDValue Op1 = RHS;
48459    ISD::CondCode TmpCC = CC;
48460    // Put build_vector on the right.
48461    if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
48462      std::swap(Op0, Op1);
48463      TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
48464    }
48465
48466    bool IsSEXT0 =
48467        (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
48468        (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
48469    bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
48470
48471    if (IsSEXT0 && IsVZero1) {
48472      assert(VT == Op0.getOperand(0).getValueType() &&
48473             "Unexpected operand type");
48474      if (TmpCC == ISD::SETGT)
48475        return DAG.getConstant(0, DL, VT);
48476      if (TmpCC == ISD::SETLE)
48477        return DAG.getConstant(1, DL, VT);
48478      if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
48479        return DAG.getNOT(DL, Op0.getOperand(0), VT);
48480
48481      assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
48482             "Unexpected condition code!");
48483      return Op0.getOperand(0);
48484    }
48485  }
48486
48487  // If we have AVX512 but not BWI and this is a vXi16/vXi8 setcc, just
48488  // pre-promote its result type since vXi1 vectors don't get promoted
48489  // during type legalization.
48490  // NOTE: The element count check is to ignore operand types that need to
48491  // go through type promotion to a 128-bit vector.
48492  if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
48493      VT.getVectorElementType() == MVT::i1 &&
48494      (OpVT.getVectorElementType() == MVT::i8 ||
48495       OpVT.getVectorElementType() == MVT::i16)) {
48496    SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
48497    return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
48498  }
48499
48500  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
48501  // to avoid scalarization via legalization because v4i32 is not a legal type.
48502  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
48503      LHS.getValueType() == MVT::v4f32)
48504    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
48505
48506  return SDValue();
48507}
48508
48509static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
48510                             TargetLowering::DAGCombinerInfo &DCI,
48511                             const X86Subtarget &Subtarget) {
48512  SDValue Src = N->getOperand(0);
48513  MVT SrcVT = Src.getSimpleValueType();
48514  MVT VT = N->getSimpleValueType(0);
48515  unsigned NumBits = VT.getScalarSizeInBits();
48516  unsigned NumElts = SrcVT.getVectorNumElements();
48517
48518  // Perform constant folding.
48519  if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
48520    assert(VT == MVT::i32 && "Unexpected result type");
48521    APInt Imm(32, 0);
48522    for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
48523      if (!Src.getOperand(Idx).isUndef() &&
48524          Src.getConstantOperandAPInt(Idx).isNegative())
48525        Imm.setBit(Idx);
48526    }
48527    return DAG.getConstant(Imm, SDLoc(N), VT);
48528  }
48529
48530  // Look through int->fp bitcasts that don't change the element width.
48531  unsigned EltWidth = SrcVT.getScalarSizeInBits();
48532  if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
48533      Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
48534    return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
48535
48536  // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
48537  // with scalar comparisons.
48538  if (SDValue NotSrc = IsNOT(Src, DAG)) {
48539    SDLoc DL(N);
48540    APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48541    NotSrc = DAG.getBitcast(SrcVT, NotSrc);
48542    return DAG.getNode(ISD::XOR, DL, VT,
48543                       DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
48544                       DAG.getConstant(NotMask, DL, VT));
48545  }
48546
48547  // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
48548  // results with scalar comparisons.
48549  if (Src.getOpcode() == X86ISD::PCMPGT &&
48550      ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
48551    SDLoc DL(N);
48552    APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48553    return DAG.getNode(ISD::XOR, DL, VT,
48554                       DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
48555                       DAG.getConstant(NotMask, DL, VT));
48556  }
48557
48558  // Simplify the inputs.
48559  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48560  APInt DemandedMask(APInt::getAllOnesValue(NumBits));
48561  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
48562    return SDValue(N, 0);
48563
48564  return SDValue();
48565}
48566
48567static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
48568                                       TargetLowering::DAGCombinerInfo &DCI) {
48569  // With vector masks we only demand the upper bit of the mask.
48570  SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
48571  if (Mask.getScalarValueSizeInBits() != 1) {
48572    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48573    APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
48574    if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
48575      if (N->getOpcode() != ISD::DELETED_NODE)
48576        DCI.AddToWorklist(N);
48577      return SDValue(N, 0);
48578    }
48579  }
48580
48581  return SDValue();
48582}
48583
48584static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
48585                                    SDValue Index, SDValue Base, SDValue Scale,
48586                                    SelectionDAG &DAG) {
48587  SDLoc DL(GorS);
48588
48589  if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
48590    SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
48591                      Gather->getMask(), Base, Index, Scale } ;
48592    return DAG.getMaskedGather(Gather->getVTList(),
48593                               Gather->getMemoryVT(), DL, Ops,
48594                               Gather->getMemOperand(),
48595                               Gather->getIndexType(),
48596                               Gather->getExtensionType());
48597  }
48598  auto *Scatter = cast<MaskedScatterSDNode>(GorS);
48599  SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
48600                    Scatter->getMask(), Base, Index, Scale };
48601  return DAG.getMaskedScatter(Scatter->getVTList(),
48602                              Scatter->getMemoryVT(), DL,
48603                              Ops, Scatter->getMemOperand(),
48604                              Scatter->getIndexType(),
48605                              Scatter->isTruncatingStore());
48606}
48607
48608static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
48609                                    TargetLowering::DAGCombinerInfo &DCI) {
48610  SDLoc DL(N);
48611  auto *GorS = cast<MaskedGatherScatterSDNode>(N);
48612  SDValue Index = GorS->getIndex();
48613  SDValue Base = GorS->getBasePtr();
48614  SDValue Scale = GorS->getScale();
48615
48616  if (DCI.isBeforeLegalize()) {
48617    unsigned IndexWidth = Index.getScalarValueSizeInBits();
48618
48619    // Shrink constant indices if they are larger than 32-bits.
48620    // Only do this before legalize types since v2i64 could become v2i32.
48621    // FIXME: We could check that the type is legal if we're after legalize
48622    // types, but then we would need to construct test cases where that happens.
48623    // FIXME: We could support more than just constant vectors, but we need
48624    // to be careful with costing. A truncate that can be optimized out would
48625    // be fine. Otherwise we might only want a truncate if it avoids a split.
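    // For example (an illustrative sketch): a v2i64 constant index
    // <i64 0, i64 16> has more than 32 sign bits per element, so it can be
    // truncated to the v2i32 index <i32 0, i32 16>, halving the width of the
    // index vector the gather/scatter needs.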
48626    if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
48627      if (BV->isConstant() && IndexWidth > 32 &&
48628          DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
48629        EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
48630        Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
48631        return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48632      }
48633    }
48634
48635    // Shrink any sign/zero extends from a type of 32 bits or smaller to a
48636    // type wider than 32 bits if there are sufficient sign bits. Only do this
48637    // before type legalization to avoid creating illegal types in truncate.
48638    if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
48639         Index.getOpcode() == ISD::ZERO_EXTEND) &&
48640        IndexWidth > 32 &&
48641        Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
48642        DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
48643      EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
48644      Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
48645      return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48646    }
48647  }
48648
48649  if (DCI.isBeforeLegalizeOps()) {
48650    unsigned IndexWidth = Index.getScalarValueSizeInBits();
48651
48652    // Make sure the index is either i32 or i64
48653    if (IndexWidth != 32 && IndexWidth != 64) {
48654      MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
48655      EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
48656      Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
48657      return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48658    }
48659  }
48660
48661  // With vector masks we only demand the upper bit of the mask.
48662  SDValue Mask = GorS->getMask();
48663  if (Mask.getScalarValueSizeInBits() != 1) {
48664    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48665    APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
48666    if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
48667      if (N->getOpcode() != ISD::DELETED_NODE)
48668        DCI.AddToWorklist(N);
48669      return SDValue(N, 0);
48670    }
48671  }
48672
48673  return SDValue();
48674}
48675
48676// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
48677static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
48678                               const X86Subtarget &Subtarget) {
48679  SDLoc DL(N);
48680  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
48681  SDValue EFLAGS = N->getOperand(1);
48682
48683  // Try to simplify the EFLAGS and condition code operands.
48684  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
48685    return getSETCC(CC, Flags, DL, DAG);
48686
48687  return SDValue();
48688}
48689
48690/// Optimize branch condition evaluation.
48691static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
48692                             const X86Subtarget &Subtarget) {
48693  SDLoc DL(N);
48694  SDValue EFLAGS = N->getOperand(3);
48695  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
48696
48697  // Try to simplify the EFLAGS and condition code operands.
48698  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
48699  // RAUW them under us.
48700  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
48701    SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
48702    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
48703                       N->getOperand(1), Cond, Flags);
48704  }
48705
48706  return SDValue();
48707}
48708
48709// TODO: Could we move this to DAGCombine?
48710static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
48711                                                  SelectionDAG &DAG) {
48712  // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
48713  // to optimize away the operation when it is applied to a constant.
48714  //
48715  // The general transformation is:
48716  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
48717  //       AND(VECTOR_CMP(x,y), constant2)
48718  //    constant2 = UNARYOP(constant)
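  // A concrete sketch (types chosen for illustration):
  //   (v4f32 sint_to_fp (and (vector_cmp %a, %b), <1, 2, 3, 4>))
  // becomes
  //   (v4f32 bitcast (and (vector_cmp %a, %b), bitcast <1.0, 2.0, 3.0, 4.0>))
  // because every compare lane is all-zeros or all-ones, so the AND either
  // keeps the (already converted) constant lane or zeroes it out.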
48719
48720  // Early exit if this isn't a vector operation, the operand of the
48721  // unary operation isn't a bitwise AND, or if the sizes of the operations
48722  // aren't the same.
48723  EVT VT = N->getValueType(0);
48724  bool IsStrict = N->isStrictFPOpcode();
48725  unsigned NumEltBits = VT.getScalarSizeInBits();
48726  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
48727  if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
48728      DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
48729      VT.getSizeInBits() != Op0.getValueSizeInBits())
48730    return SDValue();
48731
48732  // Now check that the other operand of the AND is a constant. We could
48733  // make the transformation for non-constant splats as well, but it's unclear
48734  // that would be a benefit as it would not eliminate any operations, just
48735  // perform one more step in scalar code before moving to the vector unit.
48736  if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
48737    // Bail out if the vector isn't a constant.
48738    if (!BV->isConstant())
48739      return SDValue();
48740
48741    // Everything checks out. Build up the new and improved node.
48742    SDLoc DL(N);
48743    EVT IntVT = BV->getValueType(0);
48744    // Create a new constant of the appropriate type for the transformed
48745    // DAG.
48746    SDValue SourceConst;
48747    if (IsStrict)
48748      SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
48749                                {N->getOperand(0), SDValue(BV, 0)});
48750    else
48751      SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
48752    // The AND node needs bitcasts to/from an integer vector type around it.
48753    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
48754    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
48755                                 MaskConst);
48756    SDValue Res = DAG.getBitcast(VT, NewAnd);
48757    if (IsStrict)
48758      return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
48759    return Res;
48760  }
48761
48762  return SDValue();
48763}
48764
48765/// If we are converting a value to floating-point, try to replace scalar
48766/// truncate of an extracted vector element with a bitcast. This tries to keep
48767/// the sequence on XMM registers rather than moving between vector and GPRs.
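/// A minimal sketch (assuming a v2i64 source vector):
///   (sint_to_fp (i32 trunc (i64 extractelt %v, 0)))
/// is rebuilt as
///   (sint_to_fp (i32 extractelt (v4i32 bitcast %v), 0))
/// so the value never has to leave the XMM register file just to be
/// truncated in a GPR.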
48768static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
48769  // TODO: This is currently only used by combineSIntToFP, but it is generalized
48770  //       to allow being called by any similar cast opcode.
48771  // TODO: Consider merging this into lowering: vectorizeExtractedCast().
48772  SDValue Trunc = N->getOperand(0);
48773  if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
48774    return SDValue();
48775
48776  SDValue ExtElt = Trunc.getOperand(0);
48777  if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
48778      !isNullConstant(ExtElt.getOperand(1)))
48779    return SDValue();
48780
48781  EVT TruncVT = Trunc.getValueType();
48782  EVT SrcVT = ExtElt.getValueType();
48783  unsigned DestWidth = TruncVT.getSizeInBits();
48784  unsigned SrcWidth = SrcVT.getSizeInBits();
48785  if (SrcWidth % DestWidth != 0)
48786    return SDValue();
48787
48788  // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
48789  EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
48790  unsigned VecWidth = SrcVecVT.getSizeInBits();
48791  unsigned NumElts = VecWidth / DestWidth;
48792  EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
48793  SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
48794  SDLoc DL(N);
48795  SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
48796                                  BitcastVec, ExtElt.getOperand(1));
48797  return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
48798}
48799
48800static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
48801                               const X86Subtarget &Subtarget) {
48802  bool IsStrict = N->isStrictFPOpcode();
48803  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
48804  EVT VT = N->getValueType(0);
48805  EVT InVT = Op0.getValueType();
48806
48807  // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
48808  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
48809  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
48810  if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
48811    SDLoc dl(N);
48812    EVT DstVT = InVT.changeVectorElementType(MVT::i32);
48813    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
48814
48815    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
48816    if (IsStrict)
48817      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
48818                         {N->getOperand(0), P});
48819    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
48820  }
48821
48822  // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
48823  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
48824  // the optimization here.
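  // For example (illustrative): (f64 uint_to_fp (and i64 %x, 0x7fffffff)) has
  // a known-zero sign bit, so it can safely become
  // (f64 sint_to_fp (and i64 %x, 0x7fffffff)), which has a cheaper lowering.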
48825  if (DAG.SignBitIsZero(Op0)) {
48826    if (IsStrict)
48827      return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
48828                         {N->getOperand(0), Op0});
48829    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
48830  }
48831
48832  return SDValue();
48833}
48834
48835static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
48836                               TargetLowering::DAGCombinerInfo &DCI,
48837                               const X86Subtarget &Subtarget) {
48838  // First try to optimize away the conversion entirely when the value being
48839  // converted is conditionally selected from a constant. Vectors only.
48840  bool IsStrict = N->isStrictFPOpcode();
48841  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
48842    return Res;
48843
48844  // Now move on to more general possibilities.
48845  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
48846  EVT VT = N->getValueType(0);
48847  EVT InVT = Op0.getValueType();
48848
48849  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
48850  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
48851  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
48852  if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
48853    SDLoc dl(N);
48854    EVT DstVT = InVT.changeVectorElementType(MVT::i32);
48855    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
48856    if (IsStrict)
48857      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
48858                         {N->getOperand(0), P});
48859    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
48860  }
48861
48862  // Without AVX512DQ we only support i64 to float scalar conversion. For both
48863  // vectors and scalars, see if we know that the upper bits are all the sign
48864  // bit, in which case we can truncate the input to i32 and convert from that.
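  // For example (a sketch): (f64 sint_to_fp (i64 sext (i32 %x))) has at least
  // 33 sign bits, so it can be shrunk to (f64 sint_to_fp (i32 %x)) and use the
  // 32-bit form of CVTSI2SD rather than the 64-bit one.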
48865  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
48866    unsigned BitWidth = InVT.getScalarSizeInBits();
48867    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
48868    if (NumSignBits >= (BitWidth - 31)) {
48869      EVT TruncVT = MVT::i32;
48870      if (InVT.isVector())
48871        TruncVT = InVT.changeVectorElementType(TruncVT);
48872      SDLoc dl(N);
48873      if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
48874        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
48875        if (IsStrict)
48876          return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
48877                             {N->getOperand(0), Trunc});
48878        return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
48879      }
48880      // If we're after legalize and the type is v2i32 we need to shuffle and
48881      // use CVTSI2P.
48882      assert(InVT == MVT::v2i64 && "Unexpected VT!");
48883      SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
48884      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
48885                                          { 0, 2, -1, -1 });
48886      if (IsStrict)
48887        return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
48888                           {N->getOperand(0), Shuf});
48889      return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
48890    }
48891  }
48892
48893  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
48894  // a 32-bit target where SSE doesn't support i64->FP operations.
48895  if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
48896      Op0.getOpcode() == ISD::LOAD) {
48897    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
48898
48899    // This transformation is not supported if the result type is f16 or f128.
48900    if (VT == MVT::f16 || VT == MVT::f128)
48901      return SDValue();
48902
48903    // If we have AVX512DQ we can use packed conversion instructions unless
48904    // the VT is f80.
48905    if (Subtarget.hasDQI() && VT != MVT::f80)
48906      return SDValue();
48907
48908    if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
48909        Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
48910      std::pair<SDValue, SDValue> Tmp =
48911          Subtarget.getTargetLowering()->BuildFILD(
48912              VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
48913              Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
48914      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
48915      return Tmp.first;
48916    }
48917  }
48918
48919  if (IsStrict)
48920    return SDValue();
48921
48922  if (SDValue V = combineToFPTruncExtElt(N, DAG))
48923    return V;
48924
48925  return SDValue();
48926}
48927
48928static bool needCarryOrOverflowFlag(SDValue Flags) {
48929  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
48930
48931  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
48932         UI != UE; ++UI) {
48933    SDNode *User = *UI;
48934
48935    X86::CondCode CC;
48936    switch (User->getOpcode()) {
48937    default:
48938      // Be conservative.
48939      return true;
48940    case X86ISD::SETCC:
48941    case X86ISD::SETCC_CARRY:
48942      CC = (X86::CondCode)User->getConstantOperandVal(0);
48943      break;
48944    case X86ISD::BRCOND:
48945      CC = (X86::CondCode)User->getConstantOperandVal(2);
48946      break;
48947    case X86ISD::CMOV:
48948      CC = (X86::CondCode)User->getConstantOperandVal(2);
48949      break;
48950    }
48951
48952    switch (CC) {
48953    default: break;
48954    case X86::COND_A: case X86::COND_AE:
48955    case X86::COND_B: case X86::COND_BE:
48956    case X86::COND_O: case X86::COND_NO:
48957    case X86::COND_G: case X86::COND_GE:
48958    case X86::COND_L: case X86::COND_LE:
48959      return true;
48960    }
48961  }
48962
48963  return false;
48964}
48965
48966static bool onlyZeroFlagUsed(SDValue Flags) {
48967  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
48968
48969  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
48970         UI != UE; ++UI) {
48971    SDNode *User = *UI;
48972
48973    unsigned CCOpNo;
48974    switch (User->getOpcode()) {
48975    default:
48976      // Be conservative.
48977      return false;
48978    case X86ISD::SETCC:       CCOpNo = 0; break;
48979    case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
48980    case X86ISD::BRCOND:      CCOpNo = 2; break;
48981    case X86ISD::CMOV:        CCOpNo = 2; break;
48982    }
48983
48984    X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
48985    if (CC != X86::COND_E && CC != X86::COND_NE)
48986      return false;
48987  }
48988
48989  return true;
48990}
48991
48992static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
48993  // Only handle test patterns.
48994  if (!isNullConstant(N->getOperand(1)))
48995    return SDValue();
48996
48997  // If we have a CMP of a truncated binop, see if we can make a smaller binop
48998  // and use its flags directly.
48999  // TODO: Maybe we should try promoting compares that only use the zero flag
49000  // first if we can prove the upper bits with computeKnownBits?
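  // For example (illustrative):
  //   t1: i8  = truncate (i32 xor %a, %b)
  //   t2: i32 = X86ISD::CMP t1, 0
  // can be narrowed so the flags come straight from an 8-bit binop:
  //   t3: i8,i32 = X86ISD::XOR (truncate %a), (truncate %b)
  // and users of t2's flags use t3's flag result instead.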
49001  SDLoc dl(N);
49002  SDValue Op = N->getOperand(0);
49003  EVT VT = Op.getValueType();
49004
  // If we have a constant logical shift that's only used in a comparison
  // against zero, turn it into an equivalent AND. This allows turning it into
  // a TEST instruction later.
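  // Illustrative example: (cmp (srl X:i32, 8), 0), where only COND_E/COND_NE
  // are used, becomes (cmp (and X, 0xFFFFFF00), 0), which isel can later
  // select as a single TEST instruction.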
49008  if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
49009      Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
49010      onlyZeroFlagUsed(SDValue(N, 0))) {
49011    unsigned BitWidth = VT.getSizeInBits();
49012    const APInt &ShAmt = Op.getConstantOperandAPInt(1);
49013    if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
49014      unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
49015      APInt Mask = Op.getOpcode() == ISD::SRL
49016                       ? APInt::getHighBitsSet(BitWidth, MaskBits)
49017                       : APInt::getLowBitsSet(BitWidth, MaskBits);
49018      if (Mask.isSignedIntN(32)) {
49019        Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
49020                         DAG.getConstant(Mask, dl, VT));
49021        return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49022                           DAG.getConstant(0, dl, VT));
49023      }
49024    }
49025  }
49026
49027  // Look for a truncate.
49028  if (Op.getOpcode() != ISD::TRUNCATE)
49029    return SDValue();
49030
49031  SDValue Trunc = Op;
49032  Op = Op.getOperand(0);
49033
  // See if we can compare with zero against the truncation source, which
  // should help using the Z flag from many ops. Only do this when the
  // truncation source is i32 to prevent partial-register compares of
  // promoted ops.
49037  EVT OpVT = Op.getValueType();
49038  APInt UpperBits =
49039      APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
49040  if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
49041      onlyZeroFlagUsed(SDValue(N, 0))) {
49042    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49043                       DAG.getConstant(0, dl, OpVT));
49044  }
49045
49046  // After this the truncate and arithmetic op must have a single use.
49047  if (!Trunc.hasOneUse() || !Op.hasOneUse())
49048      return SDValue();
49049
49050  unsigned NewOpc;
49051  switch (Op.getOpcode()) {
49052  default: return SDValue();
49053  case ISD::AND:
    // Skip AND with constant. We have special handling for AND with immediate
    // during isel to generate TEST instructions.
49056    if (isa<ConstantSDNode>(Op.getOperand(1)))
49057      return SDValue();
49058    NewOpc = X86ISD::AND;
49059    break;
49060  case ISD::OR:  NewOpc = X86ISD::OR;  break;
49061  case ISD::XOR: NewOpc = X86ISD::XOR; break;
49062  case ISD::ADD:
49063    // If the carry or overflow flag is used, we can't truncate.
49064    if (needCarryOrOverflowFlag(SDValue(N, 0)))
49065      return SDValue();
49066    NewOpc = X86ISD::ADD;
49067    break;
49068  case ISD::SUB:
49069    // If the carry or overflow flag is used, we can't truncate.
49070    if (needCarryOrOverflowFlag(SDValue(N, 0)))
49071      return SDValue();
49072    NewOpc = X86ISD::SUB;
49073    break;
49074  }
49075
49076  // We found an op we can narrow. Truncate its inputs.
49077  SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
49078  SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
49079
  // Use an X86-specific opcode to avoid DAG combine messing with it.
49081  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49082  Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
49083
49084  // For AND, keep a CMP so that we can match the test pattern.
49085  if (NewOpc == X86ISD::AND)
49086    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49087                       DAG.getConstant(0, dl, VT));
49088
49089  // Return the flags.
49090  return Op.getValue(1);
49091}
49092
49093static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
49094                                TargetLowering::DAGCombinerInfo &DCI) {
49095  assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
49096         "Expected X86ISD::ADD or X86ISD::SUB");
49097
49098  SDLoc DL(N);
49099  SDValue LHS = N->getOperand(0);
49100  SDValue RHS = N->getOperand(1);
49101  MVT VT = LHS.getSimpleValueType();
49102  unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
49103
49104  // If we don't use the flag result, simplify back to a generic ADD/SUB.
49105  if (!N->hasAnyUseOfValue(1)) {
49106    SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
49107    return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
49108  }
49109
49110  // Fold any similar generic ADD/SUB opcodes to reuse this node.
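  // Illustrative example: if the DAG already contains a generic (sub LHS, RHS),
  // reuse this node's arithmetic result for it; a generic (sub RHS, LHS) can
  // reuse the negated result instead.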
49111  auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
49112    SDValue Ops[] = {N0, N1};
49113    SDVTList VTs = DAG.getVTList(N->getValueType(0));
49114    if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
49115      SDValue Op(N, 0);
49116      if (Negate)
49117        Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
49118      DCI.CombineTo(GenericAddSub, Op);
49119    }
49120  };
49121  MatchGeneric(LHS, RHS, false);
49122  MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
49123
49124  return SDValue();
49125}
49126
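// Optimize RES, EFLAGS = X86ISD::SBB LHS, RHS, EFLAGS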
49127static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
49128  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
49129    MVT VT = N->getSimpleValueType(0);
49130    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49131    return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
49132                       N->getOperand(0), N->getOperand(1),
49133                       Flags);
49134  }
49135
49136  // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
49137  // iff the flag result is dead.
49138  SDValue Op0 = N->getOperand(0);
49139  SDValue Op1 = N->getOperand(1);
49140  if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
49141      !N->hasAnyUseOfValue(1))
49142    return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
49143                       Op0.getOperand(1), N->getOperand(2));
49144
49145  return SDValue();
49146}
49147
49148// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
49149static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
49150                          TargetLowering::DAGCombinerInfo &DCI) {
49151  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
49152  // the result is either zero or one (depending on the input carry bit).
49153  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
49154  if (X86::isZeroNode(N->getOperand(0)) &&
49155      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this
      // when the flag result is dead right now.
49158      SDValue(N, 1).use_empty()) {
49159    SDLoc DL(N);
49160    EVT VT = N->getValueType(0);
49161    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
49162    SDValue Res1 =
49163        DAG.getNode(ISD::AND, DL, VT,
49164                    DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49165                                DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49166                                N->getOperand(2)),
49167                    DAG.getConstant(1, DL, VT));
49168    return DCI.CombineTo(N, Res1, CarryOut);
49169  }
49170
49171  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
49172    MVT VT = N->getSimpleValueType(0);
49173    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49174    return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
49175                       N->getOperand(0), N->getOperand(1),
49176                       Flags);
49177  }
49178
49179  return SDValue();
49180}
49181
49182/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49183/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49184/// with CMP+{ADC, SBB}.
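/// For example (one of the handled cases): (add X, (X86ISD::SETCC COND_B,
/// EFLAGS)) becomes (X86ISD::ADC X, 0, EFLAGS), and the subtract form
/// becomes SBB.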
49185static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
49186  bool IsSub = N->getOpcode() == ISD::SUB;
49187  SDValue X = N->getOperand(0);
49188  SDValue Y = N->getOperand(1);
49189
49190  // If this is an add, canonicalize a zext operand to the RHS.
49191  // TODO: Incomplete? What if both sides are zexts?
49192  if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
49193      Y.getOpcode() != ISD::ZERO_EXTEND)
49194    std::swap(X, Y);
49195
49196  // Look through a one-use zext.
49197  bool PeekedThroughZext = false;
49198  if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
49199    Y = Y.getOperand(0);
49200    PeekedThroughZext = true;
49201  }
49202
49203  // If this is an add, canonicalize a setcc operand to the RHS.
49204  // TODO: Incomplete? What if both sides are setcc?
49205  // TODO: Should we allow peeking through a zext of the other operand?
49206  if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
49207      Y.getOpcode() != X86ISD::SETCC)
49208    std::swap(X, Y);
49209
49210  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
49211    return SDValue();
49212
49213  SDLoc DL(N);
49214  EVT VT = N->getValueType(0);
49215  X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
49216
49217  // If X is -1 or 0, then we have an opportunity to avoid constants required in
49218  // the general case below.
49219  auto *ConstantX = dyn_cast<ConstantSDNode>(X);
49220  if (ConstantX) {
49221    if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
49222        (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
49223      // This is a complicated way to get -1 or 0 from the carry flag:
49224      // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49225      //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49226      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49227                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49228                         Y.getOperand(1));
49229    }
49230
49231    if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
49232        (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
49233      SDValue EFLAGS = Y->getOperand(1);
49234      if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
49235          EFLAGS.getValueType().isInteger() &&
49236          !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49237        // Swap the operands of a SUB, and we have the same pattern as above.
49238        // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
49239        //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
49240        SDValue NewSub = DAG.getNode(
49241            X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49242            EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49243        SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
49244        return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49245                           DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49246                           NewEFLAGS);
49247      }
49248    }
49249  }
49250
49251  if (CC == X86::COND_B) {
49252    // X + SETB Z --> adc X, 0
49253    // X - SETB Z --> sbb X, 0
49254    return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49255                       DAG.getVTList(VT, MVT::i32), X,
49256                       DAG.getConstant(0, DL, VT), Y.getOperand(1));
49257  }
49258
49259  if (CC == X86::COND_A) {
49260    SDValue EFLAGS = Y.getOperand(1);
49261    // Try to convert COND_A into COND_B in an attempt to facilitate
49262    // materializing "setb reg".
49263    //
    // Do not flip "e > c", where "c" is a constant, because the CMP
    // instruction cannot take an immediate as its first operand.
49266    //
49267    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49268        EFLAGS.getValueType().isInteger() &&
49269        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49270      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
49271                                   EFLAGS.getNode()->getVTList(),
49272                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49273      SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49274      return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49275                         DAG.getVTList(VT, MVT::i32), X,
49276                         DAG.getConstant(0, DL, VT), NewEFLAGS);
49277    }
49278  }
49279
49280  if (CC == X86::COND_AE) {
49281    // X + SETAE --> sbb X, -1
49282    // X - SETAE --> adc X, -1
49283    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49284                       DAG.getVTList(VT, MVT::i32), X,
49285                       DAG.getConstant(-1, DL, VT), Y.getOperand(1));
49286  }
49287
49288  if (CC == X86::COND_BE) {
49289    // X + SETBE --> sbb X, -1
49290    // X - SETBE --> adc X, -1
49291    SDValue EFLAGS = Y.getOperand(1);
49292    // Try to convert COND_BE into COND_AE in an attempt to facilitate
49293    // materializing "setae reg".
49294    //
    // Do not flip "e <= c", where "c" is a constant, because the CMP
    // instruction cannot take an immediate as its first operand.
49297    //
49298    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49299        EFLAGS.getValueType().isInteger() &&
49300        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49301      SDValue NewSub = DAG.getNode(
49302          X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49303          EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49304      SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49305      return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49306                         DAG.getVTList(VT, MVT::i32), X,
49307                         DAG.getConstant(-1, DL, VT), NewEFLAGS);
49308    }
49309  }
49310
49311  if (CC != X86::COND_E && CC != X86::COND_NE)
49312    return SDValue();
49313
49314  SDValue Cmp = Y.getOperand(1);
49315  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
49316      !X86::isZeroNode(Cmp.getOperand(1)) ||
49317      !Cmp.getOperand(0).getValueType().isInteger())
49318    return SDValue();
49319
49320  SDValue Z = Cmp.getOperand(0);
49321  EVT ZVT = Z.getValueType();
49322
49323  // If X is -1 or 0, then we have an opportunity to avoid constants required in
49324  // the general case below.
49325  if (ConstantX) {
49326    // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
49327    // fake operands:
49328    //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
49329    // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
49330    if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
49331        (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
49332      SDValue Zero = DAG.getConstant(0, DL, ZVT);
49333      SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49334      SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
49335      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49336                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49337                         SDValue(Neg.getNode(), 1));
49338    }
49339
49340    // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
49341    // with fake operands:
49342    //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
49343    // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
49344    if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
49345        (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
49346      SDValue One = DAG.getConstant(1, DL, ZVT);
49347      SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49348      SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49349      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49350                         DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49351                         Cmp1.getValue(1));
49352    }
49353  }
49354
49355  // (cmp Z, 1) sets the carry flag if Z is 0.
49356  SDValue One = DAG.getConstant(1, DL, ZVT);
49357  SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49358  SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49359
49360  // Add the flags type for ADC/SBB nodes.
49361  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49362
49363  // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
49364  // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
49365  if (CC == X86::COND_NE)
49366    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
49367                       DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
49368
49369  // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
49370  // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
49371  return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
49372                     DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
49373}
49374
49375static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
49376                            const SDLoc &DL, EVT VT,
49377                            const X86Subtarget &Subtarget) {
49378  // Example of pattern we try to detect:
49379  // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
49380  //(add (build_vector (extract_elt t, 0),
49381  //                   (extract_elt t, 2),
49382  //                   (extract_elt t, 4),
49383  //                   (extract_elt t, 6)),
49384  //     (build_vector (extract_elt t, 1),
49385  //                   (extract_elt t, 3),
49386  //                   (extract_elt t, 5),
49387  //                   (extract_elt t, 7)))
49388
49389  if (!Subtarget.hasSSE2())
49390    return SDValue();
49391
49392  if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
49393      Op1.getOpcode() != ISD::BUILD_VECTOR)
49394    return SDValue();
49395
49396  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
49397      VT.getVectorNumElements() < 4 ||
49398      !isPowerOf2_32(VT.getVectorNumElements()))
49399    return SDValue();
49400
49401  // Check if one of Op0,Op1 is of the form:
49402  // (build_vector (extract_elt Mul, 0),
49403  //               (extract_elt Mul, 2),
49404  //               (extract_elt Mul, 4),
49405  //                   ...
49406  // the other is of the form:
49407  // (build_vector (extract_elt Mul, 1),
49408  //               (extract_elt Mul, 3),
49409  //               (extract_elt Mul, 5),
49410  //                   ...
49411  // and identify Mul.
49412  SDValue Mul;
49413  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
49414    SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
49415            Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
49416    // TODO: Be more tolerant to undefs.
49417    if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49418        Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49419        Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49420        Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49421      return SDValue();
49422    auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
49423    auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
49424    auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
49425    auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
49426    if (!Const0L || !Const1L || !Const0H || !Const1H)
49427      return SDValue();
49428    unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
49429             Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
49430    // Commutativity of mul allows factors of a product to reorder.
49431    if (Idx0L > Idx1L)
49432      std::swap(Idx0L, Idx1L);
49433    if (Idx0H > Idx1H)
49434      std::swap(Idx0H, Idx1H);
49435    // Commutativity of add allows pairs of factors to reorder.
49436    if (Idx0L > Idx0H) {
49437      std::swap(Idx0L, Idx0H);
49438      std::swap(Idx1L, Idx1H);
49439    }
49440    if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
49441        Idx1H != 2 * i + 3)
49442      return SDValue();
49443    if (!Mul) {
      // First time an extract_elt's source vector is visited. It must be a MUL
      // with twice as many vector elements as the BUILD_VECTOR.
      // Both extracts must be from the same MUL.
49447      Mul = Op0L->getOperand(0);
49448      if (Mul->getOpcode() != ISD::MUL ||
49449          Mul.getValueType().getVectorNumElements() != 2 * e)
49450        return SDValue();
49451    }
49452    // Check that the extract is from the same MUL previously seen.
49453    if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
49454        Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
49455      return SDValue();
49456  }
49457
49458  // Check if the Mul source can be safely shrunk.
49459  ShrinkMode Mode;
49460  if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
49461      Mode == ShrinkMode::MULU16)
49462    return SDValue();
49463
49464  EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49465                                 VT.getVectorNumElements() * 2);
49466  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
49467  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
49468
49469  auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49470                         ArrayRef<SDValue> Ops) {
49471    EVT InVT = Ops[0].getValueType();
49472    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
49473    EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
49474                                 InVT.getVectorNumElements() / 2);
49475    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
49476  };
49477  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
49478}
49479
49480// Attempt to turn this pattern into PMADDWD.
49481// (add (mul (sext (build_vector)), (sext (build_vector))),
49482//      (mul (sext (build_vector)), (sext (build_vector)))
49483static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
49484                              const SDLoc &DL, EVT VT,
49485                              const X86Subtarget &Subtarget) {
49486  if (!Subtarget.hasSSE2())
49487    return SDValue();
49488
49489  if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
49490    return SDValue();
49491
49492  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
49493      VT.getVectorNumElements() < 4 ||
49494      !isPowerOf2_32(VT.getVectorNumElements()))
49495    return SDValue();
49496
49497  SDValue N00 = N0.getOperand(0);
49498  SDValue N01 = N0.getOperand(1);
49499  SDValue N10 = N1.getOperand(0);
49500  SDValue N11 = N1.getOperand(1);
49501
  // All inputs need to be sign extensions.
49503  // TODO: Support ZERO_EXTEND from known positive?
49504  if (N00.getOpcode() != ISD::SIGN_EXTEND ||
49505      N01.getOpcode() != ISD::SIGN_EXTEND ||
49506      N10.getOpcode() != ISD::SIGN_EXTEND ||
49507      N11.getOpcode() != ISD::SIGN_EXTEND)
49508    return SDValue();
49509
49510  // Peek through the extends.
49511  N00 = N00.getOperand(0);
49512  N01 = N01.getOperand(0);
49513  N10 = N10.getOperand(0);
49514  N11 = N11.getOperand(0);
49515
49516  // Must be extending from vXi16.
49517  EVT InVT = N00.getValueType();
49518  if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
49519      N10.getValueType() != InVT || N11.getValueType() != InVT)
49520    return SDValue();
49521
49522  // All inputs should be build_vectors.
49523  if (N00.getOpcode() != ISD::BUILD_VECTOR ||
49524      N01.getOpcode() != ISD::BUILD_VECTOR ||
49525      N10.getOpcode() != ISD::BUILD_VECTOR ||
49526      N11.getOpcode() != ISD::BUILD_VECTOR)
49527    return SDValue();
49528
  // For each output element, we need the even element from one input vector
  // multiplied by the matching even element of the other input vector, added
  // to the product of the corresponding odd elements. That is, for each
  // element i this operation must be performed:
49534  //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
49535  SDValue In0, In1;
49536  for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
49537    SDValue N00Elt = N00.getOperand(i);
49538    SDValue N01Elt = N01.getOperand(i);
49539    SDValue N10Elt = N10.getOperand(i);
49540    SDValue N11Elt = N11.getOperand(i);
49541    // TODO: Be more tolerant to undefs.
49542    if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49543        N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49544        N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49545        N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49546      return SDValue();
49547    auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
49548    auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
49549    auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
49550    auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
49551    if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
49552      return SDValue();
49553    unsigned IdxN00 = ConstN00Elt->getZExtValue();
49554    unsigned IdxN01 = ConstN01Elt->getZExtValue();
49555    unsigned IdxN10 = ConstN10Elt->getZExtValue();
49556    unsigned IdxN11 = ConstN11Elt->getZExtValue();
49557    // Add is commutative so indices can be reordered.
49558    if (IdxN00 > IdxN10) {
49559      std::swap(IdxN00, IdxN10);
49560      std::swap(IdxN01, IdxN11);
49561    }
    // N0 indices must be the even elements. N1 indices must be the next odd
    // elements.
49563    if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
49564        IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
49565      return SDValue();
49566    SDValue N00In = N00Elt.getOperand(0);
49567    SDValue N01In = N01Elt.getOperand(0);
49568    SDValue N10In = N10Elt.getOperand(0);
49569    SDValue N11In = N11Elt.getOperand(0);
49570
    // The first time we find an input, capture it.
49572    if (!In0) {
49573      In0 = N00In;
49574      In1 = N01In;
49575
      // The input vectors must be at least as wide as the output.
      // If they are larger than the output, we extract a subvector below.
49578      if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
49579          In1.getValueSizeInBits() < VT.getSizeInBits())
49580        return SDValue();
49581    }
49582    // Mul is commutative so the input vectors can be in any order.
49583    // Canonicalize to make the compares easier.
49584    if (In0 != N00In)
49585      std::swap(N00In, N01In);
49586    if (In0 != N10In)
49587      std::swap(N10In, N11In);
49588    if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
49589      return SDValue();
49590  }
49591
49592  auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49593                         ArrayRef<SDValue> Ops) {
49594    EVT OpVT = Ops[0].getValueType();
49595    assert(OpVT.getScalarType() == MVT::i16 &&
49596           "Unexpected scalar element type");
49597    assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
49598    EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
49599                                 OpVT.getVectorNumElements() / 2);
49600    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
49601  };
49602
49603  // If the output is narrower than an input, extract the low part of the input
49604  // vector.
49605  EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49606                               VT.getVectorNumElements() * 2);
49607  if (OutVT16.bitsLT(In0.getValueType())) {
49608    In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
49609                      DAG.getIntPtrConstant(0, DL));
49610  }
49611  if (OutVT16.bitsLT(In1.getValueType())) {
49612    In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
49613                      DAG.getIntPtrConstant(0, DL));
49614  }
49615  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
49616                          PMADDBuilder);
49617}
49618
49619static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
49620                          TargetLowering::DAGCombinerInfo &DCI,
49621                          const X86Subtarget &Subtarget) {
49622  EVT VT = N->getValueType(0);
49623  SDValue Op0 = N->getOperand(0);
49624  SDValue Op1 = N->getOperand(1);
49625
49626  if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
49627    return MAdd;
49628  if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
49629    return MAdd;
49630
49631  // Try to synthesize horizontal adds from adds of shuffles.
49632  if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
49633    return V;
49634
49635  // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
49636  // (sub Y, (sext (vXi1 X))).
49637  // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
49638  // generic DAG combine without a legal type check, but adding this there
49639  // caused regressions.
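  // The transform above works because (zext i1 X) is 0 or 1 while
  // (sext i1 X) is 0 or -1, so adding the zero-extended value equals
  // subtracting the sign-extended one.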
49640  if (VT.isVector()) {
49641    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49642    if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
49643        Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
49644        TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
49645      SDLoc DL(N);
49646      SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
49647      return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
49648    }
49649
49650    if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
49651        Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
49652        TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
49653      SDLoc DL(N);
49654      SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
49655      return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
49656    }
49657  }
49658
49659  return combineAddOrSubToADCOrSBB(N, DAG);
49660}
49661
49662static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
49663                          TargetLowering::DAGCombinerInfo &DCI,
49664                          const X86Subtarget &Subtarget) {
49665  SDValue Op0 = N->getOperand(0);
49666  SDValue Op1 = N->getOperand(1);
49667
49668  // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
49669  auto IsNonOpaqueConstant = [&](SDValue Op) {
49670    if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
49671      if (auto *Cst = dyn_cast<ConstantSDNode>(C))
49672        return !Cst->isOpaque();
49673      return true;
49674    }
49675    return false;
49676  };
49677
49678  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction. If the RHS of the sub is an XOR
  // with one use and a constant, invert the immediate, saving one register.
49681  // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
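  // Illustrative example: sub(5, xor(X, 1)) -> add(xor(X, ~1), 6), using
  // -(X ^ C2) == (X ^ ~C2) + 1.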
49682  if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
49683      IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
49684    SDLoc DL(N);
49685    EVT VT = Op0.getValueType();
49686    SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
49687                                 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
49688    SDValue NewAdd =
49689        DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
49690    return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
49691  }
49692
49693  // Try to synthesize horizontal subs from subs of shuffles.
49694  if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
49695    return V;
49696
49697  return combineAddOrSubToADCOrSBB(N, DAG);
49698}
49699
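// Fold X86ISD::PCMPEQ/PCMPGT with identical operands: X == X is all-ones and
// X > X is all-zeros.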
49700static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
49701                                    const X86Subtarget &Subtarget) {
49702  MVT VT = N->getSimpleValueType(0);
49703  SDLoc DL(N);
49704
49705  if (N->getOperand(0) == N->getOperand(1)) {
49706    if (N->getOpcode() == X86ISD::PCMPEQ)
49707      return DAG.getConstant(-1, DL, VT);
49708    if (N->getOpcode() == X86ISD::PCMPGT)
49709      return DAG.getConstant(0, DL, VT);
49710  }
49711
49712  return SDValue();
49713}
49714
49715/// Helper that combines an array of subvector ops as if they were the operands
49716/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
49717/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
49718static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
49719                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
49720                                      TargetLowering::DAGCombinerInfo &DCI,
49721                                      const X86Subtarget &Subtarget) {
49722  assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
49723  unsigned EltSizeInBits = VT.getScalarSizeInBits();
49724
49725  if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
49726    return DAG.getUNDEF(VT);
49727
49728  if (llvm::all_of(Ops, [](SDValue Op) {
49729        return ISD::isBuildVectorAllZeros(Op.getNode());
49730      }))
49731    return getZeroVector(VT, Subtarget, DAG, DL);
49732
49733  SDValue Op0 = Ops[0];
49734  bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
49735
49736  // Repeated subvectors.
49737  if (IsSplat &&
49738      (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
49739    // If this broadcast is inserted into both halves, use a larger broadcast.
49740    if (Op0.getOpcode() == X86ISD::VBROADCAST)
49741      return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
49742
49743    // If this scalar/subvector broadcast_load is inserted into both halves, use
49744    // a larger broadcast_load. Update other uses to use an extracted subvector.
49745    if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
49746        Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
49747      auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
49748      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
49749      SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
49750      SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
49751                                                MemIntr->getMemoryVT(),
49752                                                MemIntr->getMemOperand());
49753      DAG.ReplaceAllUsesOfValueWith(
49754          Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
49755      DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
49756      return BcastLd;
49757    }
49758
49759    // If this is a simple subvector load repeated across multiple lanes, then
49760    // broadcast the load. Update other uses to use an extracted subvector.
49761    if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
49762      if (Ld->isSimple() && !Ld->isNonTemporal() &&
49763          Ld->getExtensionType() == ISD::NON_EXTLOAD) {
49764        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
49765        SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
49766        SDValue BcastLd =
49767            DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
49768                                    Ld->getMemoryVT(), Ld->getMemOperand());
49769        DAG.ReplaceAllUsesOfValueWith(
49770            Op0,
49771            extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
49772        DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
49773        return BcastLd;
49774      }
49775    }
49776
49777    // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
49778    if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
49779        (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
49780      return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
49781                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
49782                                     Op0.getOperand(0),
49783                                     DAG.getIntPtrConstant(0, DL)));
49784
49785    // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
49786    if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
49787        (Subtarget.hasAVX2() ||
49788         (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
49789        Op0.getOperand(0).getValueType() == VT.getScalarType())
49790      return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
49791
49792    // concat_vectors(extract_subvector(broadcast(x)),
49793    //                extract_subvector(broadcast(x))) -> broadcast(x)
49794    if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
49795        Op0.getOperand(0).getValueType() == VT) {
49796      if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
49797          Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
49798        return Op0.getOperand(0);
49799    }
49800  }
49801
49802  // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
  // Only handle concatenation of subvector high halves, which vperm2x128 is
  // best at.
49804  // TODO: This should go in combineX86ShufflesRecursively eventually.
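  // Illustrative example: concat(extract_subvector(V0, NumElts/2),
  //                              extract_subvector(V1, NumElts/2))
  //   --> (vperm2x128 V0, V1, 0x31), i.e. take the high 128-bit half of each
  //   256-bit source.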
49805  if (VT.is256BitVector() && Ops.size() == 2) {
49806    SDValue Src0 = peekThroughBitcasts(Ops[0]);
49807    SDValue Src1 = peekThroughBitcasts(Ops[1]);
49808    if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
49809        Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
49810      EVT SrcVT0 = Src0.getOperand(0).getValueType();
49811      EVT SrcVT1 = Src1.getOperand(0).getValueType();
49812      unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
49813      unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
49814      if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
49815          Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
49816          Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
49817        return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
49818                           DAG.getBitcast(VT, Src0.getOperand(0)),
49819                           DAG.getBitcast(VT, Src1.getOperand(0)),
49820                           DAG.getTargetConstant(0x31, DL, MVT::i8));
49821      }
49822    }
49823  }
49824
49825  // Repeated opcode.
49826  // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
49827  // but it currently struggles with different vector widths.
49828  if (llvm::all_of(Ops, [Op0](SDValue Op) {
49829        return Op.getOpcode() == Op0.getOpcode();
49830      })) {
49831    auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
49832      SmallVector<SDValue> Subs;
49833      for (SDValue SubOp : SubOps)
49834        Subs.push_back(SubOp.getOperand(I));
49835      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
49836    };
49837
49838    unsigned NumOps = Ops.size();
49839    switch (Op0.getOpcode()) {
49840    case X86ISD::SHUFP: {
49841      // Add SHUFPD support if/when necessary.
49842      if (!IsSplat && VT.getScalarType() == MVT::f32 &&
49843          llvm::all_of(Ops, [Op0](SDValue Op) {
49844            return Op.getOperand(2) == Op0.getOperand(2);
49845          })) {
49846        return DAG.getNode(Op0.getOpcode(), DL, VT,
49847                           ConcatSubOperand(VT, Ops, 0),
49848                           ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
49849      }
49850      break;
49851    }
49852    case X86ISD::PSHUFHW:
49853    case X86ISD::PSHUFLW:
49854    case X86ISD::PSHUFD:
49855      if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
49856          Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
49857        return DAG.getNode(Op0.getOpcode(), DL, VT,
49858                           ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
49859      }
49860      LLVM_FALLTHROUGH;
49861    case X86ISD::VPERMILPI:
49862      // TODO - add support for vXf64/vXi64 shuffles.
49863      if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
49864          Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
49865        SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
49866        Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
49867                          Op0.getOperand(1));
49868        return DAG.getBitcast(VT, Res);
49869      }
49870      break;
49871    case X86ISD::VPERMV3:
49872      if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
49873        MVT OpVT = Op0.getSimpleValueType();
49874        int NumSrcElts = OpVT.getVectorNumElements();
49875        SmallVector<int, 64> ConcatMask;
49876        for (unsigned i = 0; i != NumOps; ++i) {
49877          SmallVector<int, 64> SubMask;
49878          SmallVector<SDValue, 2> SubOps;
49879          if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
49880                                    SubMask))
49881            break;
49882          for (int M : SubMask) {
49883            if (0 <= M) {
49884              M += M < NumSrcElts ? 0 : NumSrcElts;
49885              M += i * NumSrcElts;
49886            }
49887            ConcatMask.push_back(M);
49888          }
49889        }
49890        if (ConcatMask.size() == (NumOps * NumSrcElts)) {
49891          SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
49892                                          Ops[1].getOperand(0), DAG, DL);
49893          SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
49894                                          Ops[1].getOperand(2), DAG, DL);
49895          MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
49896          MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
49897          SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
49898          return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
49899        }
49900      }
49901      break;
49902    case X86ISD::VSHLI:
49903    case X86ISD::VSRLI:
      // Special case: AVX1 SHL/SRL of v4i64 by 32 bits can lower as a shuffle.
49905      // TODO: Move this to LowerScalarImmediateShift?
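      // Illustrative sketch: viewing the v4i64 value as v8i32 lanes
      // {x0,x1,x2,x3,x4,x5,x6,x7}, a 32-bit SHL yields {0,x0,0,x2,0,x4,0,x6}
      // and a 32-bit SRL yields {x1,0,x3,0,x5,0,x7,0}, which the shuffles
      // below build by blending with a zero vector.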
49906      if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
49907          llvm::all_of(Ops, [](SDValue Op) {
49908            return Op.getConstantOperandAPInt(1) == 32;
49909          })) {
49910        SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
49911        SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
49912        if (Op0.getOpcode() == X86ISD::VSHLI) {
49913          Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
49914                                     {8, 0, 8, 2, 8, 4, 8, 6});
49915        } else {
49916          Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
49917                                     {1, 8, 3, 8, 5, 8, 7, 8});
49918        }
49919        return DAG.getBitcast(VT, Res);
49920      }
49921      LLVM_FALLTHROUGH;
49922    case X86ISD::VSRAI:
49923      if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
49924           (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
49925            (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
49926          llvm::all_of(Ops, [Op0](SDValue Op) {
49927            return Op0.getOperand(1) == Op.getOperand(1);
49928          })) {
49929        return DAG.getNode(Op0.getOpcode(), DL, VT,
49930                           ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
49931      }
49932      break;
49933    case X86ISD::VPERMI:
49934    case X86ISD::VROTLI:
49935    case X86ISD::VROTRI:
49936      if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
49937          llvm::all_of(Ops, [Op0](SDValue Op) {
49938            return Op0.getOperand(1) == Op.getOperand(1);
49939          })) {
49940        return DAG.getNode(Op0.getOpcode(), DL, VT,
49941                           ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
49942      }
49943      break;
49944    case ISD::AND:
49945    case ISD::OR:
49946    case ISD::XOR:
49947    case X86ISD::ANDNP:
49948      // TODO: Add 256-bit support.
49949      if (!IsSplat && VT.is512BitVector()) {
49950        MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
49951        SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
49952                                 NumOps * SrcVT.getVectorNumElements());
49953        return DAG.getNode(Op0.getOpcode(), DL, VT,
49954                           ConcatSubOperand(SrcVT, Ops, 0),
49955                           ConcatSubOperand(SrcVT, Ops, 1));
49956      }
49957      break;
49958    case X86ISD::HADD:
49959    case X86ISD::HSUB:
49960    case X86ISD::FHADD:
49961    case X86ISD::FHSUB:
49962    case X86ISD::PACKSS:
49963    case X86ISD::PACKUS:
49964      if (!IsSplat && VT.is256BitVector() &&
49965          (VT.isFloatingPoint() || Subtarget.hasInt256())) {
49966        MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
49967        SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
49968                                 NumOps * SrcVT.getVectorNumElements());
49969        return DAG.getNode(Op0.getOpcode(), DL, VT,
49970                           ConcatSubOperand(SrcVT, Ops, 0),
49971                           ConcatSubOperand(SrcVT, Ops, 1));
49972      }
49973      break;
49974    case X86ISD::PALIGNR:
49975      if (!IsSplat &&
49976          ((VT.is256BitVector() && Subtarget.hasInt256()) ||
49977           (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
49978          llvm::all_of(Ops, [Op0](SDValue Op) {
49979            return Op0.getOperand(2) == Op.getOperand(2);
49980          })) {
49981        return DAG.getNode(Op0.getOpcode(), DL, VT,
49982                           ConcatSubOperand(VT, Ops, 0),
49983                           ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
49984      }
49985      break;
49986    }
49987  }
49988
49989  // Fold subvector loads into one.
49990  // If needed, look through bitcasts to get to the load.
49991  if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
49992    bool Fast;
49993    const X86TargetLowering *TLI = Subtarget.getTargetLowering();
49994    if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
49995                                *FirstLd->getMemOperand(), &Fast) &&
49996        Fast) {
49997      if (SDValue Ld =
49998              EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
49999        return Ld;
50000    }
50001  }
50002
50003  return SDValue();
50004}
50005
50006static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
50007                                    TargetLowering::DAGCombinerInfo &DCI,
50008                                    const X86Subtarget &Subtarget) {
50009  EVT VT = N->getValueType(0);
50010  EVT SrcVT = N->getOperand(0).getValueType();
50011  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50012
50013  // Don't do anything for i1 vectors.
50014  if (VT.getVectorElementType() == MVT::i1)
50015    return SDValue();
50016
50017  if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
50018    SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
50019    if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
50020                                           DCI, Subtarget))
50021      return R;
50022  }
50023
50024  return SDValue();
50025}
50026
50027static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
50028                                      TargetLowering::DAGCombinerInfo &DCI,
50029                                      const X86Subtarget &Subtarget) {
50030  if (DCI.isBeforeLegalizeOps())
50031    return SDValue();
50032
50033  MVT OpVT = N->getSimpleValueType(0);
50034
50035  bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
50036
50037  SDLoc dl(N);
50038  SDValue Vec = N->getOperand(0);
50039  SDValue SubVec = N->getOperand(1);
50040
50041  uint64_t IdxVal = N->getConstantOperandVal(2);
50042  MVT SubVecVT = SubVec.getSimpleValueType();
50043
50044  if (Vec.isUndef() && SubVec.isUndef())
50045    return DAG.getUNDEF(OpVT);
50046
50047  // Inserting undefs/zeros into zeros/undefs is a zero vector.
50048  if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
50049      (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
50050    return getZeroVector(OpVT, Subtarget, DAG, dl);
50051
50052  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
50053    // If we're inserting into a zero vector and then into a larger zero vector,
50054    // just insert into the larger zero vector directly.
50055    if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
50056        ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
50057      uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
50058      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50059                         getZeroVector(OpVT, Subtarget, DAG, dl),
50060                         SubVec.getOperand(1),
50061                         DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
50062    }
50063
    // If we're inserting into a zero vector, our input was extracted from an
    // insert into a zero vector of the same type, and the extraction was at
    // least as large as the original insertion, just insert the original
    // subvector into a zero vector.
50068    if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
50069        isNullConstant(SubVec.getOperand(1)) &&
50070        SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
50071      SDValue Ins = SubVec.getOperand(0);
50072      if (isNullConstant(Ins.getOperand(2)) &&
50073          ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
50074          Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
50075              SubVecVT.getFixedSizeInBits())
50076        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50077                           getZeroVector(OpVT, Subtarget, DAG, dl),
50078                           Ins.getOperand(1), N->getOperand(2));
50079    }
50080  }
50081
50082  // Stop here if this is an i1 vector.
50083  if (IsI1Vector)
50084    return SDValue();
50085
50086  // If this is an insert of an extract, combine to a shuffle. Don't do this
50087  // if the insert or extract can be represented with a subregister operation.
50088  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50089      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
50090      (IdxVal != 0 ||
50091       !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
50092    int ExtIdxVal = SubVec.getConstantOperandVal(1);
50093    if (ExtIdxVal != 0) {
50094      int VecNumElts = OpVT.getVectorNumElements();
50095      int SubVecNumElts = SubVecVT.getVectorNumElements();
50096      SmallVector<int, 64> Mask(VecNumElts);
50097      // First create an identity shuffle mask.
50098      for (int i = 0; i != VecNumElts; ++i)
50099        Mask[i] = i;
50100      // Now insert the extracted portion.
50101      for (int i = 0; i != SubVecNumElts; ++i)
50102        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
50103
50104      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
50105    }
50106  }
50107
50108  // Match concat_vector style patterns.
50109  SmallVector<SDValue, 2> SubVectorOps;
50110  if (collectConcatOps(N, SubVectorOps)) {
50111    if (SDValue Fold =
50112            combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
50113      return Fold;
50114
    // If we're inserting all zeros into the upper half, change this to a
    // concat with zero. We will match this to a move with implicit upper bit
    // zeroing during isel. We do this here because we don't want
    // combineConcatVectorOps to create INSERT_SUBVECTOR from CONCAT_VECTORS.
50120    if (SubVectorOps.size() == 2 &&
50121        ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
50122      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50123                         getZeroVector(OpVT, Subtarget, DAG, dl),
50124                         SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
50125  }
50126
50127  // If this is a broadcast insert into an upper undef, use a larger broadcast.
50128  if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
50129    return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
50130
50131  // If this is a broadcast load inserted into an upper undef, use a larger
50132  // broadcast load.
50133  if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
50134      SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
50135    auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
50136    SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
50137    SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
50138    SDValue BcastLd =
50139        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
50140                                MemIntr->getMemoryVT(),
50141                                MemIntr->getMemOperand());
50142    DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
50143    return BcastLd;
50144  }
50145
50146  return SDValue();
50147}
50148
50149/// If we are extracting a subvector of a vector select and the select condition
50150/// is composed of concatenated vectors, try to narrow the select width. This
50151/// is a common pattern for AVX1 integer code because 256-bit selects may be
50152/// legal, but there is almost no integer math/logic available for 256-bit.
50153/// This function should only be called with legal types (otherwise, the calls
50154/// to get simple value types will assert).
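/// Illustrative sketch: for a 128-bit extraction from a 256-bit vselect, this
/// extracts the matching 128-bit pieces of the condition and of both select
/// operands and performs the vselect at 128 bits instead.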
50155static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
50156  SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
50157  SmallVector<SDValue, 4> CatOps;
50158  if (Sel.getOpcode() != ISD::VSELECT ||
50159      !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
50160    return SDValue();
50161
50162  // Note: We assume simple value types because this should only be called with
50163  //       legal operations/types.
50164  // TODO: This can be extended to handle extraction to 256-bits.
50165  MVT VT = Ext->getSimpleValueType(0);
50166  if (!VT.is128BitVector())
50167    return SDValue();
50168
50169  MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
50170  if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
50171    return SDValue();
50172
50173  MVT WideVT = Ext->getOperand(0).getSimpleValueType();
50174  MVT SelVT = Sel.getSimpleValueType();
50175  assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
50176         "Unexpected vector type with legal operations");
50177
50178  unsigned SelElts = SelVT.getVectorNumElements();
50179  unsigned CastedElts = WideVT.getVectorNumElements();
50180  unsigned ExtIdx = Ext->getConstantOperandVal(1);
50181  if (SelElts % CastedElts == 0) {
50182    // The select has the same or more (narrower) elements than the extract
50183    // operand. The extraction index gets scaled by that factor.
50184    ExtIdx *= (SelElts / CastedElts);
50185  } else if (CastedElts % SelElts == 0) {
    // The select has fewer (wider) elements than the extract operand. Make sure
50187    // that the extraction index can be divided evenly.
50188    unsigned IndexDivisor = CastedElts / SelElts;
50189    if (ExtIdx % IndexDivisor != 0)
50190      return SDValue();
50191    ExtIdx /= IndexDivisor;
50192  } else {
50193    llvm_unreachable("Element count of simple vector types are not divisible?");
50194  }
50195
50196  unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
50197  unsigned NarrowElts = SelElts / NarrowingFactor;
50198  MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
50199  SDLoc DL(Ext);
50200  SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
50201  SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
50202  SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
50203  SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
50204  return DAG.getBitcast(VT, NarrowSel);
50205}
50206
50207static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
50208                                       TargetLowering::DAGCombinerInfo &DCI,
50209                                       const X86Subtarget &Subtarget) {
50210  // For AVX1 only, if we are extracting from a 256-bit and+not (which will
50211  // eventually get combined/lowered into ANDNP) with a concatenated operand,
50212  // split the 'and' into 128-bit ops to avoid the concatenate and extract.
50213  // We let generic combining take over from there to simplify the
50214  // insert/extract and 'not'.
50215  // This pattern emerges during AVX1 legalization. We handle it before lowering
50216  // to avoid complications like splitting constant vector loads.
50217
50218  // Capture the original wide type in the likely case that we need to bitcast
50219  // back to this type.
50220  if (!N->getValueType(0).isSimple())
50221    return SDValue();
50222
50223  MVT VT = N->getSimpleValueType(0);
50224  SDValue InVec = N->getOperand(0);
50225  unsigned IdxVal = N->getConstantOperandVal(1);
50226  SDValue InVecBC = peekThroughBitcasts(InVec);
50227  EVT InVecVT = InVec.getValueType();
50228  unsigned SizeInBits = VT.getSizeInBits();
50229  unsigned InSizeInBits = InVecVT.getSizeInBits();
50230  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50231
50232  if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
50233      TLI.isTypeLegal(InVecVT) &&
50234      InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
50235    auto isConcatenatedNot = [](SDValue V) {
50236      V = peekThroughBitcasts(V);
50237      if (!isBitwiseNot(V))
50238        return false;
50239      SDValue NotOp = V->getOperand(0);
50240      return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
50241    };
50242    if (isConcatenatedNot(InVecBC.getOperand(0)) ||
50243        isConcatenatedNot(InVecBC.getOperand(1))) {
50244      // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
50245      SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
50246      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
50247                         DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
50248    }
50249  }
50250
50251  if (DCI.isBeforeLegalizeOps())
50252    return SDValue();
50253
50254  if (SDValue V = narrowExtractedVectorSelect(N, DAG))
50255    return V;
50256
50257  if (ISD::isBuildVectorAllZeros(InVec.getNode()))
50258    return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50259
50260  if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
50261    if (VT.getScalarType() == MVT::i1)
50262      return DAG.getConstant(1, SDLoc(N), VT);
50263    return getOnesVector(VT, DAG, SDLoc(N));
50264  }
50265
50266  if (InVec.getOpcode() == ISD::BUILD_VECTOR)
50267    return DAG.getBuildVector(
50268        VT, SDLoc(N),
50269        InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
50270
  // If we are extracting from an insert into a zero vector, replace with a
  // smaller insert into zero, provided the extraction is at least as wide as
  // the originally inserted subvector. Don't do this for i1 vectors.
50274  if (VT.getVectorElementType() != MVT::i1 &&
50275      InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
50276      InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
50277      ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
50278      InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
50279    SDLoc DL(N);
50280    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
50281                       getZeroVector(VT, Subtarget, DAG, DL),
50282                       InVec.getOperand(1), InVec.getOperand(2));
50283  }
50284
  // If we're extracting an upper subvector from a broadcast, we should just
  // extract the lowest subvector instead, which should allow
  // SimplifyDemandedVectorElts to do more simplifications.
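  // e.g. extract_subvector(vbroadcast(X), 2) produces the same value as
  // extract_subvector(vbroadcast(X), 0), since every subvector is a splat of X.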
50288  if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
50289                      InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
50290                      DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
50291    return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50292
50293  // If we're extracting a broadcasted subvector, just use the lowest subvector.
50294  if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50295      cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
50296    return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50297
50298  // Attempt to extract from the source of a shuffle vector.
50299  if ((InSizeInBits % SizeInBits) == 0 &&
50300      (IdxVal % VT.getVectorNumElements()) == 0) {
50301    SmallVector<int, 32> ShuffleMask;
50302    SmallVector<int, 32> ScaledMask;
50303    SmallVector<SDValue, 2> ShuffleInputs;
50304    unsigned NumSubVecs = InSizeInBits / SizeInBits;
    // Decode the shuffle mask and scale it so it's shuffling subvectors.
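    // e.g. for a v4i64 shuffle with mask <2,3,0,1>, NumSubVecs == 2 scales the
    // mask to <1,0>: extracting subvector 0 maps to source subvector 1.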
50306    if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
50307        scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
50308      unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
50309      if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
50310        return DAG.getUNDEF(VT);
50311      if (ScaledMask[SubVecIdx] == SM_SentinelZero)
50312        return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50313      SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
50314      if (Src.getValueSizeInBits() == InSizeInBits) {
50315        unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
50316        unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
50317        return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
50318                                SDLoc(N), SizeInBits);
50319      }
50320    }
50321  }
50322
50323  // If we're extracting the lowest subvector and we're the only user,
50324  // we may be able to perform this with a smaller vector width.
50325  unsigned InOpcode = InVec.getOpcode();
50326  if (IdxVal == 0 && InVec.hasOneUse()) {
50327    if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
50328      // v2f64 CVTDQ2PD(v4i32).
50329      if (InOpcode == ISD::SINT_TO_FP &&
50330          InVec.getOperand(0).getValueType() == MVT::v4i32) {
50331        return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
50332      }
50333      // v2f64 CVTUDQ2PD(v4i32).
50334      if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
50335          InVec.getOperand(0).getValueType() == MVT::v4i32) {
50336        return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
50337      }
50338      // v2f64 CVTPS2PD(v4f32).
50339      if (InOpcode == ISD::FP_EXTEND &&
50340          InVec.getOperand(0).getValueType() == MVT::v4f32) {
50341        return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
50342      }
50343    }
50344    if ((InOpcode == ISD::ANY_EXTEND ||
50345         InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50346         InOpcode == ISD::ZERO_EXTEND ||
50347         InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
50348         InOpcode == ISD::SIGN_EXTEND ||
50349         InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50350        (SizeInBits == 128 || SizeInBits == 256) &&
50351        InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
50352      SDLoc DL(N);
50353      SDValue Ext = InVec.getOperand(0);
50354      if (Ext.getValueSizeInBits() > SizeInBits)
50355        Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
50356      unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
50357      return DAG.getNode(ExtOp, DL, VT, Ext);
50358    }
50359    if (InOpcode == ISD::VSELECT &&
50360        InVec.getOperand(0).getValueType().is256BitVector() &&
50361        InVec.getOperand(1).getValueType().is256BitVector() &&
50362        InVec.getOperand(2).getValueType().is256BitVector()) {
50363      SDLoc DL(N);
50364      SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
50365      SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
50366      SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
50367      return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
50368    }
50369    if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
50370        (VT.is128BitVector() || VT.is256BitVector())) {
50371      SDLoc DL(N);
50372      SDValue InVecSrc = InVec.getOperand(0);
50373      unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
50374      SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
50375      return DAG.getNode(InOpcode, DL, VT, Ext);
50376    }
50377  }
50378
  // Always split vXi64 logical shifts where we're extracting the upper
  // 32 bits, as this is very likely to fold into a shuffle/truncation.
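  // e.g. extract_subvector (v4i64 vsrli X, 32), 2 becomes
  //      (v2i64 vsrli (extract_subvector X, 2), 32).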
50381  if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
50382      InVecVT.getScalarSizeInBits() == 64 &&
50383      InVec.getConstantOperandAPInt(1) == 32) {
50384    SDLoc DL(N);
50385    SDValue Ext =
50386        extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
50387    return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
50388  }
50389
50390  return SDValue();
50391}
50392
50393static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
50394  EVT VT = N->getValueType(0);
50395  SDValue Src = N->getOperand(0);
50396  SDLoc DL(N);
50397
  // If this is a v1i1 scalar_to_vector of an AND with 1, bypass the AND.
50399  // This occurs frequently in our masked scalar intrinsic code and our
50400  // floating point select lowering with AVX512.
50401  // TODO: SimplifyDemandedBits instead?
50402  if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
50403    if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50404      if (C->getAPIntValue().isOneValue())
50405        return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
50406                           Src.getOperand(0));
50407
50408  // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
50409  if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50410      Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
50411      Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
50412    if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50413      if (C->isNullValue())
50414        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
50415                           Src.getOperand(1));
50416
50417  // Reduce v2i64 to v4i32 if we don't need the upper bits.
50418  // TODO: Move to DAGCombine/SimplifyDemandedBits?
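  // e.g. (v2i64 scalar_to_vector (i64 anyext (i32 X)))
  //      --> (bitcast (v4i32 scalar_to_vector X)).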
50419  if (VT == MVT::v2i64 || VT == MVT::v2f64) {
50420    auto IsAnyExt64 = [](SDValue Op) {
50421      if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
50422        return SDValue();
50423      if (Op.getOpcode() == ISD::ANY_EXTEND &&
50424          Op.getOperand(0).getScalarValueSizeInBits() <= 32)
50425        return Op.getOperand(0);
50426      if (auto *Ld = dyn_cast<LoadSDNode>(Op))
50427        if (Ld->getExtensionType() == ISD::EXTLOAD &&
50428            Ld->getMemoryVT().getScalarSizeInBits() <= 32)
50429          return Op;
50430      return SDValue();
50431    };
50432    if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
50433      return DAG.getBitcast(
50434          VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
50435                          DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
50436  }
50437
50438  // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
50439  if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
50440      Src.getOperand(0).getValueType() == MVT::x86mmx)
50441    return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
50442
  // See if we're broadcasting the scalar value, in which case just reuse that
  // broadcast. Ensure the user is broadcasting this exact SDValue, not merely
  // another result of the same node.
50445  for (SDNode *User : Src->uses())
50446    if (User->getOpcode() == X86ISD::VBROADCAST && Src == User->getOperand(0)) {
50447      unsigned SizeInBits = VT.getFixedSizeInBits();
50448      unsigned BroadcastSizeInBits = User->getValueSizeInBits(0).getFixedSize();
50449      if (BroadcastSizeInBits == SizeInBits)
50450        return SDValue(User, 0);
50451      if (BroadcastSizeInBits > SizeInBits)
50452        return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
50453      // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test coverage.
50454    }
50455
50456  return SDValue();
50457}
50458
50459// Simplify PMULDQ and PMULUDQ operations.
50460static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
50461                             TargetLowering::DAGCombinerInfo &DCI,
50462                             const X86Subtarget &Subtarget) {
50463  SDValue LHS = N->getOperand(0);
50464  SDValue RHS = N->getOperand(1);
50465
50466  // Canonicalize constant to RHS.
50467  if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
50468      !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
50469    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
50470
50471  // Multiply by zero.
50472  // Don't return RHS as it may contain UNDEFs.
50473  if (ISD::isBuildVectorAllZeros(RHS.getNode()))
50474    return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
50475
50476  // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
50477  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50478  if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
50479    return SDValue(N, 0);
50480
50481  // If the input is an extend_invec and the SimplifyDemandedBits call didn't
50482  // convert it to any_extend_invec, due to the LegalOperations check, do the
50483  // conversion directly to a vector shuffle manually. This exposes combine
50484  // opportunities missed by combineEXTEND_VECTOR_INREG not calling
50485  // combineX86ShufflesRecursively on SSE4.1 targets.
50486  // FIXME: This is basically a hack around several other issues related to
50487  // ANY_EXTEND_VECTOR_INREG.
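  // e.g. (pmuludq (v2i64 zext_invec (v4i32 X)), Y)
  //      --> (pmuludq (bitcast (shuffle X, X, <0,-1,1,-1>)), Y).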
50488  if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
50489      (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50490       LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50491      LHS.getOperand(0).getValueType() == MVT::v4i32) {
50492    SDLoc dl(N);
50493    LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
50494                               LHS.getOperand(0), { 0, -1, 1, -1 });
50495    LHS = DAG.getBitcast(MVT::v2i64, LHS);
50496    return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50497  }
50498  if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
50499      (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50500       RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50501      RHS.getOperand(0).getValueType() == MVT::v4i32) {
50502    SDLoc dl(N);
50503    RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
50504                               RHS.getOperand(0), { 0, -1, 1, -1 });
50505    RHS = DAG.getBitcast(MVT::v2i64, RHS);
50506    return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50507  }
50508
50509  return SDValue();
50510}
50511
50512static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
50513                                          TargetLowering::DAGCombinerInfo &DCI,
50514                                          const X86Subtarget &Subtarget) {
50515  EVT VT = N->getValueType(0);
50516  SDValue In = N->getOperand(0);
50517  unsigned Opcode = N->getOpcode();
50518  unsigned InOpcode = In.getOpcode();
50519  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50520
50521  // Try to merge vector loads and extend_inreg to an extload.
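  // e.g. (v4i32 zero_extend_vector_inreg (v8i16 load p))
  //      --> (v4i32 zextload p from v4i16), when that extload is legal.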
50522  if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
50523      In.hasOneUse()) {
50524    auto *Ld = cast<LoadSDNode>(In);
50525    if (Ld->isSimple()) {
50526      MVT SVT = In.getSimpleValueType().getVectorElementType();
50527      ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
50528                                 ? ISD::SEXTLOAD
50529                                 : ISD::ZEXTLOAD;
50530      EVT MemVT = VT.changeVectorElementType(SVT);
50531      if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
50532        SDValue Load =
50533            DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
50534                           Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
50535                           Ld->getMemOperand()->getFlags());
50536        DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
50537        return Load;
50538      }
50539    }
50540  }
50541
50542  // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
50543  if (Opcode == InOpcode)
50544    return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
50545
50546  // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
50547  // -> EXTEND_VECTOR_INREG(X).
50548  // TODO: Handle non-zero subvector indices.
50549  if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
50550      In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
50551      In.getOperand(0).getOperand(0).getValueSizeInBits() ==
50552          In.getValueSizeInBits())
50553    return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
50554
50555  // Attempt to combine as a shuffle.
50556  // TODO: General ZERO_EXTEND_VECTOR_INREG support.
50557  if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50558      (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
50559    SDValue Op(N, 0);
50560    if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
50561      if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50562        return Res;
50563  }
50564
50565  return SDValue();
50566}
50567
50568static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
50569                             TargetLowering::DAGCombinerInfo &DCI) {
50570  EVT VT = N->getValueType(0);
50571
50572  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
50573    return DAG.getConstant(0, SDLoc(N), VT);
50574
50575  APInt KnownUndef, KnownZero;
50576  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50577  APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
50578  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
50579                                     KnownZero, DCI))
50580    return SDValue(N, 0);
50581
50582  return SDValue();
50583}
50584
50585// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
// Done as a combine because lowering fp16_to_fp and fp_to_fp16 individually
// produces extra instructions between the conversions due to going to scalar
// and back.
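// i.e. (f32 fp16_to_fp (fp_to_fp16 X)) becomes
// extractelt (cvtph2ps (cvtps2ph (scalar_to_vector X), 4)), 0.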
50588static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
50589                                 const X86Subtarget &Subtarget) {
50590  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
50591    return SDValue();
50592
50593  if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
50594    return SDValue();
50595
50596  if (N->getValueType(0) != MVT::f32 ||
50597      N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
50598    return SDValue();
50599
50600  SDLoc dl(N);
50601  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
50602                            N->getOperand(0).getOperand(0));
50603  Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
50604                    DAG.getTargetConstant(4, dl, MVT::i32));
50605  Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
50606  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
50607                     DAG.getIntPtrConstant(0, dl));
50608}
50609
50610static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
50611                                const X86Subtarget &Subtarget) {
50612  if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
50613    return SDValue();
50614
50615  bool IsStrict = N->isStrictFPOpcode();
50616  EVT VT = N->getValueType(0);
50617  SDValue Src = N->getOperand(IsStrict ? 1 : 0);
50618  EVT SrcVT = Src.getValueType();
50619
50620  if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
50621    return SDValue();
50622
50623  if (VT.getVectorElementType() != MVT::f32 &&
50624      VT.getVectorElementType() != MVT::f64)
50625    return SDValue();
50626
50627  unsigned NumElts = VT.getVectorNumElements();
50628  if (NumElts == 1 || !isPowerOf2_32(NumElts))
50629    return SDValue();
50630
50631  SDLoc dl(N);
50632
50633  // Convert the input to vXi16.
50634  EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
50635  Src = DAG.getBitcast(IntVT, Src);
50636
50637  // Widen to at least 8 input elements.
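  // e.g. a v2i16 source becomes (v8i16 concat Src, zero, zero, zero), while a
  // v4i16 source becomes (v8i16 concat Src, undef).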
50638  if (NumElts < 8) {
50639    unsigned NumConcats = 8 / NumElts;
50640    SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
50641                                : DAG.getConstant(0, dl, IntVT);
50642    SmallVector<SDValue, 4> Ops(NumConcats, Fill);
50643    Ops[0] = Src;
50644    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
50645  }
50646
50647  // Destination is vXf32 with at least 4 elements.
50648  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
50649                               std::max(4U, NumElts));
50650  SDValue Cvt, Chain;
50651  if (IsStrict) {
50652    Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
50653                      {N->getOperand(0), Src});
50654    Chain = Cvt.getValue(1);
50655  } else {
50656    Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
50657  }
50658
50659  if (NumElts < 4) {
50660    assert(NumElts == 2 && "Unexpected size");
50661    Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
50662                      DAG.getIntPtrConstant(0, dl));
50663  }
50664
50665  if (IsStrict) {
50666    // Extend to the original VT if necessary.
50667    if (Cvt.getValueType() != VT) {
50668      Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
50669                        {Chain, Cvt});
50670      Chain = Cvt.getValue(1);
50671    }
50672    return DAG.getMergeValues({Cvt, Chain}, dl);
50673  }
50674
50675  // Extend to the original VT if necessary.
50676  return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
50677}
50678
50679// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
50680// from. Limit this to cases where the loads have the same input chain and the
50681// output chains are unused. This avoids any memory ordering issues.
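// e.g. a v4f32 X86ISD::VBROADCAST_LOAD from a pointer can reuse a v8f32
// VBROADCAST_LOAD of the same pointer and chain by extracting its low 128 bits.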
50682static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
50683                                     TargetLowering::DAGCombinerInfo &DCI) {
50684  assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
50685          N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
50686         "Unknown broadcast load type");
50687
50688  // Only do this if the chain result is unused.
50689  if (N->hasAnyUseOfValue(1))
50690    return SDValue();
50691
50692  auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
50693
50694  SDValue Ptr = MemIntrin->getBasePtr();
50695  SDValue Chain = MemIntrin->getChain();
50696  EVT VT = N->getSimpleValueType(0);
50697  EVT MemVT = MemIntrin->getMemoryVT();
50698
50699  // Look at other users of our base pointer and try to find a wider broadcast.
50700  // The input chain and the size of the memory VT must match.
50701  for (SDNode *User : Ptr->uses())
50702    if (User != N && User->getOpcode() == N->getOpcode() &&
50703        cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
50704        cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
50705        cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
50706            MemVT.getSizeInBits() &&
50707        !User->hasAnyUseOfValue(1) &&
50708        User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
50709      SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
50710                                         VT.getSizeInBits());
50711      Extract = DAG.getBitcast(VT, Extract);
50712      return DCI.CombineTo(N, Extract, SDValue(User, 1));
50713    }
50714
50715  return SDValue();
50716}
50717
50718static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
50719                               const X86Subtarget &Subtarget) {
50720  if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
50721    return SDValue();
50722
50723  EVT VT = N->getValueType(0);
50724  SDValue Src = N->getOperand(0);
50725  EVT SrcVT = Src.getValueType();
50726
50727  if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
50728      SrcVT.getVectorElementType() != MVT::f32)
50729    return SDValue();
50730
50731  unsigned NumElts = VT.getVectorNumElements();
50732  if (NumElts == 1 || !isPowerOf2_32(NumElts))
50733    return SDValue();
50734
50735  SDLoc dl(N);
50736
50737  // Widen to at least 4 input elements.
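  // e.g. a v2f32 source becomes (v4f32 concat Src, zero).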
50738  if (NumElts < 4)
50739    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
50740                      DAG.getConstantFP(0.0, dl, SrcVT));
50741
50742  // Destination is v8i16 with at least 8 elements.
50743  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
50744                               std::max(8U, NumElts));
50745  SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
50746                            DAG.getTargetConstant(4, dl, MVT::i32));
50747
50748  // Extract down to real number of elements.
50749  if (NumElts < 8) {
50750    EVT IntVT = VT.changeVectorElementTypeToInteger();
50751    Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
50752                      DAG.getIntPtrConstant(0, dl));
50753  }
50754
50755  return DAG.getBitcast(VT, Cvt);
50756}
50757
50758static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
50759  SDValue Src = N->getOperand(0);
50760
50761  // Turn MOVDQ2Q+simple_load into an mmx load.
50762  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
50763    LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
50764
50765    if (LN->isSimple()) {
50766      SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
50767                                  LN->getBasePtr(),
50768                                  LN->getPointerInfo(),
50769                                  LN->getOriginalAlign(),
50770                                  LN->getMemOperand()->getFlags());
50771      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
50772      return NewLd;
50773    }
50774  }
50775
50776  return SDValue();
50777}
50778
50779static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
50780                           TargetLowering::DAGCombinerInfo &DCI) {
50781  unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
50782  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50783  if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50784                               APInt::getAllOnesValue(NumBits), DCI))
50785    return SDValue(N, 0);
50786
50787  return SDValue();
50788}
50789
50790SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
50791                                             DAGCombinerInfo &DCI) const {
50792  SelectionDAG &DAG = DCI.DAG;
50793  switch (N->getOpcode()) {
50794  default: break;
50795  case ISD::SCALAR_TO_VECTOR:
50796    return combineScalarToVector(N, DAG);
50797  case ISD::EXTRACT_VECTOR_ELT:
50798  case X86ISD::PEXTRW:
50799  case X86ISD::PEXTRB:
50800    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
50801  case ISD::CONCAT_VECTORS:
50802    return combineConcatVectors(N, DAG, DCI, Subtarget);
50803  case ISD::INSERT_SUBVECTOR:
50804    return combineInsertSubvector(N, DAG, DCI, Subtarget);
50805  case ISD::EXTRACT_SUBVECTOR:
50806    return combineExtractSubvector(N, DAG, DCI, Subtarget);
50807  case ISD::VSELECT:
50808  case ISD::SELECT:
50809  case X86ISD::BLENDV:      return combineSelect(N, DAG, DCI, Subtarget);
50810  case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
50811  case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
50812  case X86ISD::CMP:         return combineCMP(N, DAG);
50813  case ISD::ADD:            return combineAdd(N, DAG, DCI, Subtarget);
50814  case ISD::SUB:            return combineSub(N, DAG, DCI, Subtarget);
50815  case X86ISD::ADD:
50816  case X86ISD::SUB:         return combineX86AddSub(N, DAG, DCI);
50817  case X86ISD::SBB:         return combineSBB(N, DAG);
50818  case X86ISD::ADC:         return combineADC(N, DAG, DCI);
50819  case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
50820  case ISD::SHL:            return combineShiftLeft(N, DAG);
50821  case ISD::SRA:            return combineShiftRightArithmetic(N, DAG, Subtarget);
50822  case ISD::SRL:            return combineShiftRightLogical(N, DAG, DCI, Subtarget);
50823  case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
50824  case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
50825  case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
50826  case X86ISD::BEXTR:
50827  case X86ISD::BEXTRI:      return combineBEXTR(N, DAG, DCI, Subtarget);
50828  case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
50829  case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
50830  case ISD::STORE:          return combineStore(N, DAG, DCI, Subtarget);
50831  case ISD::MSTORE:         return combineMaskedStore(N, DAG, DCI, Subtarget);
50832  case X86ISD::VEXTRACT_STORE:
50833    return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
50834  case ISD::SINT_TO_FP:
50835  case ISD::STRICT_SINT_TO_FP:
50836    return combineSIntToFP(N, DAG, DCI, Subtarget);
50837  case ISD::UINT_TO_FP:
50838  case ISD::STRICT_UINT_TO_FP:
50839    return combineUIntToFP(N, DAG, Subtarget);
50840  case ISD::FADD:
50841  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
50842  case ISD::FNEG:           return combineFneg(N, DAG, DCI, Subtarget);
50843  case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
50844  case X86ISD::VTRUNC:      return combineVTRUNC(N, DAG, DCI);
50845  case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
50846  case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
50847  case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
50848  case X86ISD::FXOR:
50849  case X86ISD::FOR:         return combineFOr(N, DAG, DCI, Subtarget);
50850  case X86ISD::FMIN:
50851  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
50852  case ISD::FMINNUM:
50853  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
50854  case X86ISD::CVTSI2P:
50855  case X86ISD::CVTUI2P:     return combineX86INT_TO_FP(N, DAG, DCI);
50856  case X86ISD::CVTP2SI:
50857  case X86ISD::CVTP2UI:
50858  case X86ISD::STRICT_CVTTP2SI:
50859  case X86ISD::CVTTP2SI:
50860  case X86ISD::STRICT_CVTTP2UI:
50861  case X86ISD::CVTTP2UI:
50862                            return combineCVTP2I_CVTTP2I(N, DAG, DCI);
50863  case X86ISD::STRICT_CVTPH2PS:
50864  case X86ISD::CVTPH2PS:    return combineCVTPH2PS(N, DAG, DCI);
50865  case X86ISD::BT:          return combineBT(N, DAG, DCI);
50866  case ISD::ANY_EXTEND:
50867  case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
50868  case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
50869  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
50870  case ISD::ANY_EXTEND_VECTOR_INREG:
50871  case ISD::SIGN_EXTEND_VECTOR_INREG:
50872  case ISD::ZERO_EXTEND_VECTOR_INREG:
50873    return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
50874  case ISD::SETCC:          return combineSetCC(N, DAG, DCI, Subtarget);
50875  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
50876  case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
50877  case X86ISD::PACKSS:
50878  case X86ISD::PACKUS:      return combineVectorPack(N, DAG, DCI, Subtarget);
50879  case X86ISD::HADD:
50880  case X86ISD::HSUB:
50881  case X86ISD::FHADD:
50882  case X86ISD::FHSUB:       return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
50883  case X86ISD::VSHL:
50884  case X86ISD::VSRA:
50885  case X86ISD::VSRL:
50886    return combineVectorShiftVar(N, DAG, DCI, Subtarget);
50887  case X86ISD::VSHLI:
50888  case X86ISD::VSRAI:
50889  case X86ISD::VSRLI:
50890    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
50891  case ISD::INSERT_VECTOR_ELT:
50892  case X86ISD::PINSRB:
50893  case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
50894  case X86ISD::SHUFP:       // Handle all target specific shuffles
50895  case X86ISD::INSERTPS:
50896  case X86ISD::EXTRQI:
50897  case X86ISD::INSERTQI:
50898  case X86ISD::VALIGN:
50899  case X86ISD::PALIGNR:
50900  case X86ISD::VSHLDQ:
50901  case X86ISD::VSRLDQ:
50902  case X86ISD::BLENDI:
50903  case X86ISD::UNPCKH:
50904  case X86ISD::UNPCKL:
50905  case X86ISD::MOVHLPS:
50906  case X86ISD::MOVLHPS:
50907  case X86ISD::PSHUFB:
50908  case X86ISD::PSHUFD:
50909  case X86ISD::PSHUFHW:
50910  case X86ISD::PSHUFLW:
50911  case X86ISD::MOVSHDUP:
50912  case X86ISD::MOVSLDUP:
50913  case X86ISD::MOVDDUP:
50914  case X86ISD::MOVSS:
50915  case X86ISD::MOVSD:
50916  case X86ISD::VBROADCAST:
50917  case X86ISD::VPPERM:
50918  case X86ISD::VPERMI:
50919  case X86ISD::VPERMV:
50920  case X86ISD::VPERMV3:
50921  case X86ISD::VPERMIL2:
50922  case X86ISD::VPERMILPI:
50923  case X86ISD::VPERMILPV:
50924  case X86ISD::VPERM2X128:
50925  case X86ISD::SHUF128:
50926  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
50928  case X86ISD::FMADD_RND:
50929  case X86ISD::FMSUB:
50930  case X86ISD::STRICT_FMSUB:
50931  case X86ISD::FMSUB_RND:
50932  case X86ISD::FNMADD:
50933  case X86ISD::STRICT_FNMADD:
50934  case X86ISD::FNMADD_RND:
50935  case X86ISD::FNMSUB:
50936  case X86ISD::STRICT_FNMSUB:
50937  case X86ISD::FNMSUB_RND:
50938  case ISD::FMA:
50939  case ISD::STRICT_FMA:     return combineFMA(N, DAG, DCI, Subtarget);
50940  case X86ISD::FMADDSUB_RND:
50941  case X86ISD::FMSUBADD_RND:
50942  case X86ISD::FMADDSUB:
50943  case X86ISD::FMSUBADD:    return combineFMADDSUB(N, DAG, DCI);
50944  case X86ISD::MOVMSK:      return combineMOVMSK(N, DAG, DCI, Subtarget);
50945  case X86ISD::MGATHER:
50946  case X86ISD::MSCATTER:    return combineX86GatherScatter(N, DAG, DCI);
50947  case ISD::MGATHER:
50948  case ISD::MSCATTER:       return combineGatherScatter(N, DAG, DCI);
50949  case X86ISD::PCMPEQ:
50950  case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
50951  case X86ISD::PMULDQ:
50952  case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI, Subtarget);
50953  case X86ISD::KSHIFTL:
50954  case X86ISD::KSHIFTR:     return combineKSHIFT(N, DAG, DCI);
50955  case ISD::FP16_TO_FP:     return combineFP16_TO_FP(N, DAG, Subtarget);
50956  case ISD::STRICT_FP_EXTEND:
50957  case ISD::FP_EXTEND:      return combineFP_EXTEND(N, DAG, Subtarget);
50958  case ISD::FP_ROUND:       return combineFP_ROUND(N, DAG, Subtarget);
50959  case X86ISD::VBROADCAST_LOAD:
50960  case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
50961  case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
50962  case X86ISD::PDEP:        return combinePDEP(N, DAG, DCI);
50963  }
50964
50965  return SDValue();
50966}
50967
50968bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
50969  if (!isTypeLegal(VT))
50970    return false;
50971
50972  // There are no vXi8 shifts.
50973  if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
50974    return false;
50975
50976  // TODO: Almost no 8-bit ops are desirable because they have no actual
50977  //       size/speed advantages vs. 32-bit ops, but they do have a major
50978  //       potential disadvantage by causing partial register stalls.
50979  //
50980  // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
50981  // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
50982  // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
50983  // check for a constant operand to the multiply.
50984  if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
50985    return false;
50986
50987  // i16 instruction encodings are longer and some i16 instructions are slow,
50988  // so those are not desirable.
50989  if (VT == MVT::i16) {
50990    switch (Opc) {
50991    default:
50992      break;
50993    case ISD::LOAD:
50994    case ISD::SIGN_EXTEND:
50995    case ISD::ZERO_EXTEND:
50996    case ISD::ANY_EXTEND:
50997    case ISD::SHL:
50998    case ISD::SRA:
50999    case ISD::SRL:
51000    case ISD::SUB:
51001    case ISD::ADD:
51002    case ISD::MUL:
51003    case ISD::AND:
51004    case ISD::OR:
51005    case ISD::XOR:
51006      return false;
51007    }
51008  }
51009
  // Any legal type not explicitly accounted for above is desirable.
51011  return true;
51012}
51013
51014SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
51015                                                  SDValue Value, SDValue Addr,
51016                                                  SelectionDAG &DAG) const {
51017  const Module *M = DAG.getMachineFunction().getMMI().getModule();
51018  Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
51019  if (IsCFProtectionSupported) {
    // If control-flow branch protection is enabled, we need to add a notrack
    // prefix to the indirect branch. To do that we create an NT_BRIND SDNode;
    // during ISel, the pattern converts it to a jmp with the NoTrack prefix.
51024    return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
51025  }
51026
51027  return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
51028}
51029
51030bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
51031  EVT VT = Op.getValueType();
51032  bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
51033                             isa<ConstantSDNode>(Op.getOperand(1));
51034
51035  // i16 is legal, but undesirable since i16 instruction encodings are longer
51036  // and some i16 instructions are slow.
51037  // 8-bit multiply-by-constant can usually be expanded to something cheaper
51038  // using LEA and/or other ALU ops.
51039  if (VT != MVT::i16 && !Is8BitMulByConstant)
51040    return false;
51041
51042  auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
51043    if (!Op.hasOneUse())
51044      return false;
51045    SDNode *User = *Op->use_begin();
51046    if (!ISD::isNormalStore(User))
51047      return false;
51048    auto *Ld = cast<LoadSDNode>(Load);
51049    auto *St = cast<StoreSDNode>(User);
51050    return Ld->getBasePtr() == St->getBasePtr();
51051  };
51052
51053  auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
51054    if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
51055      return false;
51056    if (!Op.hasOneUse())
51057      return false;
51058    SDNode *User = *Op->use_begin();
51059    if (User->getOpcode() != ISD::ATOMIC_STORE)
51060      return false;
51061    auto *Ld = cast<AtomicSDNode>(Load);
51062    auto *St = cast<AtomicSDNode>(User);
51063    return Ld->getBasePtr() == St->getBasePtr();
51064  };
51065
51066  bool Commute = false;
51067  switch (Op.getOpcode()) {
51068  default: return false;
51069  case ISD::SIGN_EXTEND:
51070  case ISD::ZERO_EXTEND:
51071  case ISD::ANY_EXTEND:
51072    break;
51073  case ISD::SHL:
51074  case ISD::SRA:
51075  case ISD::SRL: {
51076    SDValue N0 = Op.getOperand(0);
51077    // Look out for (store (shl (load), x)).
51078    if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
51079      return false;
51080    break;
51081  }
51082  case ISD::ADD:
51083  case ISD::MUL:
51084  case ISD::AND:
51085  case ISD::OR:
51086  case ISD::XOR:
51087    Commute = true;
51088    LLVM_FALLTHROUGH;
51089  case ISD::SUB: {
51090    SDValue N0 = Op.getOperand(0);
51091    SDValue N1 = Op.getOperand(1);
51092    // Avoid disabling potential load folding opportunities.
51093    if (MayFoldLoad(N1) &&
51094        (!Commute || !isa<ConstantSDNode>(N0) ||
51095         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
51096      return false;
51097    if (MayFoldLoad(N0) &&
51098        ((Commute && !isa<ConstantSDNode>(N1)) ||
51099         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
51100      return false;
51101    if (IsFoldableAtomicRMW(N0, Op) ||
51102        (Commute && IsFoldableAtomicRMW(N1, Op)))
51103      return false;
51104  }
51105  }
51106
51107  PVT = MVT::i32;
51108  return true;
51109}
51110
51111//===----------------------------------------------------------------------===//
51112//                           X86 Inline Assembly Support
51113//===----------------------------------------------------------------------===//
51114
// Helper to match a string against a sequence of whitespace-separated pieces.
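// e.g. matchAsm("  bswap  $0 ", {"bswap", "$0"}) is true, but
// matchAsm("bswapl $0", {"bswap", "$0"}) is false (no separator after "bswap").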
51116static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
51117  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
51118
51119  for (StringRef Piece : Pieces) {
51120    if (!S.startswith(Piece)) // Check if the piece matches.
51121      return false;
51122
51123    S = S.substr(Piece.size());
51124    StringRef::size_type Pos = S.find_first_not_of(" \t");
51125    if (Pos == 0) // We matched a prefix.
51126      return false;
51127
51128    S = S.substr(Pos);
51129  }
51130
51131  return S.empty();
51132}
51133
51134static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
51135
51136  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
51137    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
51138        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
51139        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
51140
51141      if (AsmPieces.size() == 3)
51142        return true;
51143      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
51144        return true;
51145    }
51146  }
51147  return false;
51148}
51149
51150bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
51151  InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
51152
51153  const std::string &AsmStr = IA->getAsmString();
51154
51155  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
51156  if (!Ty || Ty->getBitWidth() % 16 != 0)
51157    return false;
51158
51159  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
51160  SmallVector<StringRef, 4> AsmPieces;
51161  SplitString(AsmStr, AsmPieces, ";\n");
51162
51163  switch (AsmPieces.size()) {
51164  default: return false;
51165  case 1:
51166    // FIXME: this should verify that we are targeting a 486 or better.  If not,
51167    // we will turn this bswap into something that will be lowered to logical
51168    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
51169    // lower so don't worry about this.
51170    // bswap $0
51171    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
51172        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
51173        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
51174        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
51175        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
51176        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
51177      // No need to check constraints, nothing other than the equivalent of
51178      // "=r,0" would be valid here.
51179      return IntrinsicLowering::LowerToByteSwap(CI);
51180    }
51181
51182    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
51183    if (CI->getType()->isIntegerTy(16) &&
51184        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51185        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
51186         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
51187      AsmPieces.clear();
51188      StringRef ConstraintsStr = IA->getConstraintString();
51189      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51190      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51191      if (clobbersFlagRegisters(AsmPieces))
51192        return IntrinsicLowering::LowerToByteSwap(CI);
51193    }
51194    break;
51195  case 3:
51196    if (CI->getType()->isIntegerTy(32) &&
51197        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51198        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
51199        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
51200        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
51201      AsmPieces.clear();
51202      StringRef ConstraintsStr = IA->getConstraintString();
51203      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51204      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51205      if (clobbersFlagRegisters(AsmPieces))
51206        return IntrinsicLowering::LowerToByteSwap(CI);
51207    }
51208
51209    if (CI->getType()->isIntegerTy(64)) {
51210      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
51211      if (Constraints.size() >= 2 &&
51212          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
51213          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
51214        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
51215        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
51216            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
51217            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
51218          return IntrinsicLowering::LowerToByteSwap(CI);
51219      }
51220    }
51221    break;
51222  }
51223  return false;
51224}
51225
51226static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
51227  X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
51228                           .Case("{@cca}", X86::COND_A)
51229                           .Case("{@ccae}", X86::COND_AE)
51230                           .Case("{@ccb}", X86::COND_B)
51231                           .Case("{@ccbe}", X86::COND_BE)
51232                           .Case("{@ccc}", X86::COND_B)
51233                           .Case("{@cce}", X86::COND_E)
51234                           .Case("{@ccz}", X86::COND_E)
51235                           .Case("{@ccg}", X86::COND_G)
51236                           .Case("{@ccge}", X86::COND_GE)
51237                           .Case("{@ccl}", X86::COND_L)
51238                           .Case("{@ccle}", X86::COND_LE)
51239                           .Case("{@ccna}", X86::COND_BE)
51240                           .Case("{@ccnae}", X86::COND_B)
51241                           .Case("{@ccnb}", X86::COND_AE)
51242                           .Case("{@ccnbe}", X86::COND_A)
51243                           .Case("{@ccnc}", X86::COND_AE)
51244                           .Case("{@ccne}", X86::COND_NE)
51245                           .Case("{@ccnz}", X86::COND_NE)
51246                           .Case("{@ccng}", X86::COND_LE)
51247                           .Case("{@ccnge}", X86::COND_L)
51248                           .Case("{@ccnl}", X86::COND_GE)
51249                           .Case("{@ccnle}", X86::COND_G)
51250                           .Case("{@ccno}", X86::COND_NO)
51251                           .Case("{@ccnp}", X86::COND_NP)
51252                           .Case("{@ccns}", X86::COND_NS)
51253                           .Case("{@cco}", X86::COND_O)
51254                           .Case("{@ccp}", X86::COND_P)
51255                           .Case("{@ccs}", X86::COND_S)
51256                           .Default(X86::COND_INVALID);
51257  return Cond;
51258}
51259
51260/// Given a constraint letter, return the type of constraint for this target.
51261X86TargetLowering::ConstraintType
51262X86TargetLowering::getConstraintType(StringRef Constraint) const {
51263  if (Constraint.size() == 1) {
51264    switch (Constraint[0]) {
51265    case 'R':
51266    case 'q':
51267    case 'Q':
51268    case 'f':
51269    case 't':
51270    case 'u':
51271    case 'y':
51272    case 'x':
51273    case 'v':
51274    case 'l':
51275    case 'k': // AVX512 masking registers.
51276      return C_RegisterClass;
51277    case 'a':
51278    case 'b':
51279    case 'c':
51280    case 'd':
51281    case 'S':
51282    case 'D':
51283    case 'A':
51284      return C_Register;
51285    case 'I':
51286    case 'J':
51287    case 'K':
51288    case 'N':
51289    case 'G':
51290    case 'L':
51291    case 'M':
51292      return C_Immediate;
51293    case 'C':
51294    case 'e':
51295    case 'Z':
51296      return C_Other;
51297    default:
51298      break;
51299    }
  } else if (Constraint.size() == 2) {
51302    switch (Constraint[0]) {
51303    default:
51304      break;
51305    case 'Y':
51306      switch (Constraint[1]) {
51307      default:
51308        break;
51309      case 'z':
51310        return C_Register;
51311      case 'i':
51312      case 'm':
51313      case 'k':
51314      case 't':
51315      case '2':
51316        return C_RegisterClass;
51317      }
51318    }
51319  } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
51320    return C_Other;
51321  return TargetLowering::getConstraintType(Constraint);
51322}
51323
51324/// Examine constraint type and operand type and determine a weight value.
51325/// This object must already have been set up with the operand type
51326/// and the current alternative constraint selected.
51327TargetLowering::ConstraintWeight
51328  X86TargetLowering::getSingleConstraintMatchWeight(
51329    AsmOperandInfo &info, const char *constraint) const {
51330  ConstraintWeight weight = CW_Invalid;
51331  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
51334  if (!CallOperandVal)
51335    return CW_Default;
51336  Type *type = CallOperandVal->getType();
51337  // Look at the constraint type.
51338  switch (*constraint) {
51339  default:
51340    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
51341    LLVM_FALLTHROUGH;
51342  case 'R':
51343  case 'q':
51344  case 'Q':
51345  case 'a':
51346  case 'b':
51347  case 'c':
51348  case 'd':
51349  case 'S':
51350  case 'D':
51351  case 'A':
51352    if (CallOperandVal->getType()->isIntegerTy())
51353      weight = CW_SpecificReg;
51354    break;
51355  case 'f':
51356  case 't':
51357  case 'u':
51358    if (type->isFloatingPointTy())
51359      weight = CW_SpecificReg;
51360    break;
51361  case 'y':
51362    if (type->isX86_MMXTy() && Subtarget.hasMMX())
51363      weight = CW_SpecificReg;
51364    break;
51365  case 'Y':
51366    if (StringRef(constraint).size() != 2)
51367      break;
51368    switch (constraint[1]) {
51369      default:
51370        return CW_Invalid;
51371      // XMM0
51372      case 'z':
51373        if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51374            ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
51375            ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
51376          return CW_SpecificReg;
51377        return CW_Invalid;
51378      // Conditional OpMask regs (AVX512)
51379      case 'k':
51380        if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51381          return CW_Register;
51382        return CW_Invalid;
51383      // Any MMX reg
51384      case 'm':
51385        if (type->isX86_MMXTy() && Subtarget.hasMMX())
51386          return weight;
51387        return CW_Invalid;
51388      // Any SSE reg when ISA >= SSE2, same as 'x'
51389      case 'i':
51390      case 't':
51391      case '2':
51392        if (!Subtarget.hasSSE2())
51393          return CW_Invalid;
51394        break;
51395    }
51396    break;
51397  case 'v':
51398    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
51399      weight = CW_Register;
51400    LLVM_FALLTHROUGH;
51401  case 'x':
51402    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51403        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
51404      weight = CW_Register;
51405    break;
51406  case 'k':
51407    // Enable conditional vector operations using %k<#> registers.
51408    if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51409      weight = CW_Register;
51410    break;
51411  case 'I':
51412    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
51413      if (C->getZExtValue() <= 31)
51414        weight = CW_Constant;
51415    }
51416    break;
51417  case 'J':
51418    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51419      if (C->getZExtValue() <= 63)
51420        weight = CW_Constant;
51421    }
51422    break;
51423  case 'K':
51424    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51425      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
51426        weight = CW_Constant;
51427    }
51428    break;
51429  case 'L':
51430    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51431      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
51432        weight = CW_Constant;
51433    }
51434    break;
51435  case 'M':
51436    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51437      if (C->getZExtValue() <= 3)
51438        weight = CW_Constant;
51439    }
51440    break;
51441  case 'N':
51442    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51443      if (C->getZExtValue() <= 0xff)
51444        weight = CW_Constant;
51445    }
51446    break;
51447  case 'G':
51448  case 'C':
51449    if (isa<ConstantFP>(CallOperandVal)) {
51450      weight = CW_Constant;
51451    }
51452    break;
51453  case 'e':
51454    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51455      if ((C->getSExtValue() >= -0x80000000LL) &&
51456          (C->getSExtValue() <= 0x7fffffffLL))
51457        weight = CW_Constant;
51458    }
51459    break;
51460  case 'Z':
51461    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51462      if (C->getZExtValue() <= 0xffffffff)
51463        weight = CW_Constant;
51464    }
51465    break;
51466  }
51467  return weight;
51468}
51469
51470/// Try to replace an X constraint, which matches anything, with another that
51471/// has more specific requirements based on the type of the corresponding
51472/// operand.
51473const char *X86TargetLowering::
51474LowerXConstraint(EVT ConstraintVT) const {
51475  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
51476  // 'f' like normal targets.
51477  if (ConstraintVT.isFloatingPoint()) {
51478    if (Subtarget.hasSSE1())
51479      return "x";
51480  }
51481
51482  return TargetLowering::LowerXConstraint(ConstraintVT);
51483}
51484
51485// Lower @cc targets via setcc.
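// e.g. a "=@ccz" flag output parses to X86::COND_E: we read EFLAGS,
// materialize an i8 SETCC for that condition, and zero-extend it to the
// constraint type.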
51486SDValue X86TargetLowering::LowerAsmOutputForConstraint(
51487    SDValue &Chain, SDValue &Flag, const SDLoc &DL,
51488    const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
51489  X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
51490  if (Cond == X86::COND_INVALID)
51491    return SDValue();
51492  // Check that return type is valid.
51493  if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
51494      OpInfo.ConstraintVT.getSizeInBits() < 8)
51495    report_fatal_error("Flag output operand is of invalid type");
51496
51497  // Get EFLAGS register. Only update chain when copyfrom is glued.
51498  if (Flag.getNode()) {
51499    Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
51500    Chain = Flag.getValue(1);
51501  } else
51502    Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
51503  // Extract CC code.
51504  SDValue CC = getSETCC(Cond, Flag, DL, DAG);
  // Zero-extend the SETCC result to the constraint's result type.
51506  SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
51507
51508  return Result;
51509}
51510
51511/// Lower the specified operand into the Ops vector.
51512/// If it is invalid, don't add anything to Ops.
51513void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
51514                                                     std::string &Constraint,
51515                                                     std::vector<SDValue>&Ops,
51516                                                     SelectionDAG &DAG) const {
51517  SDValue Result;
51518
51519  // Only support length 1 constraints for now.
51520  if (Constraint.length() > 1) return;
51521
51522  char ConstraintLetter = Constraint[0];
51523  switch (ConstraintLetter) {
51524  default: break;
51525  case 'I':
51526    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51527      if (C->getZExtValue() <= 31) {
51528        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51529                                       Op.getValueType());
51530        break;
51531      }
51532    }
51533    return;
51534  case 'J':
51535    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51536      if (C->getZExtValue() <= 63) {
51537        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51538                                       Op.getValueType());
51539        break;
51540      }
51541    }
51542    return;
51543  case 'K':
51544    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51545      if (isInt<8>(C->getSExtValue())) {
51546        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51547                                       Op.getValueType());
51548        break;
51549      }
51550    }
51551    return;
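  // 'L' only accepts the masks 0xff, 0xffff and, in 64-bit mode, 0xffffffff,
  // i.e. constants usable as zero-extending AND masks.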
51552  case 'L':
51553    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51554      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
51555          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
51556        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
51557                                       Op.getValueType());
51558        break;
51559      }
51560    }
51561    return;
51562  case 'M':
51563    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51564      if (C->getZExtValue() <= 3) {
51565        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51566                                       Op.getValueType());
51567        break;
51568      }
51569    }
51570    return;
51571  case 'N':
51572    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51573      if (C->getZExtValue() <= 255) {
51574        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51575                                       Op.getValueType());
51576        break;
51577      }
51578    }
51579    return;
51580  case 'O':
51581    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51582      if (C->getZExtValue() <= 127) {
51583        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51584                                       Op.getValueType());
51585        break;
51586      }
51587    }
51588    return;
51589  case 'e': {
51590    // 32-bit signed value
51591    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51592      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51593                                           C->getSExtValue())) {
51594        // Widen to 64 bits here to get it sign extended.
51595        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
51596        break;
51597      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
51602  }
51603  case 'Z': {
51604    // 32-bit unsigned value
51605    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51606      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51607                                           C->getZExtValue())) {
51608        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51609                                       Op.getValueType());
51610        break;
51611      }
51612    }
51613    // FIXME gcc accepts some relocatable values here too, but only in certain
51614    // memory models; it's complicated.
51615    return;
51616  }
51617  case 'i': {
51618    // Literal immediates are always ok.
51619    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
51620      bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
51621      BooleanContent BCont = getBooleanContents(MVT::i64);
51622      ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
51623                                    : ISD::SIGN_EXTEND;
51624      int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
51625                                                  : CST->getSExtValue();
51626      Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
51627      break;
51628    }
51629
51630    // In any sort of PIC mode addresses need to be computed at runtime by
51631    // adding in a register or some sort of table lookup.  These can't
51632    // be used as immediates.
51633    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
51634      return;
51635
    // If we are in non-PIC codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
51638    if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
51639      // If we require an extra load to get this address, as in PIC mode, we
51640      // can't accept it.
51641      if (isGlobalStubReference(
51642              Subtarget.classifyGlobalReference(GA->getGlobal())))
51643        return;
51644    break;
51645  }
51646  }
51647
51648  if (Result.getNode()) {
51649    Ops.push_back(Result);
51650    return;
51651  }
51652  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
51653}
51654
51655/// Check if \p RC is a general purpose register class.
51656/// I.e., GR* or one of their variant.
51657static bool isGRClass(const TargetRegisterClass &RC) {
51658  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
51659         RC.hasSuperClassEq(&X86::GR16RegClass) ||
51660         RC.hasSuperClassEq(&X86::GR32RegClass) ||
51661         RC.hasSuperClassEq(&X86::GR64RegClass) ||
51662         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
51663}
51664
51665/// Check if \p RC is a vector register class.
51666/// I.e., FR* / VR* or one of their variant.
51667static bool isFRClass(const TargetRegisterClass &RC) {
51668  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
51669         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
51670         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
51671         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
51672         RC.hasSuperClassEq(&X86::VR512RegClass);
51673}
51674
51675/// Check if \p RC is a mask register class.
51676/// I.e., VK* or one of their variant.
51677static bool isVKClass(const TargetRegisterClass &RC) {
51678  return RC.hasSuperClassEq(&X86::VK1RegClass) ||
51679         RC.hasSuperClassEq(&X86::VK2RegClass) ||
51680         RC.hasSuperClassEq(&X86::VK4RegClass) ||
51681         RC.hasSuperClassEq(&X86::VK8RegClass) ||
51682         RC.hasSuperClassEq(&X86::VK16RegClass) ||
51683         RC.hasSuperClassEq(&X86::VK32RegClass) ||
51684         RC.hasSuperClassEq(&X86::VK64RegClass);
51685}
51686
51687std::pair<unsigned, const TargetRegisterClass *>
51688X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
51689                                                StringRef Constraint,
51690                                                MVT VT) const {
51691  // First, see if this is a constraint that directly corresponds to an LLVM
51692  // register class.
51693  if (Constraint.size() == 1) {
51694    // GCC Constraint Letters
51695    switch (Constraint[0]) {
51696    default: break;
51697    // 'A' means [ER]AX + [ER]DX.
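      // E.g. asm("rdtsc" : "=A"(T)) on a 32-bit target returns its result in
      // the EDX:EAX pair.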
51698    case 'A':
51699      if (Subtarget.is64Bit())
51700        return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
51701      assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
51702             "Expecting 64, 32 or 16 bit subtarget");
51703      return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
51704
51705      // TODO: Slight differences here in allocation order and leaving
51706      // RIP in the class. Do they matter any more here than they do
51707      // in the normal allocation?
51708    case 'k':
51709      if (Subtarget.hasAVX512()) {
51710        if (VT == MVT::i1)
51711          return std::make_pair(0U, &X86::VK1RegClass);
51712        if (VT == MVT::i8)
51713          return std::make_pair(0U, &X86::VK8RegClass);
51714        if (VT == MVT::i16)
51715          return std::make_pair(0U, &X86::VK16RegClass);
51716      }
51717      if (Subtarget.hasBWI()) {
51718        if (VT == MVT::i32)
51719          return std::make_pair(0U, &X86::VK32RegClass);
51720        if (VT == MVT::i64)
51721          return std::make_pair(0U, &X86::VK64RegClass);
51722      }
51723      break;
51724    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
51725      if (Subtarget.is64Bit()) {
51726        if (VT == MVT::i8 || VT == MVT::i1)
51727          return std::make_pair(0U, &X86::GR8RegClass);
51728        if (VT == MVT::i16)
51729          return std::make_pair(0U, &X86::GR16RegClass);
51730        if (VT == MVT::i32 || VT == MVT::f32)
51731          return std::make_pair(0U, &X86::GR32RegClass);
51732        if (VT != MVT::f80)
51733          return std::make_pair(0U, &X86::GR64RegClass);
51734        break;
51735      }
51736      LLVM_FALLTHROUGH;
51737      // 32-bit fallthrough
51738    case 'Q':   // Q_REGS
51739      if (VT == MVT::i8 || VT == MVT::i1)
51740        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
51741      if (VT == MVT::i16)
51742        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
51743      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
51744        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
51745      if (VT != MVT::f80)
51746        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
51747      break;
51748    case 'r':   // GENERAL_REGS
51749    case 'l':   // INDEX_REGS
51750      if (VT == MVT::i8 || VT == MVT::i1)
51751        return std::make_pair(0U, &X86::GR8RegClass);
51752      if (VT == MVT::i16)
51753        return std::make_pair(0U, &X86::GR16RegClass);
51754      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
51755        return std::make_pair(0U, &X86::GR32RegClass);
51756      if (VT != MVT::f80)
51757        return std::make_pair(0U, &X86::GR64RegClass);
51758      break;
51759    case 'R':   // LEGACY_REGS
51760      if (VT == MVT::i8 || VT == MVT::i1)
51761        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
51762      if (VT == MVT::i16)
51763        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
51764      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
51765        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
51766      if (VT != MVT::f80)
51767        return std::make_pair(0U, &X86::GR64_NOREXRegClass);
51768      break;
51769    case 'f':  // FP Stack registers.
51770      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
51771      // value to the correct fpstack register class.
51772      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
51773        return std::make_pair(0U, &X86::RFP32RegClass);
51774      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
51775        return std::make_pair(0U, &X86::RFP64RegClass);
51776      if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
51777        return std::make_pair(0U, &X86::RFP80RegClass);
51778      break;
51779    case 'y':   // MMX_REGS if MMX allowed.
51780      if (!Subtarget.hasMMX()) break;
51781      return std::make_pair(0U, &X86::VR64RegClass);
51782    case 'v':
51783    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
51784      if (!Subtarget.hasSSE1()) break;
51785      bool VConstraint = (Constraint[0] == 'v');
51786
51787      switch (VT.SimpleTy) {
51788      default: break;
51789      // Scalar SSE types.
51790      case MVT::f32:
51791      case MVT::i32:
51792        if (VConstraint && Subtarget.hasVLX())
51793          return std::make_pair(0U, &X86::FR32XRegClass);
51794        return std::make_pair(0U, &X86::FR32RegClass);
51795      case MVT::f64:
51796      case MVT::i64:
51797        if (VConstraint && Subtarget.hasVLX())
51798          return std::make_pair(0U, &X86::FR64XRegClass);
51799        return std::make_pair(0U, &X86::FR64RegClass);
51800      case MVT::i128:
51801        if (Subtarget.is64Bit()) {
51802          if (VConstraint && Subtarget.hasVLX())
51803            return std::make_pair(0U, &X86::VR128XRegClass);
51804          return std::make_pair(0U, &X86::VR128RegClass);
51805        }
51806        break;
51807      // Vector types and fp128.
51808      case MVT::f128:
51809      case MVT::v16i8:
51810      case MVT::v8i16:
51811      case MVT::v4i32:
51812      case MVT::v2i64:
51813      case MVT::v4f32:
51814      case MVT::v2f64:
51815        if (VConstraint && Subtarget.hasVLX())
51816          return std::make_pair(0U, &X86::VR128XRegClass);
51817        return std::make_pair(0U, &X86::VR128RegClass);
51818      // AVX types.
51819      case MVT::v32i8:
51820      case MVT::v16i16:
51821      case MVT::v8i32:
51822      case MVT::v4i64:
51823      case MVT::v8f32:
51824      case MVT::v4f64:
51825        if (VConstraint && Subtarget.hasVLX())
51826          return std::make_pair(0U, &X86::VR256XRegClass);
51827        if (Subtarget.hasAVX())
51828          return std::make_pair(0U, &X86::VR256RegClass);
51829        break;
51830      case MVT::v64i8:
51831      case MVT::v32i16:
51832      case MVT::v8f64:
51833      case MVT::v16f32:
51834      case MVT::v16i32:
51835      case MVT::v8i64:
51836        if (!Subtarget.hasAVX512()) break;
51837        if (VConstraint)
51838          return std::make_pair(0U, &X86::VR512RegClass);
51839        return std::make_pair(0U, &X86::VR512_0_15RegClass);
51840      }
51841      break;
51842    }
51843  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
51844    switch (Constraint[1]) {
51845    default:
51846      break;
51847    case 'i':
51848    case 't':
51849    case '2':
51850      return getRegForInlineAsmConstraint(TRI, "x", VT);
51851    case 'm':
51852      if (!Subtarget.hasMMX()) break;
51853      return std::make_pair(0U, &X86::VR64RegClass);
51854    case 'z':
51855      if (!Subtarget.hasSSE1()) break;
51856      switch (VT.SimpleTy) {
51857      default: break;
51858      // Scalar SSE types.
51859      case MVT::f32:
51860      case MVT::i32:
51861        return std::make_pair(X86::XMM0, &X86::FR32RegClass);
51862      case MVT::f64:
51863      case MVT::i64:
51864        return std::make_pair(X86::XMM0, &X86::FR64RegClass);
51865      case MVT::f128:
51866      case MVT::v16i8:
51867      case MVT::v8i16:
51868      case MVT::v4i32:
51869      case MVT::v2i64:
51870      case MVT::v4f32:
51871      case MVT::v2f64:
51872        return std::make_pair(X86::XMM0, &X86::VR128RegClass);
51873      // AVX types.
51874      case MVT::v32i8:
51875      case MVT::v16i16:
51876      case MVT::v8i32:
51877      case MVT::v4i64:
51878      case MVT::v8f32:
51879      case MVT::v4f64:
51880        if (Subtarget.hasAVX())
51881          return std::make_pair(X86::YMM0, &X86::VR256RegClass);
51882        break;
51883      case MVT::v64i8:
51884      case MVT::v32i16:
51885      case MVT::v8f64:
51886      case MVT::v16f32:
51887      case MVT::v16i32:
51888      case MVT::v8i64:
51889        if (Subtarget.hasAVX512())
51890          return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
51891        break;
51892      }
51893      break;
51894    case 'k':
      // This register class doesn't allocate k0 for masked vector operations.
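      // E.g. a "Yk" writemask operand must never be assigned k0, because k0
      // in the EVEX writemask encoding means "no masking".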
51896      if (Subtarget.hasAVX512()) {
51897        if (VT == MVT::i1)
51898          return std::make_pair(0U, &X86::VK1WMRegClass);
51899        if (VT == MVT::i8)
51900          return std::make_pair(0U, &X86::VK8WMRegClass);
51901        if (VT == MVT::i16)
51902          return std::make_pair(0U, &X86::VK16WMRegClass);
51903      }
51904      if (Subtarget.hasBWI()) {
51905        if (VT == MVT::i32)
51906          return std::make_pair(0U, &X86::VK32WMRegClass);
51907        if (VT == MVT::i64)
51908          return std::make_pair(0U, &X86::VK64WMRegClass);
51909      }
51910      break;
51911    }
51912  }
51913
51914  if (parseConstraintCode(Constraint) != X86::COND_INVALID)
51915    return std::make_pair(0U, &X86::GR32RegClass);
51916
51917  // Use the default implementation in TargetLowering to convert the register
51918  // constraint into a member of a register class.
51919  std::pair<Register, const TargetRegisterClass*> Res;
51920  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
51921
51922  // Not found as a standard register?
51923  if (!Res.second) {
51924    // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
51925    // to/from f80.
    if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 ||
        VT == MVT::f80) {
      // Map st(0) .. st(7) to FP0 .. FP7.
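      // (Explicit register references arrive wrapped in braces, e.g. the
      //  clobber st(1) is seen here as the string "{st(1)}".)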
51928      if (Constraint.size() == 7 && Constraint[0] == '{' &&
51929          tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
51930          Constraint[3] == '(' &&
51931          (Constraint[4] >= '0' && Constraint[4] <= '7') &&
51932          Constraint[5] == ')' && Constraint[6] == '}') {
51933        // st(7) is not allocatable and thus not a member of RFP80. Return
51934        // singleton class in cases where we have a reference to it.
51935        if (Constraint[4] == '7')
51936          return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
51937        return std::make_pair(X86::FP0 + Constraint[4] - '0',
51938                              &X86::RFP80RegClass);
51939      }
51940
51941      // GCC allows "st(0)" to be called just plain "st".
51942      if (StringRef("{st}").equals_lower(Constraint))
51943        return std::make_pair(X86::FP0, &X86::RFP80RegClass);
51944    }
51945
51946    // flags -> EFLAGS
51947    if (StringRef("{flags}").equals_lower(Constraint))
51948      return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
51949
51950    // dirflag -> DF
51951    // Only allow for clobber.
51952    if (StringRef("{dirflag}").equals_lower(Constraint) && VT == MVT::Other)
51953      return std::make_pair(X86::DF, &X86::DFCCRRegClass);
51954
51955    // fpsr -> FPSW
51956    if (StringRef("{fpsr}").equals_lower(Constraint))
51957      return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
51958
51959    return Res;
51960  }
51961
51962  // Make sure it isn't a register that requires 64-bit mode.
51963  if (!Subtarget.is64Bit() &&
51964      (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
51965      TRI->getEncodingValue(Res.first) >= 8) {
51966    // Register requires REX prefix, but we're in 32-bit mode.
51967    return std::make_pair(0, nullptr);
51968  }
51969
51970  // Make sure it isn't a register that requires AVX512.
51971  if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
51972      TRI->getEncodingValue(Res.first) & 0x10) {
51973    // Register requires EVEX prefix.
51974    return std::make_pair(0, nullptr);
51975  }
51976
51977  // Otherwise, check to see if this is a register class of the wrong value
51978  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
51979  // turn into {ax},{dx}.
51980  // MVT::Other is used to specify clobber names.
51981  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
51982    return Res;   // Correct type already, nothing to do.
51983
  // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64-bit integer
  // registers when given an f64 type.
51987  const TargetRegisterClass *Class = Res.second;
51988  // The generic code will match the first register class that contains the
51989  // given register. Thus, based on the ordering of the tablegened file,
51990  // the "plain" GR classes might not come first.
51991  // Therefore, use a helper method.
51992  if (isGRClass(*Class)) {
51993    unsigned Size = VT.getSizeInBits();
51994    if (Size == 1) Size = 8;
51995    Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
51996    if (DestReg > 0) {
51997      bool is64Bit = Subtarget.is64Bit();
51998      const TargetRegisterClass *RC =
51999          Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
52000        : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
52001        : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
52002        : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
52003        : nullptr;
52004      if (Size == 64 && !is64Bit) {
52005        // Model GCC's behavior here and select a fixed pair of 32-bit
52006        // registers.
52007        switch (DestReg) {
52008        case X86::RAX:
52009          return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
52010        case X86::RDX:
52011          return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
52012        case X86::RCX:
52013          return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
52014        case X86::RBX:
52015          return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
52016        case X86::RSI:
52017          return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
52018        case X86::RDI:
52019          return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
52020        case X86::RBP:
52021          return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
52022        default:
52023          return std::make_pair(0, nullptr);
52024        }
52025      }
52026      if (RC && RC->contains(DestReg))
52027        return std::make_pair(DestReg, RC);
52028      return Res;
52029    }
52030    // No register found/type mismatch.
52031    return std::make_pair(0, nullptr);
52032  } else if (isFRClass(*Class)) {
52033    // Handle references to XMM physical registers that got mapped into the
52034    // wrong class.  This can happen with constraints like {xmm0} where the
52035    // target independent register mapper will just pick the first match it can
52036    // find, ignoring the required type.
52037
52038    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
52039    if (VT == MVT::f32 || VT == MVT::i32)
52040      Res.second = &X86::FR32XRegClass;
52041    else if (VT == MVT::f64 || VT == MVT::i64)
52042      Res.second = &X86::FR64XRegClass;
52043    else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
52044      Res.second = &X86::VR128XRegClass;
52045    else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
52046      Res.second = &X86::VR256XRegClass;
52047    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
52048      Res.second = &X86::VR512RegClass;
52049    else {
      // Type mismatch and not a clobber: return an error.
52051      Res.first = 0;
52052      Res.second = nullptr;
52053    }
52054  } else if (isVKClass(*Class)) {
52055    if (VT == MVT::i1)
52056      Res.second = &X86::VK1RegClass;
52057    else if (VT == MVT::i8)
52058      Res.second = &X86::VK8RegClass;
52059    else if (VT == MVT::i16)
52060      Res.second = &X86::VK16RegClass;
52061    else if (VT == MVT::i32)
52062      Res.second = &X86::VK32RegClass;
52063    else if (VT == MVT::i64)
52064      Res.second = &X86::VK64RegClass;
52065    else {
      // Type mismatch and not a clobber: return an error.
52067      Res.first = 0;
52068      Res.second = nullptr;
52069    }
52070  }
52071
52072  return Res;
52073}
52074
52075InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
52076                                                        const AddrMode &AM,
52077                                                        Type *Ty,
52078                                                        unsigned AS) const {
52079  // Scaling factors are not free at all.
52080  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
52081  // will take 2 allocations in the out of order engine instead of 1
52082  // for plain addressing mode, i.e. inst (reg1).
52083  // E.g.,
52084  // vaddps (%rsi,%rdx), %ymm0, %ymm1
52085  // Requires two allocations (one for the load, one for the computation)
52086  // whereas:
52087  // vaddps (%rsi), %ymm0, %ymm1
52088  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro-operations to execute.
52090  //
52091  // For some X86 architectures, this is even worse because for instance for
52092  // stores, the complex addressing mode forces the instruction to use the
52093  // "load" ports instead of the dedicated "store" port.
52094  // E.g., on Haswell:
52095  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
52096  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
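  // E.g. a legal unscaled mode such as (%rdi) costs 0, a scaled/indexed mode
  // such as (%rdi,%rcx,4) costs 1, and an illegal addressing mode is -1.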
52097  if (isLegalAddressingMode(DL, AM, Ty, AS))
52098    // Scale represents reg2 * scale, thus account for 1
52099    // as soon as we use a second register.
52100    return AM.Scale != 0;
52101  return -1;
52102}
52103
52104bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
52108  // The exception to this is vector division. Since x86 doesn't have vector
52109  // integer division, leaving the division as-is is a loss even in terms of
52110  // size, because it will have to be scalarized, while the alternative code
52111  // sequence can be performed in vector form.
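  // E.g. under minsize a scalar 'x / 10' keeps the single div instruction,
  // while a vector division by a constant is still expanded to a
  // multiply-based sequence.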
52112  bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
52113  return OptSize && !VT.isVector();
52114}
52115
52116void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
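  // Split CSR is used for the CXX_FAST_TLS calling convention: callee-saved
  // registers are preserved via explicit copies in the entry and exit blocks
  // (see insertCopiesSplitCSR below) instead of normal prologue/epilogue
  // spills. This target only enables it in 64-bit mode.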
52117  if (!Subtarget.is64Bit())
52118    return;
52119
52120  // Update IsSplitCSR in X86MachineFunctionInfo.
52121  X86MachineFunctionInfo *AFI =
52122      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
52123  AFI->setIsSplitCSR(true);
52124}
52125
52126void X86TargetLowering::insertCopiesSplitCSR(
52127    MachineBasicBlock *Entry,
52128    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
52129  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
52130  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
52131  if (!IStart)
52132    return;
52133
52134  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
52135  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
52136  MachineBasicBlock::iterator MBBI = Entry->begin();
52137  for (const MCPhysReg *I = IStart; *I; ++I) {
52138    const TargetRegisterClass *RC = nullptr;
52139    if (X86::GR64RegClass.contains(*I))
52140      RC = &X86::GR64RegClass;
52141    else
52142      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
52143
52144    Register NewVR = MRI->createVirtualRegister(RC);
52145    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions; that works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
52148    // nounwind. If we want to generalize this later, we may need to emit
52149    // CFI pseudo-instructions.
52150    assert(
52151        Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
52152        "Function should be nounwind in insertCopiesSplitCSR!");
52153    Entry->addLiveIn(*I);
52154    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
52155        .addReg(*I);
52156
52157    // Insert the copy-back instructions right before the terminator.
52158    for (auto *Exit : Exits)
52159      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
52160              TII->get(TargetOpcode::COPY), *I)
52161          .addReg(NewVR);
52162  }
52163}
52164
52165bool X86TargetLowering::supportSwiftError() const {
52166  return Subtarget.is64Bit();
52167}
52168
52169/// Returns true if stack probing through a function call is requested.
52170bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
52171  return !getStackProbeSymbolName(MF).empty();
52172}
52173
52174/// Returns true if stack probing through inline assembly is requested.
52175bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
52176
  // No inline stack probes for Windows; it has its own mechanism.
52178  if (Subtarget.isOSWindows() ||
52179      MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52180    return false;
52181
52182  // If the function specifically requests inline stack probes, emit them.
52183  if (MF.getFunction().hasFnAttribute("probe-stack"))
52184    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
52185           "inline-asm";
52186
52187  return false;
52188}
52189
52190/// Returns the name of the symbol used to emit stack probes or the empty
52191/// string if not applicable.
52192StringRef
52193X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
  // Inline stack probes disable the stack probe call.
52195  if (hasInlineStackProbe(MF))
52196    return "";
52197
52198  // If the function specifically requests stack probes, emit them.
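  // (In IR this is a string function attribute, e.g.
  //  "probe-stack"="my_stack_probe" for some hypothetical probe routine, or
  //  "probe-stack"="inline-asm" to request inline probing.)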
52199  if (MF.getFunction().hasFnAttribute("probe-stack"))
52200    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
52201
52202  // Generally, if we aren't on Windows, the platform ABI does not include
52203  // support for stack probes, so don't emit them.
52204  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
52205      MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52206    return "";
52207
52208  // We need a stack probe to conform to the Windows ABI. Choose the right
52209  // symbol.
52210  if (Subtarget.is64Bit())
52211    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
52212  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
52213}
52214
52215unsigned
52216X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
  // The default stack probe size is 4096 if the function has no
  // "stack-probe-size" attribute.
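  // E.g. "stack-probe-size"="8192" raises the probe interval to 8 KiB.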
52219  unsigned StackProbeSize = 4096;
52220  const Function &Fn = MF.getFunction();
52221  if (Fn.hasFnAttribute("stack-probe-size"))
52222    Fn.getFnAttribute("stack-probe-size")
52223        .getValueAsString()
52224        .getAsInteger(0, StackProbeSize);
52225  return StackProbeSize;
52226}
52227
52228Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
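  // Innermost loops can opt into a stronger alignment via
  // ExperimentalPrefInnermostLoopAlignment (a log2 byte count, so e.g. a value
  // of 5 yields 32-byte alignment); other loops use the generic preference.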
52229  if (ML->isInnermost() &&
52230      ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
52231    return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
52232  return TargetLowering::getPrefLoopAlignment();
52233}
52234