1//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15
16#include "X86TargetTransformInfo.h"
17#include "llvm/IR/IntrinsicInst.h"
18#include "llvm/IR/IntrinsicsX86.h"
19#include "llvm/Support/KnownBits.h"
20#include "llvm/Transforms/InstCombine/InstCombiner.h"
21#include <optional>
22
23using namespace llvm;
24
25#define DEBUG_TYPE "x86tti"
26
27/// Return a constant boolean vector that has true elements in all positions
28/// where the input constant data vector has an element with the sign bit set.
29static Constant *getNegativeIsTrueBoolVec(Constant *V) {
30  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
31  V = ConstantExpr::getBitCast(V, IntTy);
32  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
33                            V);
34  return V;
35}
36
37/// Convert the x86 XMM integer vector mask to a vector of bools based on
38/// each element's most significant bit (the sign bit).
39static Value *getBoolVecFromMask(Value *Mask) {
40  // Fold Constant Mask.
41  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
42    return getNegativeIsTrueBoolVec(ConstantMask);
43
44  // Mask was extended from a boolean vector.
45  Value *ExtMask;
46  if (PatternMatch::match(
47          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
48      ExtMask->getType()->isIntOrIntVectorTy(1))
49    return ExtMask;
50
51  return nullptr;
52}
53
54// TODO: If the x86 backend knew how to convert a bool vector mask back to an
55// XMM register mask efficiently, we could transform all x86 masked intrinsics
56// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
57static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
58  Value *Ptr = II.getOperand(0);
59  Value *Mask = II.getOperand(1);
60  Constant *ZeroVec = Constant::getNullValue(II.getType());
61
62  // Zero Mask - masked load instruction creates a zero vector.
63  if (isa<ConstantAggregateZero>(Mask))
64    return IC.replaceInstUsesWith(II, ZeroVec);
65
66  // The mask is constant or extended from a bool vector. Convert this x86
67  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
68  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
69    // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
70    // the LLVM intrinsic definition for the pointer argument.
71    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
72    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
73    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
74
75    // The pass-through vector for an x86 masked load is a zero vector.
76    CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
77        II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
78    return IC.replaceInstUsesWith(II, NewMaskedLoad);
79  }
80
81  return nullptr;
82}
83
84// TODO: If the x86 backend knew how to convert a bool vector mask back to an
85// XMM register mask efficiently, we could transform all x86 masked intrinsics
86// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
87static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
88  Value *Ptr = II.getOperand(0);
89  Value *Mask = II.getOperand(1);
90  Value *Vec = II.getOperand(2);
91
92  // Zero Mask - this masked store instruction does nothing.
93  if (isa<ConstantAggregateZero>(Mask)) {
94    IC.eraseInstFromFunction(II);
95    return true;
96  }
97
98  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
99  // anything else at this level.
100  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
101    return false;
102
103  // The mask is constant or extended from a bool vector. Convert this x86
104  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
105  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
106    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
107    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
108    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
109
110    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
111
112    // 'Replace uses' doesn't work for stores. Erase the original masked store.
113    IC.eraseInstFromFunction(II);
114    return true;
115  }
116
117  return false;
118}
119
120static Value *simplifyX86immShift(const IntrinsicInst &II,
121                                  InstCombiner::BuilderTy &Builder) {
122  bool LogicalShift = false;
123  bool ShiftLeft = false;
124  bool IsImm = false;
125
126  switch (II.getIntrinsicID()) {
127  default:
128    llvm_unreachable("Unexpected intrinsic!");
129  case Intrinsic::x86_sse2_psrai_d:
130  case Intrinsic::x86_sse2_psrai_w:
131  case Intrinsic::x86_avx2_psrai_d:
132  case Intrinsic::x86_avx2_psrai_w:
133  case Intrinsic::x86_avx512_psrai_q_128:
134  case Intrinsic::x86_avx512_psrai_q_256:
135  case Intrinsic::x86_avx512_psrai_d_512:
136  case Intrinsic::x86_avx512_psrai_q_512:
137  case Intrinsic::x86_avx512_psrai_w_512:
138    IsImm = true;
139    [[fallthrough]];
140  case Intrinsic::x86_sse2_psra_d:
141  case Intrinsic::x86_sse2_psra_w:
142  case Intrinsic::x86_avx2_psra_d:
143  case Intrinsic::x86_avx2_psra_w:
144  case Intrinsic::x86_avx512_psra_q_128:
145  case Intrinsic::x86_avx512_psra_q_256:
146  case Intrinsic::x86_avx512_psra_d_512:
147  case Intrinsic::x86_avx512_psra_q_512:
148  case Intrinsic::x86_avx512_psra_w_512:
149    LogicalShift = false;
150    ShiftLeft = false;
151    break;
152  case Intrinsic::x86_sse2_psrli_d:
153  case Intrinsic::x86_sse2_psrli_q:
154  case Intrinsic::x86_sse2_psrli_w:
155  case Intrinsic::x86_avx2_psrli_d:
156  case Intrinsic::x86_avx2_psrli_q:
157  case Intrinsic::x86_avx2_psrli_w:
158  case Intrinsic::x86_avx512_psrli_d_512:
159  case Intrinsic::x86_avx512_psrli_q_512:
160  case Intrinsic::x86_avx512_psrli_w_512:
161    IsImm = true;
162    [[fallthrough]];
163  case Intrinsic::x86_sse2_psrl_d:
164  case Intrinsic::x86_sse2_psrl_q:
165  case Intrinsic::x86_sse2_psrl_w:
166  case Intrinsic::x86_avx2_psrl_d:
167  case Intrinsic::x86_avx2_psrl_q:
168  case Intrinsic::x86_avx2_psrl_w:
169  case Intrinsic::x86_avx512_psrl_d_512:
170  case Intrinsic::x86_avx512_psrl_q_512:
171  case Intrinsic::x86_avx512_psrl_w_512:
172    LogicalShift = true;
173    ShiftLeft = false;
174    break;
175  case Intrinsic::x86_sse2_pslli_d:
176  case Intrinsic::x86_sse2_pslli_q:
177  case Intrinsic::x86_sse2_pslli_w:
178  case Intrinsic::x86_avx2_pslli_d:
179  case Intrinsic::x86_avx2_pslli_q:
180  case Intrinsic::x86_avx2_pslli_w:
181  case Intrinsic::x86_avx512_pslli_d_512:
182  case Intrinsic::x86_avx512_pslli_q_512:
183  case Intrinsic::x86_avx512_pslli_w_512:
184    IsImm = true;
185    [[fallthrough]];
186  case Intrinsic::x86_sse2_psll_d:
187  case Intrinsic::x86_sse2_psll_q:
188  case Intrinsic::x86_sse2_psll_w:
189  case Intrinsic::x86_avx2_psll_d:
190  case Intrinsic::x86_avx2_psll_q:
191  case Intrinsic::x86_avx2_psll_w:
192  case Intrinsic::x86_avx512_psll_d_512:
193  case Intrinsic::x86_avx512_psll_q_512:
194  case Intrinsic::x86_avx512_psll_w_512:
195    LogicalShift = true;
196    ShiftLeft = true;
197    break;
198  }
199  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
200
201  Value *Vec = II.getArgOperand(0);
202  Value *Amt = II.getArgOperand(1);
203  auto *VT = cast<FixedVectorType>(Vec->getType());
204  Type *SVT = VT->getElementType();
205  Type *AmtVT = Amt->getType();
206  unsigned VWidth = VT->getNumElements();
207  unsigned BitWidth = SVT->getPrimitiveSizeInBits();
208
209  // If the shift amount is guaranteed to be in-range we can replace it with a
210  // generic shift. If its guaranteed to be out of range, logical shifts combine
211  // to zero and arithmetic shifts are clamped to (BitWidth - 1).
212  if (IsImm) {
213    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
214    KnownBits KnownAmtBits =
215        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
216    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
217      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
218      Amt = Builder.CreateVectorSplat(VWidth, Amt);
219      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
220                                        : Builder.CreateLShr(Vec, Amt))
221                           : Builder.CreateAShr(Vec, Amt));
222    }
223    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
224      if (LogicalShift)
225        return ConstantAggregateZero::get(VT);
226      Amt = ConstantInt::get(SVT, BitWidth - 1);
227      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
228    }
229  } else {
230    // Ensure the first element has an in-range value and the rest of the
231    // elements in the bottom 64 bits are zero.
232    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
233           cast<VectorType>(AmtVT)->getElementType() == SVT &&
234           "Unexpected shift-by-scalar type");
235    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
236    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
237    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
238    KnownBits KnownLowerBits = llvm::computeKnownBits(
239        Amt, DemandedLower, II.getModule()->getDataLayout());
240    KnownBits KnownUpperBits = llvm::computeKnownBits(
241        Amt, DemandedUpper, II.getModule()->getDataLayout());
242    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
243        (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
244      SmallVector<int, 16> ZeroSplat(VWidth, 0);
245      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
246      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
247                                        : Builder.CreateLShr(Vec, Amt))
248                           : Builder.CreateAShr(Vec, Amt));
249    }
250  }
251
252  // Simplify if count is constant vector.
253  auto *CDV = dyn_cast<ConstantDataVector>(Amt);
254  if (!CDV)
255    return nullptr;
256
257  // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
258  // operand to compute the shift amount.
259  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
260         cast<VectorType>(AmtVT)->getElementType() == SVT &&
261         "Unexpected shift-by-scalar type");
262
263  // Concatenate the sub-elements to create the 64-bit value.
264  APInt Count(64, 0);
265  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
266    unsigned SubEltIdx = (NumSubElts - 1) - i;
267    auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
268    Count <<= BitWidth;
269    Count |= SubElt->getValue().zextOrTrunc(64);
270  }
271
272  // If shift-by-zero then just return the original value.
273  if (Count.isZero())
274    return Vec;
275
276  // Handle cases when Shift >= BitWidth.
277  if (Count.uge(BitWidth)) {
278    // If LogicalShift - just return zero.
279    if (LogicalShift)
280      return ConstantAggregateZero::get(VT);
281
282    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
283    Count = APInt(64, BitWidth - 1);
284  }
285
286  // Get a constant vector of the same type as the first operand.
287  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
288  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
289
290  if (ShiftLeft)
291    return Builder.CreateShl(Vec, ShiftVec);
292
293  if (LogicalShift)
294    return Builder.CreateLShr(Vec, ShiftVec);
295
296  return Builder.CreateAShr(Vec, ShiftVec);
297}
298
299// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
300// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
301// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
302static Value *simplifyX86varShift(const IntrinsicInst &II,
303                                  InstCombiner::BuilderTy &Builder) {
304  bool LogicalShift = false;
305  bool ShiftLeft = false;
306
307  switch (II.getIntrinsicID()) {
308  default:
309    llvm_unreachable("Unexpected intrinsic!");
310  case Intrinsic::x86_avx2_psrav_d:
311  case Intrinsic::x86_avx2_psrav_d_256:
312  case Intrinsic::x86_avx512_psrav_q_128:
313  case Intrinsic::x86_avx512_psrav_q_256:
314  case Intrinsic::x86_avx512_psrav_d_512:
315  case Intrinsic::x86_avx512_psrav_q_512:
316  case Intrinsic::x86_avx512_psrav_w_128:
317  case Intrinsic::x86_avx512_psrav_w_256:
318  case Intrinsic::x86_avx512_psrav_w_512:
319    LogicalShift = false;
320    ShiftLeft = false;
321    break;
322  case Intrinsic::x86_avx2_psrlv_d:
323  case Intrinsic::x86_avx2_psrlv_d_256:
324  case Intrinsic::x86_avx2_psrlv_q:
325  case Intrinsic::x86_avx2_psrlv_q_256:
326  case Intrinsic::x86_avx512_psrlv_d_512:
327  case Intrinsic::x86_avx512_psrlv_q_512:
328  case Intrinsic::x86_avx512_psrlv_w_128:
329  case Intrinsic::x86_avx512_psrlv_w_256:
330  case Intrinsic::x86_avx512_psrlv_w_512:
331    LogicalShift = true;
332    ShiftLeft = false;
333    break;
334  case Intrinsic::x86_avx2_psllv_d:
335  case Intrinsic::x86_avx2_psllv_d_256:
336  case Intrinsic::x86_avx2_psllv_q:
337  case Intrinsic::x86_avx2_psllv_q_256:
338  case Intrinsic::x86_avx512_psllv_d_512:
339  case Intrinsic::x86_avx512_psllv_q_512:
340  case Intrinsic::x86_avx512_psllv_w_128:
341  case Intrinsic::x86_avx512_psllv_w_256:
342  case Intrinsic::x86_avx512_psllv_w_512:
343    LogicalShift = true;
344    ShiftLeft = true;
345    break;
346  }
347  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
348
349  Value *Vec = II.getArgOperand(0);
350  Value *Amt = II.getArgOperand(1);
351  auto *VT = cast<FixedVectorType>(II.getType());
352  Type *SVT = VT->getElementType();
353  int NumElts = VT->getNumElements();
354  int BitWidth = SVT->getIntegerBitWidth();
355
356  // If the shift amount is guaranteed to be in-range we can replace it with a
357  // generic shift.
358  KnownBits KnownAmt =
359      llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
360  if (KnownAmt.getMaxValue().ult(BitWidth)) {
361    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
362                                      : Builder.CreateLShr(Vec, Amt))
363                         : Builder.CreateAShr(Vec, Amt));
364  }
365
366  // Simplify if all shift amounts are constant/undef.
367  auto *CShift = dyn_cast<Constant>(Amt);
368  if (!CShift)
369    return nullptr;
370
371  // Collect each element's shift amount.
372  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
373  bool AnyOutOfRange = false;
374  SmallVector<int, 8> ShiftAmts;
375  for (int I = 0; I < NumElts; ++I) {
376    auto *CElt = CShift->getAggregateElement(I);
377    if (isa_and_nonnull<UndefValue>(CElt)) {
378      ShiftAmts.push_back(-1);
379      continue;
380    }
381
382    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
383    if (!COp)
384      return nullptr;
385
386    // Handle out of range shifts.
387    // If LogicalShift - set to BitWidth (special case).
388    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
389    APInt ShiftVal = COp->getValue();
390    if (ShiftVal.uge(BitWidth)) {
391      AnyOutOfRange = LogicalShift;
392      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
393      continue;
394    }
395
396    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
397  }
398
399  // If all elements out of range or UNDEF, return vector of zeros/undefs.
400  // ArithmeticShift should only hit this if they are all UNDEF.
401  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
402  if (llvm::all_of(ShiftAmts, OutOfRange)) {
403    SmallVector<Constant *, 8> ConstantVec;
404    for (int Idx : ShiftAmts) {
405      if (Idx < 0) {
406        ConstantVec.push_back(UndefValue::get(SVT));
407      } else {
408        assert(LogicalShift && "Logical shift expected");
409        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
410      }
411    }
412    return ConstantVector::get(ConstantVec);
413  }
414
415  // We can't handle only some out of range values with generic logical shifts.
416  if (AnyOutOfRange)
417    return nullptr;
418
419  // Build the shift amount constant vector.
420  SmallVector<Constant *, 8> ShiftVecAmts;
421  for (int Idx : ShiftAmts) {
422    if (Idx < 0)
423      ShiftVecAmts.push_back(UndefValue::get(SVT));
424    else
425      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
426  }
427  auto ShiftVec = ConstantVector::get(ShiftVecAmts);
428
429  if (ShiftLeft)
430    return Builder.CreateShl(Vec, ShiftVec);
431
432  if (LogicalShift)
433    return Builder.CreateLShr(Vec, ShiftVec);
434
435  return Builder.CreateAShr(Vec, ShiftVec);
436}
437
438static Value *simplifyX86pack(IntrinsicInst &II,
439                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
440  Value *Arg0 = II.getArgOperand(0);
441  Value *Arg1 = II.getArgOperand(1);
442  Type *ResTy = II.getType();
443
444  // Fast all undef handling.
445  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
446    return UndefValue::get(ResTy);
447
448  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
449  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
450  unsigned NumSrcElts = ArgTy->getNumElements();
451  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
452         "Unexpected packing types");
453
454  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
455  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
456  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
457  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
458         "Unexpected packing types");
459
460  // Constant folding.
461  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
462    return nullptr;
463
464  // Clamp Values - signed/unsigned both use signed clamp values, but they
465  // differ on the min/max values.
466  APInt MinValue, MaxValue;
467  if (IsSigned) {
468    // PACKSS: Truncate signed value with signed saturation.
469    // Source values less than dst minint are saturated to minint.
470    // Source values greater than dst maxint are saturated to maxint.
471    MinValue =
472        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
473    MaxValue =
474        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
475  } else {
476    // PACKUS: Truncate signed value with unsigned saturation.
477    // Source values less than zero are saturated to zero.
478    // Source values greater than dst maxuint are saturated to maxuint.
479    MinValue = APInt::getZero(SrcScalarSizeInBits);
480    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
481  }
482
483  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
484  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
485  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
486  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
487  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
488  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
489
490  // Shuffle clamped args together at the lane level.
491  SmallVector<int, 32> PackMask;
492  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
493    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
494      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
495    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
496      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
497  }
498  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
499
500  // Truncate to dst size.
501  return Builder.CreateTrunc(Shuffle, ResTy);
502}
503
504static Value *simplifyX86movmsk(const IntrinsicInst &II,
505                                InstCombiner::BuilderTy &Builder) {
506  Value *Arg = II.getArgOperand(0);
507  Type *ResTy = II.getType();
508
509  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
510  if (isa<UndefValue>(Arg))
511    return Constant::getNullValue(ResTy);
512
513  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
514  // We can't easily peek through x86_mmx types.
515  if (!ArgTy)
516    return nullptr;
517
518  // Expand MOVMSK to compare/bitcast/zext:
519  // e.g. PMOVMSKB(v16i8 x):
520  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
521  // %int = bitcast <16 x i1> %cmp to i16
522  // %res = zext i16 %int to i32
523  unsigned NumElts = ArgTy->getNumElements();
524  Type *IntegerTy = Builder.getIntNTy(NumElts);
525
526  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
527  Res = Builder.CreateIsNeg(Res);
528  Res = Builder.CreateBitCast(Res, IntegerTy);
529  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
530  return Res;
531}
532
533static Value *simplifyX86addcarry(const IntrinsicInst &II,
534                                  InstCombiner::BuilderTy &Builder) {
535  Value *CarryIn = II.getArgOperand(0);
536  Value *Op1 = II.getArgOperand(1);
537  Value *Op2 = II.getArgOperand(2);
538  Type *RetTy = II.getType();
539  Type *OpTy = Op1->getType();
540  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
541         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
542         "Unexpected types for x86 addcarry");
543
544  // If carry-in is zero, this is just an unsigned add with overflow.
545  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
546    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
547                                          {Op1, Op2});
548    // The types have to be adjusted to match the x86 call types.
549    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
550    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
551                                       Builder.getInt8Ty());
552    Value *Res = PoisonValue::get(RetTy);
553    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
554    return Builder.CreateInsertValue(Res, UAddResult, 1);
555  }
556
557  return nullptr;
558}
559
560static Value *simplifyTernarylogic(const IntrinsicInst &II,
561                                   InstCombiner::BuilderTy &Builder) {
562
563  auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
564  if (!ArgImm || ArgImm->getValue().uge(256))
565    return nullptr;
566
567  Value *ArgA = II.getArgOperand(0);
568  Value *ArgB = II.getArgOperand(1);
569  Value *ArgC = II.getArgOperand(2);
570
571  Type *Ty = II.getType();
572
573  auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
574    return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
575  };
576  auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
577    return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
578  };
579  auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
580    return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
581  };
582  auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
583    return {Builder.CreateNot(V.first), ~V.second};
584  };
585  auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
586  auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
587  auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
588
589  bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant());
590  bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant());
591  bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant());
592
593  bool ABIsConst = AIsConst && BIsConst;
594  bool ACIsConst = AIsConst && CIsConst;
595  bool BCIsConst = BIsConst && CIsConst;
596  bool ABCIsConst = AIsConst && BIsConst && CIsConst;
597
598  // Use for verification. Its a big table. Its difficult to go from Imm ->
599  // logic ops, but easy to verify that a set of logic ops is correct. We track
600  // the logic ops through the second value in the pair. At the end it should
601  // equal Imm.
602  std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
603  std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
604  std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
605  std::pair<Value *, uint8_t> Res = {nullptr, 0};
606
607  // Currently we only handle cases that convert directly to another instruction
608  // or cases where all the ops are constant.  This is because we don't properly
609  // handle creating ternary ops in the backend, so splitting them here may
610  // cause regressions. As the backend improves, uncomment more cases.
611
612  uint8_t Imm = ArgImm->getValue().getZExtValue();
613  switch (Imm) {
614  case 0x0:
615    Res = {Constant::getNullValue(Ty), 0};
616    break;
617  case 0x1:
618    if (ABCIsConst)
619      Res = Nor(Or(A, B), C);
620    break;
621  case 0x2:
622    if (ABCIsConst)
623      Res = And(Nor(A, B), C);
624    break;
625  case 0x3:
626    if (ABIsConst)
627      Res = Nor(A, B);
628    break;
629  case 0x4:
630    if (ABCIsConst)
631      Res = And(Nor(A, C), B);
632    break;
633  case 0x5:
634    if (ACIsConst)
635      Res = Nor(A, C);
636    break;
637  case 0x6:
638    if (ABCIsConst)
639      Res = Nor(A, Xnor(B, C));
640    break;
641  case 0x7:
642    if (ABCIsConst)
643      Res = Nor(A, And(B, C));
644    break;
645  case 0x8:
646    if (ABCIsConst)
647      Res = Nor(A, Nand(B, C));
648    break;
649  case 0x9:
650    if (ABCIsConst)
651      Res = Nor(A, Xor(B, C));
652    break;
653  case 0xa:
654    if (ACIsConst)
655      Res = Nor(A, Not(C));
656    break;
657  case 0xb:
658    if (ABCIsConst)
659      Res = Nor(A, Nor(C, Not(B)));
660    break;
661  case 0xc:
662    if (ABIsConst)
663      Res = Nor(A, Not(B));
664    break;
665  case 0xd:
666    if (ABCIsConst)
667      Res = Nor(A, Nor(B, Not(C)));
668    break;
669  case 0xe:
670    if (ABCIsConst)
671      Res = Nor(A, Nor(B, C));
672    break;
673  case 0xf:
674    Res = Not(A);
675    break;
676  case 0x10:
677    if (ABCIsConst)
678      Res = And(A, Nor(B, C));
679    break;
680  case 0x11:
681    if (BCIsConst)
682      Res = Nor(B, C);
683    break;
684  case 0x12:
685    if (ABCIsConst)
686      Res = Nor(Xnor(A, C), B);
687    break;
688  case 0x13:
689    if (ABCIsConst)
690      Res = Nor(And(A, C), B);
691    break;
692  case 0x14:
693    if (ABCIsConst)
694      Res = Nor(Xnor(A, B), C);
695    break;
696  case 0x15:
697    if (ABCIsConst)
698      Res = Nor(And(A, B), C);
699    break;
700  case 0x16:
701    if (ABCIsConst)
702      Res = Xor(Xor(A, B), And(Nand(A, B), C));
703    break;
704  case 0x17:
705    if (ABCIsConst)
706      Res = Xor(Or(A, B), Or(Xnor(A, B), C));
707    break;
708  case 0x18:
709    if (ABCIsConst)
710      Res = Nor(Xnor(A, B), Xnor(A, C));
711    break;
712  case 0x19:
713    if (ABCIsConst)
714      Res = And(Nand(A, B), Xnor(B, C));
715    break;
716  case 0x1a:
717    if (ABCIsConst)
718      Res = Xor(A, Or(And(A, B), C));
719    break;
720  case 0x1b:
721    if (ABCIsConst)
722      Res = Xor(A, Or(Xnor(A, B), C));
723    break;
724  case 0x1c:
725    if (ABCIsConst)
726      Res = Xor(A, Or(And(A, C), B));
727    break;
728  case 0x1d:
729    if (ABCIsConst)
730      Res = Xor(A, Or(Xnor(A, C), B));
731    break;
732  case 0x1e:
733    if (ABCIsConst)
734      Res = Xor(A, Or(B, C));
735    break;
736  case 0x1f:
737    if (ABCIsConst)
738      Res = Nand(A, Or(B, C));
739    break;
740  case 0x20:
741    if (ABCIsConst)
742      Res = Nor(Nand(A, C), B);
743    break;
744  case 0x21:
745    if (ABCIsConst)
746      Res = Nor(Xor(A, C), B);
747    break;
748  case 0x22:
749    if (BCIsConst)
750      Res = Nor(B, Not(C));
751    break;
752  case 0x23:
753    if (ABCIsConst)
754      Res = Nor(B, Nor(C, Not(A)));
755    break;
756  case 0x24:
757    if (ABCIsConst)
758      Res = Nor(Xnor(A, B), Xor(A, C));
759    break;
760  case 0x25:
761    if (ABCIsConst)
762      Res = Xor(A, Nand(Nand(A, B), C));
763    break;
764  case 0x26:
765    if (ABCIsConst)
766      Res = And(Nand(A, B), Xor(B, C));
767    break;
768  case 0x27:
769    if (ABCIsConst)
770      Res = Xor(Or(Xnor(A, B), C), B);
771    break;
772  case 0x28:
773    if (ABCIsConst)
774      Res = And(Xor(A, B), C);
775    break;
776  case 0x29:
777    if (ABCIsConst)
778      Res = Xor(Xor(A, B), Nor(And(A, B), C));
779    break;
780  case 0x2a:
781    if (ABCIsConst)
782      Res = And(Nand(A, B), C);
783    break;
784  case 0x2b:
785    if (ABCIsConst)
786      Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
787    break;
788  case 0x2c:
789    if (ABCIsConst)
790      Res = Nor(Xnor(A, B), Nor(B, C));
791    break;
792  case 0x2d:
793    if (ABCIsConst)
794      Res = Xor(A, Or(B, Not(C)));
795    break;
796  case 0x2e:
797    if (ABCIsConst)
798      Res = Xor(A, Or(Xor(A, C), B));
799    break;
800  case 0x2f:
801    if (ABCIsConst)
802      Res = Nand(A, Or(B, Not(C)));
803    break;
804  case 0x30:
805    if (ABIsConst)
806      Res = Nor(B, Not(A));
807    break;
808  case 0x31:
809    if (ABCIsConst)
810      Res = Nor(Nor(A, Not(C)), B);
811    break;
812  case 0x32:
813    if (ABCIsConst)
814      Res = Nor(Nor(A, C), B);
815    break;
816  case 0x33:
817    Res = Not(B);
818    break;
819  case 0x34:
820    if (ABCIsConst)
821      Res = And(Xor(A, B), Nand(B, C));
822    break;
823  case 0x35:
824    if (ABCIsConst)
825      Res = Xor(B, Or(A, Xnor(B, C)));
826    break;
827  case 0x36:
828    if (ABCIsConst)
829      Res = Xor(Or(A, C), B);
830    break;
831  case 0x37:
832    if (ABCIsConst)
833      Res = Nand(Or(A, C), B);
834    break;
835  case 0x38:
836    if (ABCIsConst)
837      Res = Nor(Xnor(A, B), Nor(A, C));
838    break;
839  case 0x39:
840    if (ABCIsConst)
841      Res = Xor(Or(A, Not(C)), B);
842    break;
843  case 0x3a:
844    if (ABCIsConst)
845      Res = Xor(B, Or(A, Xor(B, C)));
846    break;
847  case 0x3b:
848    if (ABCIsConst)
849      Res = Nand(Or(A, Not(C)), B);
850    break;
851  case 0x3c:
852    Res = Xor(A, B);
853    break;
854  case 0x3d:
855    if (ABCIsConst)
856      Res = Xor(A, Or(Nor(A, C), B));
857    break;
858  case 0x3e:
859    if (ABCIsConst)
860      Res = Xor(A, Or(Nor(A, Not(C)), B));
861    break;
862  case 0x3f:
863    if (ABIsConst)
864      Res = Nand(A, B);
865    break;
866  case 0x40:
867    if (ABCIsConst)
868      Res = Nor(Nand(A, B), C);
869    break;
870  case 0x41:
871    if (ABCIsConst)
872      Res = Nor(Xor(A, B), C);
873    break;
874  case 0x42:
875    if (ABCIsConst)
876      Res = Nor(Xor(A, B), Xnor(A, C));
877    break;
878  case 0x43:
879    if (ABCIsConst)
880      Res = Xor(A, Nand(Nand(A, C), B));
881    break;
882  case 0x44:
883    if (BCIsConst)
884      Res = Nor(C, Not(B));
885    break;
886  case 0x45:
887    if (ABCIsConst)
888      Res = Nor(Nor(B, Not(A)), C);
889    break;
890  case 0x46:
891    if (ABCIsConst)
892      Res = Xor(Or(And(A, C), B), C);
893    break;
894  case 0x47:
895    if (ABCIsConst)
896      Res = Xor(Or(Xnor(A, C), B), C);
897    break;
898  case 0x48:
899    if (ABCIsConst)
900      Res = And(Xor(A, C), B);
901    break;
902  case 0x49:
903    if (ABCIsConst)
904      Res = Xor(Or(Xnor(A, B), And(A, C)), C);
905    break;
906  case 0x4a:
907    if (ABCIsConst)
908      Res = Nor(Xnor(A, C), Nor(B, C));
909    break;
910  case 0x4b:
911    if (ABCIsConst)
912      Res = Xor(A, Or(C, Not(B)));
913    break;
914  case 0x4c:
915    if (ABCIsConst)
916      Res = And(Nand(A, C), B);
917    break;
918  case 0x4d:
919    if (ABCIsConst)
920      Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
921    break;
922  case 0x4e:
923    if (ABCIsConst)
924      Res = Xor(A, Or(Xor(A, B), C));
925    break;
926  case 0x4f:
927    if (ABCIsConst)
928      Res = Nand(A, Nand(B, Not(C)));
929    break;
930  case 0x50:
931    if (ACIsConst)
932      Res = Nor(C, Not(A));
933    break;
934  case 0x51:
935    if (ABCIsConst)
936      Res = Nor(Nor(A, Not(B)), C);
937    break;
938  case 0x52:
939    if (ABCIsConst)
940      Res = And(Xor(A, C), Nand(B, C));
941    break;
942  case 0x53:
943    if (ABCIsConst)
944      Res = Xor(Or(Xnor(B, C), A), C);
945    break;
946  case 0x54:
947    if (ABCIsConst)
948      Res = Nor(Nor(A, B), C);
949    break;
950  case 0x55:
951    Res = Not(C);
952    break;
953  case 0x56:
954    if (ABCIsConst)
955      Res = Xor(Or(A, B), C);
956    break;
957  case 0x57:
958    if (ABCIsConst)
959      Res = Nand(Or(A, B), C);
960    break;
961  case 0x58:
962    if (ABCIsConst)
963      Res = Nor(Nor(A, B), Xnor(A, C));
964    break;
965  case 0x59:
966    if (ABCIsConst)
967      Res = Xor(Or(A, Not(B)), C);
968    break;
969  case 0x5a:
970    Res = Xor(A, C);
971    break;
972  case 0x5b:
973    if (ABCIsConst)
974      Res = Xor(A, Or(Nor(A, B), C));
975    break;
976  case 0x5c:
977    if (ABCIsConst)
978      Res = Xor(Or(Xor(B, C), A), C);
979    break;
980  case 0x5d:
981    if (ABCIsConst)
982      Res = Nand(Or(A, Not(B)), C);
983    break;
984  case 0x5e:
985    if (ABCIsConst)
986      Res = Xor(A, Or(Nor(A, Not(B)), C));
987    break;
988  case 0x5f:
989    if (ACIsConst)
990      Res = Nand(A, C);
991    break;
992  case 0x60:
993    if (ABCIsConst)
994      Res = And(A, Xor(B, C));
995    break;
996  case 0x61:
997    if (ABCIsConst)
998      Res = Xor(Or(Xnor(A, B), And(B, C)), C);
999    break;
1000  case 0x62:
1001    if (ABCIsConst)
1002      Res = Nor(Nor(A, C), Xnor(B, C));
1003    break;
1004  case 0x63:
1005    if (ABCIsConst)
1006      Res = Xor(B, Or(C, Not(A)));
1007    break;
1008  case 0x64:
1009    if (ABCIsConst)
1010      Res = Nor(Nor(A, B), Xnor(B, C));
1011    break;
1012  case 0x65:
1013    if (ABCIsConst)
1014      Res = Xor(Or(B, Not(A)), C);
1015    break;
1016  case 0x66:
1017    Res = Xor(B, C);
1018    break;
1019  case 0x67:
1020    if (ABCIsConst)
1021      Res = Or(Nor(A, B), Xor(B, C));
1022    break;
1023  case 0x68:
1024    if (ABCIsConst)
1025      Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1026    break;
1027  case 0x69:
1028    if (ABCIsConst)
1029      Res = Xor(Xnor(A, B), C);
1030    break;
1031  case 0x6a:
1032    if (ABCIsConst)
1033      Res = Xor(And(A, B), C);
1034    break;
1035  case 0x6b:
1036    if (ABCIsConst)
1037      Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1038    break;
1039  case 0x6c:
1040    if (ABCIsConst)
1041      Res = Xor(And(A, C), B);
1042    break;
1043  case 0x6d:
1044    if (ABCIsConst)
1045      Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1046    break;
1047  case 0x6e:
1048    if (ABCIsConst)
1049      Res = Or(Nor(A, Not(B)), Xor(B, C));
1050    break;
1051  case 0x6f:
1052    if (ABCIsConst)
1053      Res = Nand(A, Xnor(B, C));
1054    break;
1055  case 0x70:
1056    if (ABCIsConst)
1057      Res = And(A, Nand(B, C));
1058    break;
1059  case 0x71:
1060    if (ABCIsConst)
1061      Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1062    break;
1063  case 0x72:
1064    if (ABCIsConst)
1065      Res = Xor(Or(Xor(A, B), C), B);
1066    break;
1067  case 0x73:
1068    if (ABCIsConst)
1069      Res = Nand(Nand(A, Not(C)), B);
1070    break;
1071  case 0x74:
1072    if (ABCIsConst)
1073      Res = Xor(Or(Xor(A, C), B), C);
1074    break;
1075  case 0x75:
1076    if (ABCIsConst)
1077      Res = Nand(Nand(A, Not(B)), C);
1078    break;
1079  case 0x76:
1080    if (ABCIsConst)
1081      Res = Xor(B, Or(Nor(B, Not(A)), C));
1082    break;
1083  case 0x77:
1084    if (BCIsConst)
1085      Res = Nand(B, C);
1086    break;
1087  case 0x78:
1088    if (ABCIsConst)
1089      Res = Xor(A, And(B, C));
1090    break;
1091  case 0x79:
1092    if (ABCIsConst)
1093      Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1094    break;
1095  case 0x7a:
1096    if (ABCIsConst)
1097      Res = Or(Xor(A, C), Nor(B, Not(A)));
1098    break;
1099  case 0x7b:
1100    if (ABCIsConst)
1101      Res = Nand(Xnor(A, C), B);
1102    break;
1103  case 0x7c:
1104    if (ABCIsConst)
1105      Res = Or(Xor(A, B), Nor(C, Not(A)));
1106    break;
1107  case 0x7d:
1108    if (ABCIsConst)
1109      Res = Nand(Xnor(A, B), C);
1110    break;
1111  case 0x7e:
1112    if (ABCIsConst)
1113      Res = Or(Xor(A, B), Xor(A, C));
1114    break;
1115  case 0x7f:
1116    if (ABCIsConst)
1117      Res = Nand(And(A, B), C);
1118    break;
1119  case 0x80:
1120    if (ABCIsConst)
1121      Res = And(And(A, B), C);
1122    break;
1123  case 0x81:
1124    if (ABCIsConst)
1125      Res = Nor(Xor(A, B), Xor(A, C));
1126    break;
1127  case 0x82:
1128    if (ABCIsConst)
1129      Res = And(Xnor(A, B), C);
1130    break;
1131  case 0x83:
1132    if (ABCIsConst)
1133      Res = Nor(Xor(A, B), Nor(C, Not(A)));
1134    break;
1135  case 0x84:
1136    if (ABCIsConst)
1137      Res = And(Xnor(A, C), B);
1138    break;
1139  case 0x85:
1140    if (ABCIsConst)
1141      Res = Nor(Xor(A, C), Nor(B, Not(A)));
1142    break;
1143  case 0x86:
1144    if (ABCIsConst)
1145      Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1146    break;
1147  case 0x87:
1148    if (ABCIsConst)
1149      Res = Xor(A, Nand(B, C));
1150    break;
1151  case 0x88:
1152    Res = And(B, C);
1153    break;
1154  case 0x89:
1155    if (ABCIsConst)
1156      Res = Xor(B, Nor(Nor(B, Not(A)), C));
1157    break;
1158  case 0x8a:
1159    if (ABCIsConst)
1160      Res = And(Nand(A, Not(B)), C);
1161    break;
1162  case 0x8b:
1163    if (ABCIsConst)
1164      Res = Xor(Nor(Xor(A, C), B), C);
1165    break;
1166  case 0x8c:
1167    if (ABCIsConst)
1168      Res = And(Nand(A, Not(C)), B);
1169    break;
1170  case 0x8d:
1171    if (ABCIsConst)
1172      Res = Xor(Nor(Xor(A, B), C), B);
1173    break;
1174  case 0x8e:
1175    if (ABCIsConst)
1176      Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1177    break;
1178  case 0x8f:
1179    if (ABCIsConst)
1180      Res = Nand(A, Nand(B, C));
1181    break;
1182  case 0x90:
1183    if (ABCIsConst)
1184      Res = And(A, Xnor(B, C));
1185    break;
1186  case 0x91:
1187    if (ABCIsConst)
1188      Res = Nor(Nor(A, Not(B)), Xor(B, C));
1189    break;
1190  case 0x92:
1191    if (ABCIsConst)
1192      Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1193    break;
1194  case 0x93:
1195    if (ABCIsConst)
1196      Res = Xor(Nand(A, C), B);
1197    break;
1198  case 0x94:
1199    if (ABCIsConst)
1200      Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1201    break;
1202  case 0x95:
1203    if (ABCIsConst)
1204      Res = Xor(Nand(A, B), C);
1205    break;
1206  case 0x96:
1207    if (ABCIsConst)
1208      Res = Xor(Xor(A, B), C);
1209    break;
1210  case 0x97:
1211    if (ABCIsConst)
1212      Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1213    break;
1214  case 0x98:
1215    if (ABCIsConst)
1216      Res = Nor(Nor(A, B), Xor(B, C));
1217    break;
1218  case 0x99:
1219    if (BCIsConst)
1220      Res = Xnor(B, C);
1221    break;
1222  case 0x9a:
1223    if (ABCIsConst)
1224      Res = Xor(Nor(B, Not(A)), C);
1225    break;
1226  case 0x9b:
1227    if (ABCIsConst)
1228      Res = Or(Nor(A, B), Xnor(B, C));
1229    break;
1230  case 0x9c:
1231    if (ABCIsConst)
1232      Res = Xor(B, Nor(C, Not(A)));
1233    break;
1234  case 0x9d:
1235    if (ABCIsConst)
1236      Res = Or(Nor(A, C), Xnor(B, C));
1237    break;
1238  case 0x9e:
1239    if (ABCIsConst)
1240      Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1241    break;
1242  case 0x9f:
1243    if (ABCIsConst)
1244      Res = Nand(A, Xor(B, C));
1245    break;
1246  case 0xa0:
1247    Res = And(A, C);
1248    break;
1249  case 0xa1:
1250    if (ABCIsConst)
1251      Res = Xor(A, Nor(Nor(A, Not(B)), C));
1252    break;
1253  case 0xa2:
1254    if (ABCIsConst)
1255      Res = And(Or(A, Not(B)), C);
1256    break;
1257  case 0xa3:
1258    if (ABCIsConst)
1259      Res = Xor(Nor(Xor(B, C), A), C);
1260    break;
1261  case 0xa4:
1262    if (ABCIsConst)
1263      Res = Xor(A, Nor(Nor(A, B), C));
1264    break;
1265  case 0xa5:
1266    if (ACIsConst)
1267      Res = Xnor(A, C);
1268    break;
1269  case 0xa6:
1270    if (ABCIsConst)
1271      Res = Xor(Nor(A, Not(B)), C);
1272    break;
1273  case 0xa7:
1274    if (ABCIsConst)
1275      Res = Or(Nor(A, B), Xnor(A, C));
1276    break;
1277  case 0xa8:
1278    if (ABCIsConst)
1279      Res = And(Or(A, B), C);
1280    break;
1281  case 0xa9:
1282    if (ABCIsConst)
1283      Res = Xor(Nor(A, B), C);
1284    break;
1285  case 0xaa:
1286    Res = C;
1287    break;
1288  case 0xab:
1289    if (ABCIsConst)
1290      Res = Or(Nor(A, B), C);
1291    break;
1292  case 0xac:
1293    if (ABCIsConst)
1294      Res = Xor(Nor(Xnor(B, C), A), C);
1295    break;
1296  case 0xad:
1297    if (ABCIsConst)
1298      Res = Or(Xnor(A, C), And(B, C));
1299    break;
1300  case 0xae:
1301    if (ABCIsConst)
1302      Res = Or(Nor(A, Not(B)), C);
1303    break;
1304  case 0xaf:
1305    if (ACIsConst)
1306      Res = Or(C, Not(A));
1307    break;
1308  case 0xb0:
1309    if (ABCIsConst)
1310      Res = And(A, Nand(B, Not(C)));
1311    break;
1312  case 0xb1:
1313    if (ABCIsConst)
1314      Res = Xor(A, Nor(Xor(A, B), C));
1315    break;
1316  case 0xb2:
1317    if (ABCIsConst)
1318      Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1319    break;
1320  case 0xb3:
1321    if (ABCIsConst)
1322      Res = Nand(Nand(A, C), B);
1323    break;
1324  case 0xb4:
1325    if (ABCIsConst)
1326      Res = Xor(A, Nor(C, Not(B)));
1327    break;
1328  case 0xb5:
1329    if (ABCIsConst)
1330      Res = Or(Xnor(A, C), Nor(B, C));
1331    break;
1332  case 0xb6:
1333    if (ABCIsConst)
1334      Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1335    break;
1336  case 0xb7:
1337    if (ABCIsConst)
1338      Res = Nand(Xor(A, C), B);
1339    break;
1340  case 0xb8:
1341    if (ABCIsConst)
1342      Res = Xor(Nor(Xnor(A, C), B), C);
1343    break;
1344  case 0xb9:
1345    if (ABCIsConst)
1346      Res = Xor(Nor(And(A, C), B), C);
1347    break;
1348  case 0xba:
1349    if (ABCIsConst)
1350      Res = Or(Nor(B, Not(A)), C);
1351    break;
1352  case 0xbb:
1353    if (BCIsConst)
1354      Res = Or(C, Not(B));
1355    break;
1356  case 0xbc:
1357    if (ABCIsConst)
1358      Res = Xor(A, And(Nand(A, C), B));
1359    break;
1360  case 0xbd:
1361    if (ABCIsConst)
1362      Res = Or(Xor(A, B), Xnor(A, C));
1363    break;
1364  case 0xbe:
1365    if (ABCIsConst)
1366      Res = Or(Xor(A, B), C);
1367    break;
1368  case 0xbf:
1369    if (ABCIsConst)
1370      Res = Or(Nand(A, B), C);
1371    break;
1372  case 0xc0:
1373    Res = And(A, B);
1374    break;
1375  case 0xc1:
1376    if (ABCIsConst)
1377      Res = Xor(A, Nor(Nor(A, Not(C)), B));
1378    break;
1379  case 0xc2:
1380    if (ABCIsConst)
1381      Res = Xor(A, Nor(Nor(A, C), B));
1382    break;
1383  case 0xc3:
1384    if (ABIsConst)
1385      Res = Xnor(A, B);
1386    break;
1387  case 0xc4:
1388    if (ABCIsConst)
1389      Res = And(Or(A, Not(C)), B);
1390    break;
1391  case 0xc5:
1392    if (ABCIsConst)
1393      Res = Xor(B, Nor(A, Xor(B, C)));
1394    break;
1395  case 0xc6:
1396    if (ABCIsConst)
1397      Res = Xor(Nor(A, Not(C)), B);
1398    break;
1399  case 0xc7:
1400    if (ABCIsConst)
1401      Res = Or(Xnor(A, B), Nor(A, C));
1402    break;
1403  case 0xc8:
1404    if (ABCIsConst)
1405      Res = And(Or(A, C), B);
1406    break;
1407  case 0xc9:
1408    if (ABCIsConst)
1409      Res = Xor(Nor(A, C), B);
1410    break;
1411  case 0xca:
1412    if (ABCIsConst)
1413      Res = Xor(B, Nor(A, Xnor(B, C)));
1414    break;
1415  case 0xcb:
1416    if (ABCIsConst)
1417      Res = Or(Xnor(A, B), And(B, C));
1418    break;
1419  case 0xcc:
1420    Res = B;
1421    break;
1422  case 0xcd:
1423    if (ABCIsConst)
1424      Res = Or(Nor(A, C), B);
1425    break;
1426  case 0xce:
1427    if (ABCIsConst)
1428      Res = Or(Nor(A, Not(C)), B);
1429    break;
1430  case 0xcf:
1431    if (ABIsConst)
1432      Res = Or(B, Not(A));
1433    break;
1434  case 0xd0:
1435    if (ABCIsConst)
1436      Res = And(A, Or(B, Not(C)));
1437    break;
1438  case 0xd1:
1439    if (ABCIsConst)
1440      Res = Xor(A, Nor(Xor(A, C), B));
1441    break;
1442  case 0xd2:
1443    if (ABCIsConst)
1444      Res = Xor(A, Nor(B, Not(C)));
1445    break;
1446  case 0xd3:
1447    if (ABCIsConst)
1448      Res = Or(Xnor(A, B), Nor(B, C));
1449    break;
1450  case 0xd4:
1451    if (ABCIsConst)
1452      Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1453    break;
1454  case 0xd5:
1455    if (ABCIsConst)
1456      Res = Nand(Nand(A, B), C);
1457    break;
1458  case 0xd6:
1459    if (ABCIsConst)
1460      Res = Xor(Xor(A, B), Or(And(A, B), C));
1461    break;
1462  case 0xd7:
1463    if (ABCIsConst)
1464      Res = Nand(Xor(A, B), C);
1465    break;
1466  case 0xd8:
1467    if (ABCIsConst)
1468      Res = Xor(Nor(Xnor(A, B), C), B);
1469    break;
1470  case 0xd9:
1471    if (ABCIsConst)
1472      Res = Or(And(A, B), Xnor(B, C));
1473    break;
1474  case 0xda:
1475    if (ABCIsConst)
1476      Res = Xor(A, And(Nand(A, B), C));
1477    break;
1478  case 0xdb:
1479    if (ABCIsConst)
1480      Res = Or(Xnor(A, B), Xor(A, C));
1481    break;
1482  case 0xdc:
1483    if (ABCIsConst)
1484      Res = Or(B, Nor(C, Not(A)));
1485    break;
1486  case 0xdd:
1487    if (BCIsConst)
1488      Res = Or(B, Not(C));
1489    break;
1490  case 0xde:
1491    if (ABCIsConst)
1492      Res = Or(Xor(A, C), B);
1493    break;
1494  case 0xdf:
1495    if (ABCIsConst)
1496      Res = Or(Nand(A, C), B);
1497    break;
1498  case 0xe0:
1499    if (ABCIsConst)
1500      Res = And(A, Or(B, C));
1501    break;
1502  case 0xe1:
1503    if (ABCIsConst)
1504      Res = Xor(A, Nor(B, C));
1505    break;
1506  case 0xe2:
1507    if (ABCIsConst)
1508      Res = Xor(A, Nor(Xnor(A, C), B));
1509    break;
1510  case 0xe3:
1511    if (ABCIsConst)
1512      Res = Xor(A, Nor(And(A, C), B));
1513    break;
1514  case 0xe4:
1515    if (ABCIsConst)
1516      Res = Xor(A, Nor(Xnor(A, B), C));
1517    break;
1518  case 0xe5:
1519    if (ABCIsConst)
1520      Res = Xor(A, Nor(And(A, B), C));
1521    break;
1522  case 0xe6:
1523    if (ABCIsConst)
1524      Res = Or(And(A, B), Xor(B, C));
1525    break;
1526  case 0xe7:
1527    if (ABCIsConst)
1528      Res = Or(Xnor(A, B), Xnor(A, C));
1529    break;
1530  case 0xe8:
1531    if (ABCIsConst)
1532      Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1533    break;
1534  case 0xe9:
1535    if (ABCIsConst)
1536      Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1537    break;
1538  case 0xea:
1539    if (ABCIsConst)
1540      Res = Or(And(A, B), C);
1541    break;
1542  case 0xeb:
1543    if (ABCIsConst)
1544      Res = Or(Xnor(A, B), C);
1545    break;
1546  case 0xec:
1547    if (ABCIsConst)
1548      Res = Or(And(A, C), B);
1549    break;
1550  case 0xed:
1551    if (ABCIsConst)
1552      Res = Or(Xnor(A, C), B);
1553    break;
1554  case 0xee:
1555    Res = Or(B, C);
1556    break;
1557  case 0xef:
1558    if (ABCIsConst)
1559      Res = Nand(A, Nor(B, C));
1560    break;
1561  case 0xf0:
1562    Res = A;
1563    break;
1564  case 0xf1:
1565    if (ABCIsConst)
1566      Res = Or(A, Nor(B, C));
1567    break;
1568  case 0xf2:
1569    if (ABCIsConst)
1570      Res = Or(A, Nor(B, Not(C)));
1571    break;
1572  case 0xf3:
1573    if (ABIsConst)
1574      Res = Or(A, Not(B));
1575    break;
1576  case 0xf4:
1577    if (ABCIsConst)
1578      Res = Or(A, Nor(C, Not(B)));
1579    break;
1580  case 0xf5:
1581    if (ACIsConst)
1582      Res = Or(A, Not(C));
1583    break;
1584  case 0xf6:
1585    if (ABCIsConst)
1586      Res = Or(A, Xor(B, C));
1587    break;
1588  case 0xf7:
1589    if (ABCIsConst)
1590      Res = Or(A, Nand(B, C));
1591    break;
1592  case 0xf8:
1593    if (ABCIsConst)
1594      Res = Or(A, And(B, C));
1595    break;
1596  case 0xf9:
1597    if (ABCIsConst)
1598      Res = Or(A, Xnor(B, C));
1599    break;
1600  case 0xfa:
1601    Res = Or(A, C);
1602    break;
1603  case 0xfb:
1604    if (ABCIsConst)
1605      Res = Nand(Nor(A, C), B);
1606    break;
1607  case 0xfc:
1608    Res = Or(A, B);
1609    break;
1610  case 0xfd:
1611    if (ABCIsConst)
1612      Res = Nand(Nor(A, B), C);
1613    break;
1614  case 0xfe:
1615    if (ABCIsConst)
1616      Res = Or(Or(A, B), C);
1617    break;
1618  case 0xff:
1619    Res = {Constant::getAllOnesValue(Ty), 0xff};
1620    break;
1621  }
1622
1623  assert((Res.first == nullptr || Res.second == Imm) &&
1624         "Simplification of ternary logic does not verify!");
1625  return Res.first;
1626}
1627
1628static Value *simplifyX86insertps(const IntrinsicInst &II,
1629                                  InstCombiner::BuilderTy &Builder) {
1630  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1631  if (!CInt)
1632    return nullptr;
1633
1634  auto *VecTy = cast<FixedVectorType>(II.getType());
1635  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1636
1637  // The immediate permute control byte looks like this:
1638  //    [3:0] - zero mask for each 32-bit lane
1639  //    [5:4] - select one 32-bit destination lane
1640  //    [7:6] - select one 32-bit source lane
1641
1642  uint8_t Imm = CInt->getZExtValue();
1643  uint8_t ZMask = Imm & 0xf;
1644  uint8_t DestLane = (Imm >> 4) & 0x3;
1645  uint8_t SourceLane = (Imm >> 6) & 0x3;
1646
1647  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
1648
1649  // If all zero mask bits are set, this was just a weird way to
1650  // generate a zero vector.
1651  if (ZMask == 0xf)
1652    return ZeroVector;
1653
1654  // Initialize by passing all of the first source bits through.
1655  int ShuffleMask[4] = {0, 1, 2, 3};
1656
1657  // We may replace the second operand with the zero vector.
1658  Value *V1 = II.getArgOperand(1);
1659
1660  if (ZMask) {
1661    // If the zero mask is being used with a single input or the zero mask
1662    // overrides the destination lane, this is a shuffle with the zero vector.
1663    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1664        (ZMask & (1 << DestLane))) {
1665      V1 = ZeroVector;
1666      // We may still move 32-bits of the first source vector from one lane
1667      // to another.
1668      ShuffleMask[DestLane] = SourceLane;
1669      // The zero mask may override the previous insert operation.
1670      for (unsigned i = 0; i < 4; ++i)
1671        if ((ZMask >> i) & 0x1)
1672          ShuffleMask[i] = i + 4;
1673    } else {
1674      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1675      return nullptr;
1676    }
1677  } else {
1678    // Replace the selected destination lane with the selected source lane.
1679    ShuffleMask[DestLane] = SourceLane + 4;
1680  }
1681
1682  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1683}
1684
1685/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1686/// or conversion to a shuffle vector.
1687static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
1688                               ConstantInt *CILength, ConstantInt *CIIndex,
1689                               InstCombiner::BuilderTy &Builder) {
1690  auto LowConstantHighUndef = [&](uint64_t Val) {
1691    Type *IntTy64 = Type::getInt64Ty(II.getContext());
1692    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1693                        UndefValue::get(IntTy64)};
1694    return ConstantVector::get(Args);
1695  };
1696
1697  // See if we're dealing with constant values.
1698  auto *C0 = dyn_cast<Constant>(Op0);
1699  auto *CI0 =
1700      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1701         : nullptr;
1702
1703  // Attempt to constant fold.
1704  if (CILength && CIIndex) {
1705    // From AMD documentation: "The bit index and field length are each six
1706    // bits in length other bits of the field are ignored."
1707    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1708    APInt APLength = CILength->getValue().zextOrTrunc(6);
1709
1710    unsigned Index = APIndex.getZExtValue();
1711
1712    // From AMD documentation: "a value of zero in the field length is
1713    // defined as length of 64".
1714    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1715
1716    // From AMD documentation: "If the sum of the bit index + length field
1717    // is greater than 64, the results are undefined".
1718    unsigned End = Index + Length;
1719
1720    // Note that both field index and field length are 8-bit quantities.
1721    // Since variables 'Index' and 'Length' are unsigned values
1722    // obtained from zero-extending field index and field length
1723    // respectively, their sum should never wrap around.
1724    if (End > 64)
1725      return UndefValue::get(II.getType());
1726
1727    // If we are inserting whole bytes, we can convert this to a shuffle.
1728    // Lowering can recognize EXTRQI shuffle masks.
1729    if ((Length % 8) == 0 && (Index % 8) == 0) {
1730      // Convert bit indices to byte indices.
1731      Length /= 8;
1732      Index /= 8;
1733
1734      Type *IntTy8 = Type::getInt8Ty(II.getContext());
1735      auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1736
1737      SmallVector<int, 16> ShuffleMask;
1738      for (int i = 0; i != (int)Length; ++i)
1739        ShuffleMask.push_back(i + Index);
1740      for (int i = Length; i != 8; ++i)
1741        ShuffleMask.push_back(i + 16);
1742      for (int i = 8; i != 16; ++i)
1743        ShuffleMask.push_back(-1);
1744
1745      Value *SV = Builder.CreateShuffleVector(
1746          Builder.CreateBitCast(Op0, ShufTy),
1747          ConstantAggregateZero::get(ShufTy), ShuffleMask);
1748      return Builder.CreateBitCast(SV, II.getType());
1749    }
1750
1751    // Constant Fold - shift Index'th bit to lowest position and mask off
1752    // Length bits.
1753    if (CI0) {
1754      APInt Elt = CI0->getValue();
1755      Elt.lshrInPlace(Index);
1756      Elt = Elt.zextOrTrunc(Length);
1757      return LowConstantHighUndef(Elt.getZExtValue());
1758    }
1759
1760    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1761    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1762      Value *Args[] = {Op0, CILength, CIIndex};
1763      Module *M = II.getModule();
1764      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
1765      return Builder.CreateCall(F, Args);
1766    }
1767  }
1768
1769  // Constant Fold - extraction from zero is always {zero, undef}.
1770  if (CI0 && CI0->isZero())
1771    return LowConstantHighUndef(0);
1772
1773  return nullptr;
1774}
1775
1776/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1777/// folding or conversion to a shuffle vector.
1778static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
1779                                 APInt APLength, APInt APIndex,
1780                                 InstCombiner::BuilderTy &Builder) {
1781  // From AMD documentation: "The bit index and field length are each six bits
1782  // in length other bits of the field are ignored."
1783  APIndex = APIndex.zextOrTrunc(6);
1784  APLength = APLength.zextOrTrunc(6);
1785
1786  // Attempt to constant fold.
1787  unsigned Index = APIndex.getZExtValue();
1788
1789  // From AMD documentation: "a value of zero in the field length is
1790  // defined as length of 64".
1791  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1792
1793  // From AMD documentation: "If the sum of the bit index + length field
1794  // is greater than 64, the results are undefined".
1795  unsigned End = Index + Length;
1796
1797  // Note that both field index and field length are 8-bit quantities.
1798  // Since variables 'Index' and 'Length' are unsigned values
1799  // obtained from zero-extending field index and field length
1800  // respectively, their sum should never wrap around.
1801  if (End > 64)
1802    return UndefValue::get(II.getType());
1803
1804  // If we are inserting whole bytes, we can convert this to a shuffle.
1805  // Lowering can recognize INSERTQI shuffle masks.
1806  if ((Length % 8) == 0 && (Index % 8) == 0) {
1807    // Convert bit indices to byte indices.
1808    Length /= 8;
1809    Index /= 8;
1810
1811    Type *IntTy8 = Type::getInt8Ty(II.getContext());
1812    auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1813
1814    SmallVector<int, 16> ShuffleMask;
1815    for (int i = 0; i != (int)Index; ++i)
1816      ShuffleMask.push_back(i);
1817    for (int i = 0; i != (int)Length; ++i)
1818      ShuffleMask.push_back(i + 16);
1819    for (int i = Index + Length; i != 8; ++i)
1820      ShuffleMask.push_back(i);
1821    for (int i = 8; i != 16; ++i)
1822      ShuffleMask.push_back(-1);
1823
1824    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1825                                            Builder.CreateBitCast(Op1, ShufTy),
1826                                            ShuffleMask);
1827    return Builder.CreateBitCast(SV, II.getType());
1828  }
1829
1830  // See if we're dealing with constant values.
1831  auto *C0 = dyn_cast<Constant>(Op0);
1832  auto *C1 = dyn_cast<Constant>(Op1);
1833  auto *CI00 =
1834      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1835         : nullptr;
1836  auto *CI10 =
1837      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1838         : nullptr;
1839
1840  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
1841  if (CI00 && CI10) {
1842    APInt V00 = CI00->getValue();
1843    APInt V10 = CI10->getValue();
1844    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
1845    V00 = V00 & ~Mask;
1846    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
1847    APInt Val = V00 | V10;
1848    Type *IntTy64 = Type::getInt64Ty(II.getContext());
1849    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
1850                        UndefValue::get(IntTy64)};
1851    return ConstantVector::get(Args);
1852  }
1853
1854  // If we were an INSERTQ call, we'll save demanded elements if we convert to
1855  // INSERTQI.
1856  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
1857    Type *IntTy8 = Type::getInt8Ty(II.getContext());
1858    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
1859    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
1860
1861    Value *Args[] = {Op0, Op1, CILength, CIIndex};
1862    Module *M = II.getModule();
1863    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
1864    return Builder.CreateCall(F, Args);
1865  }
1866
1867  return nullptr;
1868}
1869
1870/// Attempt to convert pshufb* to shufflevector if the mask is constant.
1871static Value *simplifyX86pshufb(const IntrinsicInst &II,
1872                                InstCombiner::BuilderTy &Builder) {
1873  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1874  if (!V)
1875    return nullptr;
1876
1877  auto *VecTy = cast<FixedVectorType>(II.getType());
1878  unsigned NumElts = VecTy->getNumElements();
1879  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
1880         "Unexpected number of elements in shuffle mask!");
1881
1882  // Construct a shuffle mask from constant integers or UNDEFs.
1883  int Indexes[64];
1884
1885  // Each byte in the shuffle control mask forms an index to permute the
1886  // corresponding byte in the destination operand.
1887  for (unsigned I = 0; I < NumElts; ++I) {
1888    Constant *COp = V->getAggregateElement(I);
1889    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1890      return nullptr;
1891
1892    if (isa<UndefValue>(COp)) {
1893      Indexes[I] = -1;
1894      continue;
1895    }
1896
1897    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
1898
1899    // If the most significant bit (bit[7]) of each byte of the shuffle
1900    // control mask is set, then zero is written in the result byte.
1901    // The zero vector is in the right-hand side of the resulting
1902    // shufflevector.
1903
1904    // The value of each index for the high 128-bit lane is the least
1905    // significant 4 bits of the respective shuffle control byte.
1906    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
1907    Indexes[I] = Index;
1908  }
1909
1910  auto V1 = II.getArgOperand(0);
1911  auto V2 = Constant::getNullValue(VecTy);
1912  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
1913}
1914
1915/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
1916static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
1917                                    InstCombiner::BuilderTy &Builder) {
1918  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1919  if (!V)
1920    return nullptr;
1921
1922  auto *VecTy = cast<FixedVectorType>(II.getType());
1923  unsigned NumElts = VecTy->getNumElements();
1924  bool IsPD = VecTy->getScalarType()->isDoubleTy();
1925  unsigned NumLaneElts = IsPD ? 2 : 4;
1926  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
1927
1928  // Construct a shuffle mask from constant integers or UNDEFs.
1929  int Indexes[16];
1930
1931  // The intrinsics only read one or two bits, clear the rest.
1932  for (unsigned I = 0; I < NumElts; ++I) {
1933    Constant *COp = V->getAggregateElement(I);
1934    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1935      return nullptr;
1936
1937    if (isa<UndefValue>(COp)) {
1938      Indexes[I] = -1;
1939      continue;
1940    }
1941
1942    APInt Index = cast<ConstantInt>(COp)->getValue();
1943    Index = Index.zextOrTrunc(32).getLoBits(2);
1944
1945    // The PD variants uses bit 1 to select per-lane element index, so
1946    // shift down to convert to generic shuffle mask index.
1947    if (IsPD)
1948      Index.lshrInPlace(1);
1949
1950    // The _256 variants are a bit trickier since the mask bits always index
1951    // into the corresponding 128 half. In order to convert to a generic
1952    // shuffle, we have to make that explicit.
1953    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
1954
1955    Indexes[I] = Index.getZExtValue();
1956  }
1957
1958  auto V1 = II.getArgOperand(0);
1959  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
1960}
1961
1962/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
1963static Value *simplifyX86vpermv(const IntrinsicInst &II,
1964                                InstCombiner::BuilderTy &Builder) {
1965  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1966  if (!V)
1967    return nullptr;
1968
1969  auto *VecTy = cast<FixedVectorType>(II.getType());
1970  unsigned Size = VecTy->getNumElements();
1971  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
1972         "Unexpected shuffle mask size");
1973
1974  // Construct a shuffle mask from constant integers or UNDEFs.
1975  int Indexes[64];
1976
1977  for (unsigned I = 0; I < Size; ++I) {
1978    Constant *COp = V->getAggregateElement(I);
1979    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1980      return nullptr;
1981
1982    if (isa<UndefValue>(COp)) {
1983      Indexes[I] = -1;
1984      continue;
1985    }
1986
1987    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
1988    Index &= Size - 1;
1989    Indexes[I] = Index;
1990  }
1991
1992  auto V1 = II.getArgOperand(0);
1993  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
1994}
1995
1996std::optional<Instruction *>
1997X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
1998  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
1999                                             unsigned DemandedWidth) {
2000    APInt UndefElts(Width, 0);
2001    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2002    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2003  };
2004
2005  Intrinsic::ID IID = II.getIntrinsicID();
2006  switch (IID) {
2007  case Intrinsic::x86_bmi_bextr_32:
2008  case Intrinsic::x86_bmi_bextr_64:
2009  case Intrinsic::x86_tbm_bextri_u32:
2010  case Intrinsic::x86_tbm_bextri_u64:
2011    // If the RHS is a constant we can try some simplifications.
2012    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2013      uint64_t Shift = C->getZExtValue();
2014      uint64_t Length = (Shift >> 8) & 0xff;
2015      Shift &= 0xff;
2016      unsigned BitWidth = II.getType()->getIntegerBitWidth();
2017      // If the length is 0 or the shift is out of range, replace with zero.
2018      if (Length == 0 || Shift >= BitWidth) {
2019        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2020      }
2021      // If the LHS is also a constant, we can completely constant fold this.
2022      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2023        uint64_t Result = InC->getZExtValue() >> Shift;
2024        if (Length > BitWidth)
2025          Length = BitWidth;
2026        Result &= maskTrailingOnes<uint64_t>(Length);
2027        return IC.replaceInstUsesWith(II,
2028                                      ConstantInt::get(II.getType(), Result));
2029      }
2030      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2031      // are only masking bits that a shift already cleared?
2032    }
2033    break;
2034
2035  case Intrinsic::x86_bmi_bzhi_32:
2036  case Intrinsic::x86_bmi_bzhi_64:
2037    // If the RHS is a constant we can try some simplifications.
2038    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2039      uint64_t Index = C->getZExtValue() & 0xff;
2040      unsigned BitWidth = II.getType()->getIntegerBitWidth();
2041      if (Index >= BitWidth) {
2042        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2043      }
2044      if (Index == 0) {
2045        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2046      }
2047      // If the LHS is also a constant, we can completely constant fold this.
2048      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2049        uint64_t Result = InC->getZExtValue();
2050        Result &= maskTrailingOnes<uint64_t>(Index);
2051        return IC.replaceInstUsesWith(II,
2052                                      ConstantInt::get(II.getType(), Result));
2053      }
2054      // TODO should we convert this to an AND if the RHS is constant?
2055    }
2056    break;
2057  case Intrinsic::x86_bmi_pext_32:
2058  case Intrinsic::x86_bmi_pext_64:
2059    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2060      if (MaskC->isNullValue()) {
2061        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2062      }
2063      if (MaskC->isAllOnesValue()) {
2064        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2065      }
2066
2067      unsigned MaskIdx, MaskLen;
2068      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2069        // any single contingous sequence of 1s anywhere in the mask simply
2070        // describes a subset of the input bits shifted to the appropriate
2071        // position.  Replace with the straight forward IR.
2072        Value *Input = II.getArgOperand(0);
2073        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2074        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2075        Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2076        return IC.replaceInstUsesWith(II, Shifted);
2077      }
2078
2079      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2080        uint64_t Src = SrcC->getZExtValue();
2081        uint64_t Mask = MaskC->getZExtValue();
2082        uint64_t Result = 0;
2083        uint64_t BitToSet = 1;
2084
2085        while (Mask) {
2086          // Isolate lowest set bit.
2087          uint64_t BitToTest = Mask & -Mask;
2088          if (BitToTest & Src)
2089            Result |= BitToSet;
2090
2091          BitToSet <<= 1;
2092          // Clear lowest set bit.
2093          Mask &= Mask - 1;
2094        }
2095
2096        return IC.replaceInstUsesWith(II,
2097                                      ConstantInt::get(II.getType(), Result));
2098      }
2099    }
2100    break;
2101  case Intrinsic::x86_bmi_pdep_32:
2102  case Intrinsic::x86_bmi_pdep_64:
2103    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2104      if (MaskC->isNullValue()) {
2105        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2106      }
2107      if (MaskC->isAllOnesValue()) {
2108        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2109      }
2110
2111      unsigned MaskIdx, MaskLen;
2112      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2113        // any single contingous sequence of 1s anywhere in the mask simply
2114        // describes a subset of the input bits shifted to the appropriate
2115        // position.  Replace with the straight forward IR.
2116        Value *Input = II.getArgOperand(0);
2117        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2118        Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2119        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2120        return IC.replaceInstUsesWith(II, Masked);
2121      }
2122
2123      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2124        uint64_t Src = SrcC->getZExtValue();
2125        uint64_t Mask = MaskC->getZExtValue();
2126        uint64_t Result = 0;
2127        uint64_t BitToTest = 1;
2128
2129        while (Mask) {
2130          // Isolate lowest set bit.
2131          uint64_t BitToSet = Mask & -Mask;
2132          if (BitToTest & Src)
2133            Result |= BitToSet;
2134
2135          BitToTest <<= 1;
2136          // Clear lowest set bit;
2137          Mask &= Mask - 1;
2138        }
2139
2140        return IC.replaceInstUsesWith(II,
2141                                      ConstantInt::get(II.getType(), Result));
2142      }
2143    }
2144    break;
2145
2146  case Intrinsic::x86_sse_cvtss2si:
2147  case Intrinsic::x86_sse_cvtss2si64:
2148  case Intrinsic::x86_sse_cvttss2si:
2149  case Intrinsic::x86_sse_cvttss2si64:
2150  case Intrinsic::x86_sse2_cvtsd2si:
2151  case Intrinsic::x86_sse2_cvtsd2si64:
2152  case Intrinsic::x86_sse2_cvttsd2si:
2153  case Intrinsic::x86_sse2_cvttsd2si64:
2154  case Intrinsic::x86_avx512_vcvtss2si32:
2155  case Intrinsic::x86_avx512_vcvtss2si64:
2156  case Intrinsic::x86_avx512_vcvtss2usi32:
2157  case Intrinsic::x86_avx512_vcvtss2usi64:
2158  case Intrinsic::x86_avx512_vcvtsd2si32:
2159  case Intrinsic::x86_avx512_vcvtsd2si64:
2160  case Intrinsic::x86_avx512_vcvtsd2usi32:
2161  case Intrinsic::x86_avx512_vcvtsd2usi64:
2162  case Intrinsic::x86_avx512_cvttss2si:
2163  case Intrinsic::x86_avx512_cvttss2si64:
2164  case Intrinsic::x86_avx512_cvttss2usi:
2165  case Intrinsic::x86_avx512_cvttss2usi64:
2166  case Intrinsic::x86_avx512_cvttsd2si:
2167  case Intrinsic::x86_avx512_cvttsd2si64:
2168  case Intrinsic::x86_avx512_cvttsd2usi:
2169  case Intrinsic::x86_avx512_cvttsd2usi64: {
2170    // These intrinsics only demand the 0th element of their input vectors. If
2171    // we can simplify the input based on that, do so now.
2172    Value *Arg = II.getArgOperand(0);
2173    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2174    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2175      return IC.replaceOperand(II, 0, V);
2176    }
2177    break;
2178  }
2179
2180  case Intrinsic::x86_mmx_pmovmskb:
2181  case Intrinsic::x86_sse_movmsk_ps:
2182  case Intrinsic::x86_sse2_movmsk_pd:
2183  case Intrinsic::x86_sse2_pmovmskb_128:
2184  case Intrinsic::x86_avx_movmsk_pd_256:
2185  case Intrinsic::x86_avx_movmsk_ps_256:
2186  case Intrinsic::x86_avx2_pmovmskb:
2187    if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2188      return IC.replaceInstUsesWith(II, V);
2189    }
2190    break;
2191
2192  case Intrinsic::x86_sse_comieq_ss:
2193  case Intrinsic::x86_sse_comige_ss:
2194  case Intrinsic::x86_sse_comigt_ss:
2195  case Intrinsic::x86_sse_comile_ss:
2196  case Intrinsic::x86_sse_comilt_ss:
2197  case Intrinsic::x86_sse_comineq_ss:
2198  case Intrinsic::x86_sse_ucomieq_ss:
2199  case Intrinsic::x86_sse_ucomige_ss:
2200  case Intrinsic::x86_sse_ucomigt_ss:
2201  case Intrinsic::x86_sse_ucomile_ss:
2202  case Intrinsic::x86_sse_ucomilt_ss:
2203  case Intrinsic::x86_sse_ucomineq_ss:
2204  case Intrinsic::x86_sse2_comieq_sd:
2205  case Intrinsic::x86_sse2_comige_sd:
2206  case Intrinsic::x86_sse2_comigt_sd:
2207  case Intrinsic::x86_sse2_comile_sd:
2208  case Intrinsic::x86_sse2_comilt_sd:
2209  case Intrinsic::x86_sse2_comineq_sd:
2210  case Intrinsic::x86_sse2_ucomieq_sd:
2211  case Intrinsic::x86_sse2_ucomige_sd:
2212  case Intrinsic::x86_sse2_ucomigt_sd:
2213  case Intrinsic::x86_sse2_ucomile_sd:
2214  case Intrinsic::x86_sse2_ucomilt_sd:
2215  case Intrinsic::x86_sse2_ucomineq_sd:
2216  case Intrinsic::x86_avx512_vcomi_ss:
2217  case Intrinsic::x86_avx512_vcomi_sd:
2218  case Intrinsic::x86_avx512_mask_cmp_ss:
2219  case Intrinsic::x86_avx512_mask_cmp_sd: {
2220    // These intrinsics only demand the 0th element of their input vectors. If
2221    // we can simplify the input based on that, do so now.
2222    bool MadeChange = false;
2223    Value *Arg0 = II.getArgOperand(0);
2224    Value *Arg1 = II.getArgOperand(1);
2225    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2226    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2227      IC.replaceOperand(II, 0, V);
2228      MadeChange = true;
2229    }
2230    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2231      IC.replaceOperand(II, 1, V);
2232      MadeChange = true;
2233    }
2234    if (MadeChange) {
2235      return &II;
2236    }
2237    break;
2238  }
2239
2240  case Intrinsic::x86_avx512_add_ps_512:
2241  case Intrinsic::x86_avx512_div_ps_512:
2242  case Intrinsic::x86_avx512_mul_ps_512:
2243  case Intrinsic::x86_avx512_sub_ps_512:
2244  case Intrinsic::x86_avx512_add_pd_512:
2245  case Intrinsic::x86_avx512_div_pd_512:
2246  case Intrinsic::x86_avx512_mul_pd_512:
2247  case Intrinsic::x86_avx512_sub_pd_512:
2248    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2249    // IR operations.
2250    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2251      if (R->getValue() == 4) {
2252        Value *Arg0 = II.getArgOperand(0);
2253        Value *Arg1 = II.getArgOperand(1);
2254
2255        Value *V;
2256        switch (IID) {
2257        default:
2258          llvm_unreachable("Case stmts out of sync!");
2259        case Intrinsic::x86_avx512_add_ps_512:
2260        case Intrinsic::x86_avx512_add_pd_512:
2261          V = IC.Builder.CreateFAdd(Arg0, Arg1);
2262          break;
2263        case Intrinsic::x86_avx512_sub_ps_512:
2264        case Intrinsic::x86_avx512_sub_pd_512:
2265          V = IC.Builder.CreateFSub(Arg0, Arg1);
2266          break;
2267        case Intrinsic::x86_avx512_mul_ps_512:
2268        case Intrinsic::x86_avx512_mul_pd_512:
2269          V = IC.Builder.CreateFMul(Arg0, Arg1);
2270          break;
2271        case Intrinsic::x86_avx512_div_ps_512:
2272        case Intrinsic::x86_avx512_div_pd_512:
2273          V = IC.Builder.CreateFDiv(Arg0, Arg1);
2274          break;
2275        }
2276
2277        return IC.replaceInstUsesWith(II, V);
2278      }
2279    }
2280    break;
2281
2282  case Intrinsic::x86_avx512_mask_add_ss_round:
2283  case Intrinsic::x86_avx512_mask_div_ss_round:
2284  case Intrinsic::x86_avx512_mask_mul_ss_round:
2285  case Intrinsic::x86_avx512_mask_sub_ss_round:
2286  case Intrinsic::x86_avx512_mask_add_sd_round:
2287  case Intrinsic::x86_avx512_mask_div_sd_round:
2288  case Intrinsic::x86_avx512_mask_mul_sd_round:
2289  case Intrinsic::x86_avx512_mask_sub_sd_round:
2290    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2291    // IR operations.
2292    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2293      if (R->getValue() == 4) {
2294        // Extract the element as scalars.
2295        Value *Arg0 = II.getArgOperand(0);
2296        Value *Arg1 = II.getArgOperand(1);
2297        Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2298        Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2299
2300        Value *V;
2301        switch (IID) {
2302        default:
2303          llvm_unreachable("Case stmts out of sync!");
2304        case Intrinsic::x86_avx512_mask_add_ss_round:
2305        case Intrinsic::x86_avx512_mask_add_sd_round:
2306          V = IC.Builder.CreateFAdd(LHS, RHS);
2307          break;
2308        case Intrinsic::x86_avx512_mask_sub_ss_round:
2309        case Intrinsic::x86_avx512_mask_sub_sd_round:
2310          V = IC.Builder.CreateFSub(LHS, RHS);
2311          break;
2312        case Intrinsic::x86_avx512_mask_mul_ss_round:
2313        case Intrinsic::x86_avx512_mask_mul_sd_round:
2314          V = IC.Builder.CreateFMul(LHS, RHS);
2315          break;
2316        case Intrinsic::x86_avx512_mask_div_ss_round:
2317        case Intrinsic::x86_avx512_mask_div_sd_round:
2318          V = IC.Builder.CreateFDiv(LHS, RHS);
2319          break;
2320        }
2321
2322        // Handle the masking aspect of the intrinsic.
2323        Value *Mask = II.getArgOperand(3);
2324        auto *C = dyn_cast<ConstantInt>(Mask);
2325        // We don't need a select if we know the mask bit is a 1.
2326        if (!C || !C->getValue()[0]) {
2327          // Cast the mask to an i1 vector and then extract the lowest element.
2328          auto *MaskTy = FixedVectorType::get(
2329              IC.Builder.getInt1Ty(),
2330              cast<IntegerType>(Mask->getType())->getBitWidth());
2331          Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2332          Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2333          // Extract the lowest element from the passthru operand.
2334          Value *Passthru =
2335              IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2336          V = IC.Builder.CreateSelect(Mask, V, Passthru);
2337        }
2338
2339        // Insert the result back into the original argument 0.
2340        V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2341
2342        return IC.replaceInstUsesWith(II, V);
2343      }
2344    }
2345    break;
2346
2347  // Constant fold ashr( <A x Bi>, Ci ).
2348  // Constant fold lshr( <A x Bi>, Ci ).
2349  // Constant fold shl( <A x Bi>, Ci ).
2350  case Intrinsic::x86_sse2_psrai_d:
2351  case Intrinsic::x86_sse2_psrai_w:
2352  case Intrinsic::x86_avx2_psrai_d:
2353  case Intrinsic::x86_avx2_psrai_w:
2354  case Intrinsic::x86_avx512_psrai_q_128:
2355  case Intrinsic::x86_avx512_psrai_q_256:
2356  case Intrinsic::x86_avx512_psrai_d_512:
2357  case Intrinsic::x86_avx512_psrai_q_512:
2358  case Intrinsic::x86_avx512_psrai_w_512:
2359  case Intrinsic::x86_sse2_psrli_d:
2360  case Intrinsic::x86_sse2_psrli_q:
2361  case Intrinsic::x86_sse2_psrli_w:
2362  case Intrinsic::x86_avx2_psrli_d:
2363  case Intrinsic::x86_avx2_psrli_q:
2364  case Intrinsic::x86_avx2_psrli_w:
2365  case Intrinsic::x86_avx512_psrli_d_512:
2366  case Intrinsic::x86_avx512_psrli_q_512:
2367  case Intrinsic::x86_avx512_psrli_w_512:
2368  case Intrinsic::x86_sse2_pslli_d:
2369  case Intrinsic::x86_sse2_pslli_q:
2370  case Intrinsic::x86_sse2_pslli_w:
2371  case Intrinsic::x86_avx2_pslli_d:
2372  case Intrinsic::x86_avx2_pslli_q:
2373  case Intrinsic::x86_avx2_pslli_w:
2374  case Intrinsic::x86_avx512_pslli_d_512:
2375  case Intrinsic::x86_avx512_pslli_q_512:
2376  case Intrinsic::x86_avx512_pslli_w_512:
2377    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2378      return IC.replaceInstUsesWith(II, V);
2379    }
2380    break;
2381
2382  case Intrinsic::x86_sse2_psra_d:
2383  case Intrinsic::x86_sse2_psra_w:
2384  case Intrinsic::x86_avx2_psra_d:
2385  case Intrinsic::x86_avx2_psra_w:
2386  case Intrinsic::x86_avx512_psra_q_128:
2387  case Intrinsic::x86_avx512_psra_q_256:
2388  case Intrinsic::x86_avx512_psra_d_512:
2389  case Intrinsic::x86_avx512_psra_q_512:
2390  case Intrinsic::x86_avx512_psra_w_512:
2391  case Intrinsic::x86_sse2_psrl_d:
2392  case Intrinsic::x86_sse2_psrl_q:
2393  case Intrinsic::x86_sse2_psrl_w:
2394  case Intrinsic::x86_avx2_psrl_d:
2395  case Intrinsic::x86_avx2_psrl_q:
2396  case Intrinsic::x86_avx2_psrl_w:
2397  case Intrinsic::x86_avx512_psrl_d_512:
2398  case Intrinsic::x86_avx512_psrl_q_512:
2399  case Intrinsic::x86_avx512_psrl_w_512:
2400  case Intrinsic::x86_sse2_psll_d:
2401  case Intrinsic::x86_sse2_psll_q:
2402  case Intrinsic::x86_sse2_psll_w:
2403  case Intrinsic::x86_avx2_psll_d:
2404  case Intrinsic::x86_avx2_psll_q:
2405  case Intrinsic::x86_avx2_psll_w:
2406  case Intrinsic::x86_avx512_psll_d_512:
2407  case Intrinsic::x86_avx512_psll_q_512:
2408  case Intrinsic::x86_avx512_psll_w_512: {
2409    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2410      return IC.replaceInstUsesWith(II, V);
2411    }
2412
2413    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
2414    // operand to compute the shift amount.
2415    Value *Arg1 = II.getArgOperand(1);
2416    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2417           "Unexpected packed shift size");
2418    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2419
2420    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2421      return IC.replaceOperand(II, 1, V);
2422    }
2423    break;
2424  }
2425
2426  case Intrinsic::x86_avx2_psllv_d:
2427  case Intrinsic::x86_avx2_psllv_d_256:
2428  case Intrinsic::x86_avx2_psllv_q:
2429  case Intrinsic::x86_avx2_psllv_q_256:
2430  case Intrinsic::x86_avx512_psllv_d_512:
2431  case Intrinsic::x86_avx512_psllv_q_512:
2432  case Intrinsic::x86_avx512_psllv_w_128:
2433  case Intrinsic::x86_avx512_psllv_w_256:
2434  case Intrinsic::x86_avx512_psllv_w_512:
2435  case Intrinsic::x86_avx2_psrav_d:
2436  case Intrinsic::x86_avx2_psrav_d_256:
2437  case Intrinsic::x86_avx512_psrav_q_128:
2438  case Intrinsic::x86_avx512_psrav_q_256:
2439  case Intrinsic::x86_avx512_psrav_d_512:
2440  case Intrinsic::x86_avx512_psrav_q_512:
2441  case Intrinsic::x86_avx512_psrav_w_128:
2442  case Intrinsic::x86_avx512_psrav_w_256:
2443  case Intrinsic::x86_avx512_psrav_w_512:
2444  case Intrinsic::x86_avx2_psrlv_d:
2445  case Intrinsic::x86_avx2_psrlv_d_256:
2446  case Intrinsic::x86_avx2_psrlv_q:
2447  case Intrinsic::x86_avx2_psrlv_q_256:
2448  case Intrinsic::x86_avx512_psrlv_d_512:
2449  case Intrinsic::x86_avx512_psrlv_q_512:
2450  case Intrinsic::x86_avx512_psrlv_w_128:
2451  case Intrinsic::x86_avx512_psrlv_w_256:
2452  case Intrinsic::x86_avx512_psrlv_w_512:
2453    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2454      return IC.replaceInstUsesWith(II, V);
2455    }
2456    break;
2457
2458  case Intrinsic::x86_sse2_packssdw_128:
2459  case Intrinsic::x86_sse2_packsswb_128:
2460  case Intrinsic::x86_avx2_packssdw:
2461  case Intrinsic::x86_avx2_packsswb:
2462  case Intrinsic::x86_avx512_packssdw_512:
2463  case Intrinsic::x86_avx512_packsswb_512:
2464    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2465      return IC.replaceInstUsesWith(II, V);
2466    }
2467    break;
2468
2469  case Intrinsic::x86_sse2_packuswb_128:
2470  case Intrinsic::x86_sse41_packusdw:
2471  case Intrinsic::x86_avx2_packusdw:
2472  case Intrinsic::x86_avx2_packuswb:
2473  case Intrinsic::x86_avx512_packusdw_512:
2474  case Intrinsic::x86_avx512_packuswb_512:
2475    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2476      return IC.replaceInstUsesWith(II, V);
2477    }
2478    break;
2479
2480  case Intrinsic::x86_pclmulqdq:
2481  case Intrinsic::x86_pclmulqdq_256:
2482  case Intrinsic::x86_pclmulqdq_512: {
2483    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2484      unsigned Imm = C->getZExtValue();
2485
2486      bool MadeChange = false;
2487      Value *Arg0 = II.getArgOperand(0);
2488      Value *Arg1 = II.getArgOperand(1);
2489      unsigned VWidth =
2490          cast<FixedVectorType>(Arg0->getType())->getNumElements();
2491
2492      APInt UndefElts1(VWidth, 0);
2493      APInt DemandedElts1 =
2494          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2495      if (Value *V =
2496              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2497        IC.replaceOperand(II, 0, V);
2498        MadeChange = true;
2499      }
2500
2501      APInt UndefElts2(VWidth, 0);
2502      APInt DemandedElts2 =
2503          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2504      if (Value *V =
2505              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2506        IC.replaceOperand(II, 1, V);
2507        MadeChange = true;
2508      }
2509
2510      // If either input elements are undef, the result is zero.
2511      if (DemandedElts1.isSubsetOf(UndefElts1) ||
2512          DemandedElts2.isSubsetOf(UndefElts2)) {
2513        return IC.replaceInstUsesWith(II,
2514                                      ConstantAggregateZero::get(II.getType()));
2515      }
2516
2517      if (MadeChange) {
2518        return &II;
2519      }
2520    }
2521    break;
2522  }
2523
2524  case Intrinsic::x86_sse41_insertps:
2525    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2526      return IC.replaceInstUsesWith(II, V);
2527    }
2528    break;
2529
2530  case Intrinsic::x86_sse4a_extrq: {
2531    Value *Op0 = II.getArgOperand(0);
2532    Value *Op1 = II.getArgOperand(1);
2533    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2534    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2535    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2536           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2537           VWidth1 == 16 && "Unexpected operand sizes");
2538
2539    // See if we're dealing with constant values.
2540    auto *C1 = dyn_cast<Constant>(Op1);
2541    auto *CILength =
2542        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2543           : nullptr;
2544    auto *CIIndex =
2545        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2546           : nullptr;
2547
2548    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2549    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2550      return IC.replaceInstUsesWith(II, V);
2551    }
2552
2553    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
2554    // operands and the lowest 16-bits of the second.
2555    bool MadeChange = false;
2556    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2557      IC.replaceOperand(II, 0, V);
2558      MadeChange = true;
2559    }
2560    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2561      IC.replaceOperand(II, 1, V);
2562      MadeChange = true;
2563    }
2564    if (MadeChange) {
2565      return &II;
2566    }
2567    break;
2568  }
2569
2570  case Intrinsic::x86_sse4a_extrqi: {
2571    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2572    // bits of the lower 64-bits. The upper 64-bits are undefined.
2573    Value *Op0 = II.getArgOperand(0);
2574    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2575    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2576           "Unexpected operand size");
2577
2578    // See if we're dealing with constant values.
2579    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2580    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2581
2582    // Attempt to simplify to a constant or shuffle vector.
2583    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2584      return IC.replaceInstUsesWith(II, V);
2585    }
2586
2587    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2588    // operand.
2589    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2590      return IC.replaceOperand(II, 0, V);
2591    }
2592    break;
2593  }
2594
2595  case Intrinsic::x86_sse4a_insertq: {
2596    Value *Op0 = II.getArgOperand(0);
2597    Value *Op1 = II.getArgOperand(1);
2598    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2599    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2600           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2601           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2602           "Unexpected operand size");
2603
2604    // See if we're dealing with constant values.
2605    auto *C1 = dyn_cast<Constant>(Op1);
2606    auto *CI11 =
2607        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2608           : nullptr;
2609
2610    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2611    if (CI11) {
2612      const APInt &V11 = CI11->getValue();
2613      APInt Len = V11.zextOrTrunc(6);
2614      APInt Idx = V11.lshr(8).zextOrTrunc(6);
2615      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2616        return IC.replaceInstUsesWith(II, V);
2617      }
2618    }
2619
2620    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2621    // operand.
2622    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2623      return IC.replaceOperand(II, 0, V);
2624    }
2625    break;
2626  }
2627
2628  case Intrinsic::x86_sse4a_insertqi: {
2629    // INSERTQI: Extract lowest Length bits from lower half of second source and
2630    // insert over first source starting at Index bit. The upper 64-bits are
2631    // undefined.
2632    Value *Op0 = II.getArgOperand(0);
2633    Value *Op1 = II.getArgOperand(1);
2634    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2635    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2636    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2637           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2638           VWidth1 == 2 && "Unexpected operand sizes");
2639
2640    // See if we're dealing with constant values.
2641    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2642    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2643
2644    // Attempt to simplify to a constant or shuffle vector.
2645    if (CILength && CIIndex) {
2646      APInt Len = CILength->getValue().zextOrTrunc(6);
2647      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2648      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2649        return IC.replaceInstUsesWith(II, V);
2650      }
2651    }
2652
2653    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2654    // operands.
2655    bool MadeChange = false;
2656    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2657      IC.replaceOperand(II, 0, V);
2658      MadeChange = true;
2659    }
2660    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2661      IC.replaceOperand(II, 1, V);
2662      MadeChange = true;
2663    }
2664    if (MadeChange) {
2665      return &II;
2666    }
2667    break;
2668  }
2669
2670  case Intrinsic::x86_sse41_pblendvb:
2671  case Intrinsic::x86_sse41_blendvps:
2672  case Intrinsic::x86_sse41_blendvpd:
2673  case Intrinsic::x86_avx_blendv_ps_256:
2674  case Intrinsic::x86_avx_blendv_pd_256:
2675  case Intrinsic::x86_avx2_pblendvb: {
2676    // fold (blend A, A, Mask) -> A
2677    Value *Op0 = II.getArgOperand(0);
2678    Value *Op1 = II.getArgOperand(1);
2679    Value *Mask = II.getArgOperand(2);
2680    if (Op0 == Op1) {
2681      return IC.replaceInstUsesWith(II, Op0);
2682    }
2683
2684    // Zero Mask - select 1st argument.
2685    if (isa<ConstantAggregateZero>(Mask)) {
2686      return IC.replaceInstUsesWith(II, Op0);
2687    }
2688
2689    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2690    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2691      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
2692      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2693    }
2694
2695    // Convert to a vector select if we can bypass casts and find a boolean
2696    // vector condition value.
2697    Value *BoolVec;
2698    Mask = InstCombiner::peekThroughBitcast(Mask);
2699    if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
2700        BoolVec->getType()->isVectorTy() &&
2701        BoolVec->getType()->getScalarSizeInBits() == 1) {
2702      assert(Mask->getType()->getPrimitiveSizeInBits() ==
2703                 II.getType()->getPrimitiveSizeInBits() &&
2704             "Not expecting mask and operands with different sizes");
2705
2706      unsigned NumMaskElts =
2707          cast<FixedVectorType>(Mask->getType())->getNumElements();
2708      unsigned NumOperandElts =
2709          cast<FixedVectorType>(II.getType())->getNumElements();
2710      if (NumMaskElts == NumOperandElts) {
2711        return SelectInst::Create(BoolVec, Op1, Op0);
2712      }
2713
2714      // If the mask has less elements than the operands, each mask bit maps to
2715      // multiple elements of the operands. Bitcast back and forth.
2716      if (NumMaskElts < NumOperandElts) {
2717        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
2718        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
2719        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2720        return new BitCastInst(Sel, II.getType());
2721      }
2722    }
2723
2724    break;
2725  }
2726
2727  case Intrinsic::x86_ssse3_pshuf_b_128:
2728  case Intrinsic::x86_avx2_pshuf_b:
2729  case Intrinsic::x86_avx512_pshuf_b_512:
2730    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2731      return IC.replaceInstUsesWith(II, V);
2732    }
2733    break;
2734
2735  case Intrinsic::x86_avx_vpermilvar_ps:
2736  case Intrinsic::x86_avx_vpermilvar_ps_256:
2737  case Intrinsic::x86_avx512_vpermilvar_ps_512:
2738  case Intrinsic::x86_avx_vpermilvar_pd:
2739  case Intrinsic::x86_avx_vpermilvar_pd_256:
2740  case Intrinsic::x86_avx512_vpermilvar_pd_512:
2741    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2742      return IC.replaceInstUsesWith(II, V);
2743    }
2744    break;
2745
2746  case Intrinsic::x86_avx2_permd:
2747  case Intrinsic::x86_avx2_permps:
2748  case Intrinsic::x86_avx512_permvar_df_256:
2749  case Intrinsic::x86_avx512_permvar_df_512:
2750  case Intrinsic::x86_avx512_permvar_di_256:
2751  case Intrinsic::x86_avx512_permvar_di_512:
2752  case Intrinsic::x86_avx512_permvar_hi_128:
2753  case Intrinsic::x86_avx512_permvar_hi_256:
2754  case Intrinsic::x86_avx512_permvar_hi_512:
2755  case Intrinsic::x86_avx512_permvar_qi_128:
2756  case Intrinsic::x86_avx512_permvar_qi_256:
2757  case Intrinsic::x86_avx512_permvar_qi_512:
2758  case Intrinsic::x86_avx512_permvar_sf_512:
2759  case Intrinsic::x86_avx512_permvar_si_512:
2760    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
2761      return IC.replaceInstUsesWith(II, V);
2762    }
2763    break;
2764
2765  case Intrinsic::x86_avx_maskload_ps:
2766  case Intrinsic::x86_avx_maskload_pd:
2767  case Intrinsic::x86_avx_maskload_ps_256:
2768  case Intrinsic::x86_avx_maskload_pd_256:
2769  case Intrinsic::x86_avx2_maskload_d:
2770  case Intrinsic::x86_avx2_maskload_q:
2771  case Intrinsic::x86_avx2_maskload_d_256:
2772  case Intrinsic::x86_avx2_maskload_q_256:
2773    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
2774      return I;
2775    }
2776    break;
2777
2778  case Intrinsic::x86_sse2_maskmov_dqu:
2779  case Intrinsic::x86_avx_maskstore_ps:
2780  case Intrinsic::x86_avx_maskstore_pd:
2781  case Intrinsic::x86_avx_maskstore_ps_256:
2782  case Intrinsic::x86_avx_maskstore_pd_256:
2783  case Intrinsic::x86_avx2_maskstore_d:
2784  case Intrinsic::x86_avx2_maskstore_q:
2785  case Intrinsic::x86_avx2_maskstore_d_256:
2786  case Intrinsic::x86_avx2_maskstore_q_256:
2787    if (simplifyX86MaskedStore(II, IC)) {
2788      return nullptr;
2789    }
2790    break;
2791
2792  case Intrinsic::x86_addcarry_32:
2793  case Intrinsic::x86_addcarry_64:
2794    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
2795      return IC.replaceInstUsesWith(II, V);
2796    }
2797    break;
2798
2799  case Intrinsic::x86_avx512_pternlog_d_128:
2800  case Intrinsic::x86_avx512_pternlog_d_256:
2801  case Intrinsic::x86_avx512_pternlog_d_512:
2802  case Intrinsic::x86_avx512_pternlog_q_128:
2803  case Intrinsic::x86_avx512_pternlog_q_256:
2804  case Intrinsic::x86_avx512_pternlog_q_512:
2805    if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
2806      return IC.replaceInstUsesWith(II, V);
2807    }
2808    break;
2809  default:
2810    break;
2811  }
2812  return std::nullopt;
2813}
2814
2815std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
2816    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
2817    bool &KnownBitsComputed) const {
2818  switch (II.getIntrinsicID()) {
2819  default:
2820    break;
2821  case Intrinsic::x86_mmx_pmovmskb:
2822  case Intrinsic::x86_sse_movmsk_ps:
2823  case Intrinsic::x86_sse2_movmsk_pd:
2824  case Intrinsic::x86_sse2_pmovmskb_128:
2825  case Intrinsic::x86_avx_movmsk_ps_256:
2826  case Intrinsic::x86_avx_movmsk_pd_256:
2827  case Intrinsic::x86_avx2_pmovmskb: {
2828    // MOVMSK copies the vector elements' sign bits to the low bits
2829    // and zeros the high bits.
2830    unsigned ArgWidth;
2831    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
2832      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
2833    } else {
2834      auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
2835      ArgWidth = ArgType->getNumElements();
2836    }
2837
2838    // If we don't need any of low bits then return zero,
2839    // we know that DemandedMask is non-zero already.
2840    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
2841    Type *VTy = II.getType();
2842    if (DemandedElts.isZero()) {
2843      return ConstantInt::getNullValue(VTy);
2844    }
2845
2846    // We know that the upper bits are set to zero.
2847    Known.Zero.setBitsFrom(ArgWidth);
2848    KnownBitsComputed = true;
2849    break;
2850  }
2851  }
2852  return std::nullopt;
2853}
2854
2855std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2856    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2857    APInt &UndefElts2, APInt &UndefElts3,
2858    std::function<void(Instruction *, unsigned, APInt, APInt &)>
2859        simplifyAndSetOp) const {
2860  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
2861  switch (II.getIntrinsicID()) {
2862  default:
2863    break;
2864  case Intrinsic::x86_xop_vfrcz_ss:
2865  case Intrinsic::x86_xop_vfrcz_sd:
2866    // The instructions for these intrinsics are speced to zero upper bits not
2867    // pass them through like other scalar intrinsics. So we shouldn't just
2868    // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
2869    // Instead we should return a zero vector.
2870    if (!DemandedElts[0]) {
2871      IC.addToWorklist(&II);
2872      return ConstantAggregateZero::get(II.getType());
2873    }
2874
2875    // Only the lower element is used.
2876    DemandedElts = 1;
2877    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2878
2879    // Only the lower element is undefined. The high elements are zero.
2880    UndefElts = UndefElts[0];
2881    break;
2882
2883  // Unary scalar-as-vector operations that work column-wise.
2884  case Intrinsic::x86_sse_rcp_ss:
2885  case Intrinsic::x86_sse_rsqrt_ss:
2886    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2887
2888    // If lowest element of a scalar op isn't used then use Arg0.
2889    if (!DemandedElts[0]) {
2890      IC.addToWorklist(&II);
2891      return II.getArgOperand(0);
2892    }
2893    // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
2894    // checks).
2895    break;
2896
2897  // Binary scalar-as-vector operations that work column-wise. The high
2898  // elements come from operand 0. The low element is a function of both
2899  // operands.
2900  case Intrinsic::x86_sse_min_ss:
2901  case Intrinsic::x86_sse_max_ss:
2902  case Intrinsic::x86_sse_cmp_ss:
2903  case Intrinsic::x86_sse2_min_sd:
2904  case Intrinsic::x86_sse2_max_sd:
2905  case Intrinsic::x86_sse2_cmp_sd: {
2906    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2907
2908    // If lowest element of a scalar op isn't used then use Arg0.
2909    if (!DemandedElts[0]) {
2910      IC.addToWorklist(&II);
2911      return II.getArgOperand(0);
2912    }
2913
2914    // Only lower element is used for operand 1.
2915    DemandedElts = 1;
2916    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
2917
2918    // Lower element is undefined if both lower elements are undefined.
2919    // Consider things like undef&0.  The result is known zero, not undef.
2920    if (!UndefElts2[0])
2921      UndefElts.clearBit(0);
2922
2923    break;
2924  }
2925
2926  // Binary scalar-as-vector operations that work column-wise. The high
2927  // elements come from operand 0 and the low element comes from operand 1.
2928  case Intrinsic::x86_sse41_round_ss:
2929  case Intrinsic::x86_sse41_round_sd: {
2930    // Don't use the low element of operand 0.
2931    APInt DemandedElts2 = DemandedElts;
2932    DemandedElts2.clearBit(0);
2933    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
2934
2935    // If lowest element of a scalar op isn't used then use Arg0.
2936    if (!DemandedElts[0]) {
2937      IC.addToWorklist(&II);
2938      return II.getArgOperand(0);
2939    }
2940
2941    // Only lower element is used for operand 1.
2942    DemandedElts = 1;
2943    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
2944
2945    // Take the high undef elements from operand 0 and take the lower element
2946    // from operand 1.
2947    UndefElts.clearBit(0);
2948    UndefElts |= UndefElts2[0];
2949    break;
2950  }
2951
2952  // Three input scalar-as-vector operations that work column-wise. The high
2953  // elements come from operand 0 and the low element is a function of all
2954  // three inputs.
2955  case Intrinsic::x86_avx512_mask_add_ss_round:
2956  case Intrinsic::x86_avx512_mask_div_ss_round:
2957  case Intrinsic::x86_avx512_mask_mul_ss_round:
2958  case Intrinsic::x86_avx512_mask_sub_ss_round:
2959  case Intrinsic::x86_avx512_mask_max_ss_round:
2960  case Intrinsic::x86_avx512_mask_min_ss_round:
2961  case Intrinsic::x86_avx512_mask_add_sd_round:
2962  case Intrinsic::x86_avx512_mask_div_sd_round:
2963  case Intrinsic::x86_avx512_mask_mul_sd_round:
2964  case Intrinsic::x86_avx512_mask_sub_sd_round:
2965  case Intrinsic::x86_avx512_mask_max_sd_round:
2966  case Intrinsic::x86_avx512_mask_min_sd_round:
2967    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2968
2969    // If lowest element of a scalar op isn't used then use Arg0.
2970    if (!DemandedElts[0]) {
2971      IC.addToWorklist(&II);
2972      return II.getArgOperand(0);
2973    }
2974
2975    // Only lower element is used for operand 1 and 2.
2976    DemandedElts = 1;
2977    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
2978    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
2979
2980    // Lower element is undefined if all three lower elements are undefined.
2981    // Consider things like undef&0.  The result is known zero, not undef.
2982    if (!UndefElts2[0] || !UndefElts3[0])
2983      UndefElts.clearBit(0);
2984    break;
2985
2986  // TODO: Add fmaddsub support?
2987  case Intrinsic::x86_sse3_addsub_pd:
2988  case Intrinsic::x86_sse3_addsub_ps:
2989  case Intrinsic::x86_avx_addsub_pd_256:
2990  case Intrinsic::x86_avx_addsub_ps_256: {
2991    // If none of the even or none of the odd lanes are required, turn this
2992    // into a generic FP math instruction.
2993    APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
2994    APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
2995    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
2996    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
2997    if (IsSubOnly || IsAddOnly) {
2998      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
2999      IRBuilderBase::InsertPointGuard Guard(IC.Builder);
3000      IC.Builder.SetInsertPoint(&II);
3001      Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3002      return IC.Builder.CreateBinOp(
3003          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3004    }
3005
3006    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3007    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3008    UndefElts &= UndefElts2;
3009    break;
3010  }
3011
3012  // General per-element vector operations.
3013  case Intrinsic::x86_avx2_psllv_d:
3014  case Intrinsic::x86_avx2_psllv_d_256:
3015  case Intrinsic::x86_avx2_psllv_q:
3016  case Intrinsic::x86_avx2_psllv_q_256:
3017  case Intrinsic::x86_avx2_psrlv_d:
3018  case Intrinsic::x86_avx2_psrlv_d_256:
3019  case Intrinsic::x86_avx2_psrlv_q:
3020  case Intrinsic::x86_avx2_psrlv_q_256:
3021  case Intrinsic::x86_avx2_psrav_d:
3022  case Intrinsic::x86_avx2_psrav_d_256: {
3023    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3024    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3025    UndefElts &= UndefElts2;
3026    break;
3027  }
3028
3029  case Intrinsic::x86_sse2_packssdw_128:
3030  case Intrinsic::x86_sse2_packsswb_128:
3031  case Intrinsic::x86_sse2_packuswb_128:
3032  case Intrinsic::x86_sse41_packusdw:
3033  case Intrinsic::x86_avx2_packssdw:
3034  case Intrinsic::x86_avx2_packsswb:
3035  case Intrinsic::x86_avx2_packusdw:
3036  case Intrinsic::x86_avx2_packuswb:
3037  case Intrinsic::x86_avx512_packssdw_512:
3038  case Intrinsic::x86_avx512_packsswb_512:
3039  case Intrinsic::x86_avx512_packusdw_512:
3040  case Intrinsic::x86_avx512_packuswb_512: {
3041    auto *Ty0 = II.getArgOperand(0)->getType();
3042    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3043    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3044
3045    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3046    unsigned VWidthPerLane = VWidth / NumLanes;
3047    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3048
3049    // Per lane, pack the elements of the first input and then the second.
3050    // e.g.
3051    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3052    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
3053    for (int OpNum = 0; OpNum != 2; ++OpNum) {
3054      APInt OpDemandedElts(InnerVWidth, 0);
3055      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3056        unsigned LaneIdx = Lane * VWidthPerLane;
3057        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3058          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3059          if (DemandedElts[Idx])
3060            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3061        }
3062      }
3063
3064      // Demand elements from the operand.
3065      APInt OpUndefElts(InnerVWidth, 0);
3066      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3067
3068      // Pack the operand's UNDEF elements, one lane at a time.
3069      OpUndefElts = OpUndefElts.zext(VWidth);
3070      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3071        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3072        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3073        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3074        UndefElts |= LaneElts;
3075      }
3076    }
3077    break;
3078  }
3079
3080  // PSHUFB
3081  case Intrinsic::x86_ssse3_pshuf_b_128:
3082  case Intrinsic::x86_avx2_pshuf_b:
3083  case Intrinsic::x86_avx512_pshuf_b_512:
3084  // PERMILVAR
3085  case Intrinsic::x86_avx_vpermilvar_ps:
3086  case Intrinsic::x86_avx_vpermilvar_ps_256:
3087  case Intrinsic::x86_avx512_vpermilvar_ps_512:
3088  case Intrinsic::x86_avx_vpermilvar_pd:
3089  case Intrinsic::x86_avx_vpermilvar_pd_256:
3090  case Intrinsic::x86_avx512_vpermilvar_pd_512:
3091  // PERMV
3092  case Intrinsic::x86_avx2_permd:
3093  case Intrinsic::x86_avx2_permps: {
3094    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3095    break;
3096  }
3097
3098  // SSE4A instructions leave the upper 64-bits of the 128-bit result
3099  // in an undefined state.
3100  case Intrinsic::x86_sse4a_extrq:
3101  case Intrinsic::x86_sse4a_extrqi:
3102  case Intrinsic::x86_sse4a_insertq:
3103  case Intrinsic::x86_sse4a_insertqi:
3104    UndefElts.setHighBits(VWidth / 2);
3105    break;
3106  }
3107  return std::nullopt;
3108}
3109